Unverified Commit 73600673 authored by Baizhou Zhang, committed by GitHub

Clean logs for DeepSeek-V3 launching (#6079)
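
This commit reduces duplicated startup logging when DeepSeek-V3 is launched with tensor parallelism: messages that previously printed once per TP rank are now emitted only on rank 0, either through self.should_log checks in ModelRunner or through a new log_info_on_rank0 helper added to sglang.srt.utils at the end of this diff. A minimal standalone sketch of the pattern (the RANK environment variable below is an illustrative stand-in; the real helper queries get_tensor_model_parallel_rank() from sglang.srt.distributed):

    import logging
    import os

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)


    def log_info_on_rank0(logger, msg):
        # Only the rank-0 process writes, so a TP=8 launch prints each
        # startup message once instead of eight times.
        rank = int(os.environ.get("RANK", "0"))  # stand-in for the TP rank query
        if rank == 0:
            logger.info(msg)


    log_info_on_rank0(logger, "example startup message, printed once per job")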

parent 8f508cc7
......@@ -75,6 +75,7 @@ class PyNcclCommunicator:
        self.available = True
        self.disabled = False
+       if self.rank == 0:
            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
        if self.rank == 0:
......
......@@ -29,6 +29,7 @@ from sglang.srt.utils import (
    get_device_name,
    is_cuda,
    is_hip,
+   log_info_on_rank0,
)

_is_hip = is_hip()
......@@ -945,7 +946,9 @@ def get_moe_configs(
            # For example, updating the Triton version might cause all old configs to become suboptimal.
            # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
            # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
-           logger.info("Using MoE kernel config from %s.", config_file_path)
+           log_info_on_rank0(
+               logger, f"Using MoE kernel config from {config_file_path}."
+           )
            # If a configuration has been found, return it
            return {int(key): val for key, val in json.load(f).items()}
......
......@@ -66,6 +66,7 @@ from sglang.srt.utils import (
    get_bool_env_var,
    is_cuda,
    is_hip,
+   log_info_on_rank0,
    print_warning_once,
    set_weight_attrs,
)
......@@ -104,10 +105,7 @@ class Fp8Config(QuantizationConfig):
    ) -> None:
        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
        if is_checkpoint_fp8_serialized:
-           logger.warning(
-               "Detected fp8 checkpoint. Please note that the "
-               "format is experimental and subject to change."
-           )
+           log_info_on_rank0(logger, "Detected fp8 checkpoint.")
        if activation_scheme not in ACTIVATION_SCHEMES:
            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
        self.activation_scheme = activation_scheme
......
......@@ -30,6 +30,7 @@ from sglang.srt.utils import (
    get_device_name,
    is_cuda,
    is_hip,
+   log_info_on_rank0,
    supports_custom_op,
)
......@@ -698,9 +699,9 @@ def get_w8a8_block_fp8_configs(
    )
    if os.path.exists(config_file_path):
        with open(config_file_path) as f:
-           logger.info(
-               "Using configuration from %s for W8A8 Block FP8 kernel.",
-               config_file_path,
+           log_info_on_rank0(
+               logger,
+               f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.",
            )
            # If a configuration has been found, return it
            return {int(key): val for key, val in json.load(f).items()}
......
......@@ -278,6 +278,7 @@ class ModelRunner:
server_args.attention_backend = "fa3"
else:
server_args.attention_backend = "triton"
if self.should_log:
logger.info(
f"Attention backend not set. Use {server_args.attention_backend} backend by default."
)
......@@ -290,6 +291,7 @@ class ModelRunner:
"flashmla",
"cutlass_mla",
]:
if self.should_log:
logger.info(
f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
)
......@@ -311,6 +313,7 @@ class ModelRunner:
server_args.attention_backend = "triton"
if server_args.enable_double_sparsity:
if self.should_log:
logger.info(
"Double sparsity optimization is turned on. Use triton backend without CUDA graph."
)
......@@ -324,6 +327,7 @@ class ModelRunner:
        if self.is_multimodal:
            self.mem_fraction_static *= 0.90
+           if self.should_log:
                logger.info(
                    f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
                    f"because this is a multimodal model."
......@@ -336,10 +340,12 @@ class ModelRunner:
        if not self.use_mla_backend:
            server_args.disable_chunked_prefix_cache = True
        elif self.page_size > 1:
+           if self.should_log:
                logger.info("Disable chunked prefix cache when page size > 1.")
            server_args.disable_chunked_prefix_cache = True

        if not server_args.disable_chunked_prefix_cache:
+           if self.should_log:
                logger.info("Chunked prefix cache is turned on.")

    def init_torch_distributed(self):
......@@ -433,6 +439,7 @@ class ModelRunner:
        torch.set_num_threads(1)
        if self.device == "cuda":
            if torch.cuda.get_device_capability()[0] < 8:
+               if self.should_log:
                    logger.info(
                        "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
                    )
......@@ -471,6 +478,7 @@ class ModelRunner:
                self.model.load_kv_cache_scales(
                    self.server_args.quantization_param_path
                )
+               if self.should_log:
                    logger.info(
                        "Loaded KV cache scaling factors from %s",
                        self.server_args.quantization_param_path,
......@@ -1021,6 +1029,7 @@ class ModelRunner:
        )

    def apply_torch_tp(self):
+       if self.should_log:
            logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")

        from sglang.srt.model_parallel import tensor_parallel
......
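
Note: the ModelRunner hunks above guard existing logger.info calls with self.should_log instead of the new helper, presumably because these adjustments run before the tensor-parallel process group is initialized. The attribute's definition is outside the hunks shown here; a hypothetical sketch of the convention it implies (rank 0 logs, other ranks stay quiet):

    import logging

    logger = logging.getLogger(__name__)


    class ModelRunner:
        """Trimmed stand-in that keeps only the logging-related attribute."""

        def __init__(self, tp_rank: int):
            # Assumption: should_log marks the first tensor-parallel rank.
            self.tp_rank = tp_rank
            self.should_log = tp_rank == 0

        def log_backend_choice(self, attention_backend: str) -> None:
            # Mirrors the gated calls in the diff above.
            if self.should_log:
                logger.info(
                    f"Attention backend not set. Use {attention_backend} backend by default."
                )
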
......@@ -88,6 +88,7 @@ from sglang.srt.utils import (
    get_int_env_var,
    is_cuda,
    is_hip,
+   log_info_on_rank0,
)

_is_hip = is_hip()
......@@ -1485,8 +1486,9 @@ class DeepseekV2ForCausalLM(nn.Module):
            ):
                self.n_share_experts_fusion = 0
                global_server_args_dict["n_share_experts_fusion"] = 0
-               logger.info(
-                   "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled."
+               log_info_on_rank0(
+                   logger,
+                   "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled.",
                )
            else:
                assert (
......@@ -1501,8 +1503,9 @@ class DeepseekV2ForCausalLM(nn.Module):
            ):
                self.n_share_experts_fusion = self.tp_size
                global_server_args_dict["n_share_experts_fusion"] = self.tp_size
-               logger.info(
-                   "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled."
+               log_info_on_rank0(
+                   logger,
+                   "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled.",
                )

    def get_input_embeddings(self) -> nn.Embedding:
......
......@@ -2096,3 +2096,10 @@ class BumpAllocator:
        output = self._buffer[self._pointer : self._pointer + size]
        self._pointer += size
        return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
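
For reference, a usage sketch of the new helper as the call sites above invoke it; the config path is illustrative, and the deferred import inside the helper presumably avoids a circular import between sglang.srt.utils and sglang.srt.distributed:

    import logging

    from sglang.srt.utils import log_info_on_rank0

    logger = logging.getLogger(__name__)

    # Same call shape as the fused MoE / FP8 sites in this diff: the message is
    # emitted only by tensor-parallel rank 0, once the TP group has been created.
    config_file_path = "/path/to/moe_kernel_config.json"  # illustrative path
    log_info_on_rank0(logger, f"Using MoE kernel config from {config_file_path}.")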