Unverified Commit 73600673 authored by Baizhou Zhang, committed by GitHub

Clean logs for DeepSeek-V3 launching (#6079)

parent 8f508cc7
@@ -75,6 +75,7 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False
 
-        logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
+        if self.rank == 0:
+            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
 
         if self.rank == 0:
...
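The pynccl hunk above is representative of the whole commit: log lines that every tensor-parallel worker used to emit are now gated so that only rank 0 prints them. A minimal, self-contained sketch of the effect (the rank loop and the hard-coded NCCL version are illustrative, not taken from the diff):

import logging

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger("rank0_logging_demo")


def init_communicator(rank: int, nccl_version: int = 22205) -> None:
    # Ungated, every rank emits this line, so an 8-way TP launch prints it 8 times.
    # Gated on rank 0, the launch log carries it exactly once.
    if rank == 0:
        logger.info("sglang is using nccl==%s", nccl_version)


for rank in range(8):  # simulate 8 tensor-parallel workers in a single process
    init_communicator(rank)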
@@ -29,6 +29,7 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )
 
 _is_hip = is_hip()
@@ -945,7 +946,9 @@ def get_moe_configs(
             # For example, updating the Triton version might cause all old configs to become suboptimal.
             # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
             # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
-            logger.info("Using MoE kernel config from %s.", config_file_path)
+            log_info_on_rank0(
+                logger, f"Using MoE kernel config from {config_file_path}."
+            )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
...
@@ -66,6 +66,7 @@ from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
 )
@@ -104,10 +105,7 @@ class Fp8Config(QuantizationConfig):
     ) -> None:
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         if is_checkpoint_fp8_serialized:
-            logger.warning(
-                "Detected fp8 checkpoint. Please note that the "
-                "format is experimental and subject to change."
-            )
+            log_info_on_rank0(logger, "Detected fp8 checkpoint.")
         if activation_scheme not in ACTIVATION_SCHEMES:
             raise ValueError(f"Unsupported activation scheme {activation_scheme}")
         self.activation_scheme = activation_scheme
...
@@ -30,6 +30,7 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     supports_custom_op,
 )
@@ -698,9 +699,9 @@ def get_w8a8_block_fp8_configs(
     )
     if os.path.exists(config_file_path):
         with open(config_file_path) as f:
-            logger.info(
-                "Using configuration from %s for W8A8 Block FP8 kernel.",
-                config_file_path,
+            log_info_on_rank0(
+                logger,
+                f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.",
             )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
...
@@ -278,6 +278,7 @@ class ModelRunner:
                 server_args.attention_backend = "fa3"
             else:
                 server_args.attention_backend = "triton"
-            logger.info(
-                f"Attention backend not set. Use {server_args.attention_backend} backend by default."
-            )
+            if self.should_log:
+                logger.info(
+                    f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+                )
@@ -290,6 +291,7 @@ class ModelRunner:
                 "flashmla",
                 "cutlass_mla",
             ]:
-                logger.info(
-                    f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
-                )
+                if self.should_log:
+                    logger.info(
+                        f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
+                    )
@@ -311,6 +313,7 @@ class ModelRunner:
             server_args.attention_backend = "triton"
 
         if server_args.enable_double_sparsity:
-            logger.info(
-                "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
-            )
+            if self.should_log:
+                logger.info(
+                    "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
+                )
@@ -324,6 +327,7 @@ class ModelRunner:
 
         if self.is_multimodal:
             self.mem_fraction_static *= 0.90
-            logger.info(
-                f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
-                f"because this is a multimodal model."
+            if self.should_log:
+                logger.info(
+                    f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
+                    f"because this is a multimodal model."
@@ -336,10 +340,12 @@ class ModelRunner:
         if not self.use_mla_backend:
             server_args.disable_chunked_prefix_cache = True
         elif self.page_size > 1:
-            logger.info("Disable chunked prefix cache when page size > 1.")
+            if self.should_log:
+                logger.info("Disable chunked prefix cache when page size > 1.")
             server_args.disable_chunked_prefix_cache = True
 
         if not server_args.disable_chunked_prefix_cache:
-            logger.info("Chunked prefix cache is turned on.")
+            if self.should_log:
+                logger.info("Chunked prefix cache is turned on.")
 
     def init_torch_distributed(self):
@@ -433,6 +439,7 @@ class ModelRunner:
         torch.set_num_threads(1)
         if self.device == "cuda":
             if torch.cuda.get_device_capability()[0] < 8:
-                logger.info(
-                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
-                )
+                if self.should_log:
+                    logger.info(
+                        "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
+                    )
@@ -471,6 +478,7 @@ class ModelRunner:
                 self.model.load_kv_cache_scales(
                     self.server_args.quantization_param_path
                 )
-                logger.info(
-                    "Loaded KV cache scaling factors from %s",
-                    self.server_args.quantization_param_path,
+                if self.should_log:
+                    logger.info(
+                        "Loaded KV cache scaling factors from %s",
+                        self.server_args.quantization_param_path,
@@ -1021,6 +1029,7 @@ class ModelRunner:
         )
 
     def apply_torch_tp(self):
-        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
+        if self.should_log:
+            logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
 
         from sglang.srt.model_parallel import tensor_parallel
...
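Note that self.should_log is used throughout these model_runner.py hunks but its definition is not part of the diff shown here. A plausible sketch, assuming the flag is simply true on the first tensor-parallel rank (the real condition may also consider other parallel ranks):

import logging

logger = logging.getLogger("model_runner_sketch")


class ModelRunnerSketch:
    """Hypothetical stand-in illustrating the should_log pattern from the diff."""

    def __init__(self, tp_rank: int):
        self.tp_rank = tp_rank
        # Assumption: only the first TP rank is allowed to print startup details.
        self.should_log = tp_rank == 0

    def adjust_attention_backend(self, backend: str) -> None:
        # Mirrors the pattern used above: wrap logger.info in the flag.
        if self.should_log:
            logger.info(f"Attention backend not set. Use {backend} backend by default.")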
@@ -88,6 +88,7 @@ from sglang.srt.utils import (
     get_int_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )
 
 _is_hip = is_hip()
@@ -1485,8 +1486,9 @@ class DeepseekV2ForCausalLM(nn.Module):
         ):
             self.n_share_experts_fusion = 0
             global_server_args_dict["n_share_experts_fusion"] = 0
-            logger.info(
-                "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled."
+            log_info_on_rank0(
+                logger,
+                "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled.",
             )
         else:
             assert (
@@ -1501,8 +1503,9 @@ class DeepseekV2ForCausalLM(nn.Module):
         ):
             self.n_share_experts_fusion = self.tp_size
             global_server_args_dict["n_share_experts_fusion"] = self.tp_size
-            logger.info(
-                "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled."
+            log_info_on_rank0(
+                logger,
+                "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled.",
             )
 
     def get_input_embeddings(self) -> nn.Embedding:
...
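For context, the two deepseek_v2.py hunks above sit on opposite branches of the shared-experts-fusion decision. A condensed, self-contained sketch of that control flow, with simplified stand-in guards (the real checks inspect the model config and SM version) and a local stub for log_info_on_rank0:

import logging

logger = logging.getLogger("deepseek_v2_sketch")
global_server_args_dict = {}  # stand-in for the module-level dict referenced in the diff


def log_info_on_rank0(logger, msg):
    # Stub for the helper added to sglang.srt.utils at the end of this commit.
    logger.info(msg)


def configure_shared_experts_fusion(is_v3_or_r1: bool, fp8_on_sm90: bool, tp_size: int) -> int:
    if not is_v3_or_r1:
        global_server_args_dict["n_share_experts_fusion"] = 0
        log_info_on_rank0(
            logger,
            "Only Deepseek V3/R1 can use shared experts fusion optimization. "
            "Shared experts fusion optimization is disabled.",
        )
        return 0
    if fp8_on_sm90:
        global_server_args_dict["n_share_experts_fusion"] = tp_size
        log_info_on_rank0(
            logger,
            "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. "
            "Shared experts fusion optimization is enabled.",
        )
        return tp_size
    return 0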
@@ -2096,3 +2096,10 @@ class BumpAllocator:
         output = self._buffer[self._pointer : self._pointer + size]
         self._pointer += size
         return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
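The new helper imports get_tensor_model_parallel_rank lazily inside the function body, presumably to avoid a circular import between sglang.srt.utils and sglang.srt.distributed. It also means the helper should only be called after the tensor-parallel group has been initialized, since it needs a rank to query. A hedged usage sketch (the logger name and message are illustrative, taken from the fp8 hunk above):

import logging

from sglang.srt.utils import log_info_on_rank0

logger = logging.getLogger(__name__)

# Safe only after distributed/TP initialization; on every rank other than 0
# the call is a no-op, so the launch log contains the line exactly once.
log_info_on_rank0(logger, "Detected fp8 checkpoint.")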