Unverified commit a42ab317, authored by Wentao Ye and committed by GitHub
Browse files

[Log] Optimize startup log (#28948)


Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
parent b7f1f490
...@@ -872,8 +872,10 @@ def get_moe_configs( ...@@ -872,8 +872,10 @@ def get_moe_configs(
for config_file_path in config_file_paths: for config_file_path in config_file_paths:
if os.path.exists(config_file_path): if os.path.exists(config_file_path):
with open(config_file_path) as f: with open(config_file_path) as f:
logger.info( logger.info_once(
"Using configuration from %s for MoE layer.", config_file_path "Using configuration from %s for MoE layer.",
config_file_path,
scope="global",
) )
# If a configuration has been found, return it # If a configuration has been found, return it
tuned_config = json.load(f) tuned_config = json.load(f)
......
...@@ -162,9 +162,11 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend: ...@@ -162,9 +162,11 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
# deepGEMM on supported platforms with block-quantized weights # deepGEMM on supported platforms with block-quantized weights
if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant: if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant:
if not has_deep_gemm(): if not has_deep_gemm():
logger.warning_once("DeepGEMM backend requested but not available.") logger.warning_once(
"DeepGEMM backend requested but not available.", scope="local"
)
elif is_deep_gemm_supported(): elif is_deep_gemm_supported():
logger.info_once("Using DeepGEMM backend for FP8 MoE") logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local")
return Fp8MoeBackend.DEEPGEMM return Fp8MoeBackend.DEEPGEMM
# CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
...@@ -173,7 +175,9 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend: ...@@ -173,7 +175,9 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
and current_platform.is_device_capability(100) and current_platform.is_device_capability(100)
and block_quant and block_quant
): ):
logger.info_once("Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE") logger.info_once(
"Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local"
)
return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
# default to Triton # default to Triton
......
...@@ -139,18 +139,19 @@ class TorchProfilerWrapper(WorkerProfiler): ...@@ -139,18 +139,19 @@ class TorchProfilerWrapper(WorkerProfiler):
self.local_rank = local_rank self.local_rank = local_rank
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
logger.info( if local_rank in (None, 0):
"Torch profiling enabled. Traces will be saved to: %s", logger.info(
torch_profiler_trace_dir, "Torch profiling enabled. Traces will be saved to: %s",
) torch_profiler_trace_dir,
logger.debug( )
"Profiler config: record_shapes=%s," logger.debug(
"profile_memory=%s,with_stack=%s,with_flops=%s", "Profiler config: record_shapes=%s,"
envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, "profile_memory=%s,with_stack=%s,with_flops=%s",
envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
envs.VLLM_TORCH_PROFILER_WITH_STACK, envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
envs.VLLM_TORCH_PROFILER_WITH_FLOPS, envs.VLLM_TORCH_PROFILER_WITH_STACK,
) envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
)
self.profiler = torch.profiler.profile( self.profiler = torch.profiler.profile(
activities=[ activities=[
torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CPU,
......
...@@ -1236,10 +1236,11 @@ def _report_kv_cache_config( ...@@ -1236,10 +1236,11 @@ def _report_kv_cache_config(
max_concurrency = get_max_concurrency_for_kv_cache_config( max_concurrency = get_max_concurrency_for_kv_cache_config(
vllm_config, kv_cache_config vllm_config, kv_cache_config
) )
logger.info( logger.info_once(
"Maximum concurrency for %s tokens per request: %.2fx", "Maximum concurrency for %s tokens per request: %.2fx",
max_model_len_str, max_model_len_str,
max_concurrency, max_concurrency,
scope="local",
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment