Unverified commit 8f87eb46, authored by Wentao Ye, committed by GitHub
Browse files

[Refactor] Clean up log once `scope="local"` (#40540)


Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent cfa49213
......@@ -292,7 +292,6 @@ class CompilerManager:
"from the cache, took %.3f s",
str(compile_range),
elapsed,
scope="local",
)
return compiled_graph
......@@ -377,7 +376,6 @@ class CompilerManager:
logger.info_once(
"Cache the graph of compile range %s for later use",
str(compile_range),
scope="local",
)
logger.debug_once(
"Store the %s-th graph for compile range%s from %s via handle %s",
......@@ -385,7 +383,6 @@ class CompilerManager:
str(compile_range),
self.compiler.name,
handle,
scope="local",
)
# after compiling the last graph, record the end time
......@@ -399,7 +396,6 @@ class CompilerManager:
"Compiling a graph for compile range %s takes %.2f s",
str(compile_range),
elapsed,
scope="local",
)
return compiled_graph
......@@ -1072,12 +1068,11 @@ class VllmBackend:
disable_cache = disable_cache or is_ngram_gpu_enabled
if disable_cache:
logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
logger.info_once("vLLM's torch.compile cache is disabled.")
else:
logger.info_once(
"Using cache directory: %s for vLLM's torch.compile",
local_cache_dir,
scope="local",
)
self.compiler_manager.initialize_cache(
......@@ -1134,9 +1129,7 @@ class VllmBackend:
from .monitor import torch_compile_start_time
dynamo_time = time.perf_counter() - torch_compile_start_time
logger.info_once(
"Dynamo bytecode transform time: %.2f s", dynamo_time, scope="local"
)
logger.info_once("Dynamo bytecode transform time: %.2f s", dynamo_time)
if self.is_encoder:
self.compilation_config.encoder_compilation_time += dynamo_time
else:
......@@ -1215,7 +1208,6 @@ class VllmBackend:
logger.info_once(
"Saved compiler manager cache in %.2f seconds.",
elapsed,
scope="local",
)
from torch._guards import detect_fake_mode
......@@ -1254,9 +1246,7 @@ class VllmBackend:
with open(graph_path, "w") as f:
f.write(src)
logger.debug_once(
"Computation graph saved to %s", graph_path, scope="local"
)
logger.debug_once("Computation graph saved to %s", graph_path)
self._called = True
graph_to_serialize = (
......
......@@ -665,7 +665,6 @@ def _support_torch_compile(
logger.info_once(
"saved AOT compiled function to %s",
self._aot_compilation_path,
scope="local",
)
except Exception as e:
logger.warning(
......
......@@ -45,7 +45,7 @@ def monitor_torch_compile(
else:
total_compile_time = time.perf_counter() - torch_compile_start_time
if compilation_config.mode == CompilationMode.VLLM_COMPILE:
logger.info_once(message, total_compile_time, scope="local")
logger.info_once(message, total_compile_time)
finally:
if depyf_cm is not None:
try:
......@@ -76,7 +76,6 @@ def monitor_profiling_run() -> Generator[None, None, None]:
logger.info_once(
"Initial profiling/warmup run took %.2f s",
elapsed,
scope="local",
)
......
......@@ -239,7 +239,6 @@ class SchedulerConfig:
logger.info_once(
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
self.max_num_batched_tokens,
scope="local",
)
if self.max_num_partial_prefills > 1:
......
......@@ -716,9 +716,7 @@ class VllmConfig:
self.instance_id = f"{time.time_ns()}"
if self.performance_mode != "balanced":
logger.info_once(
"Performance mode set to '%s'.", self.performance_mode, scope="local"
)
logger.info_once("Performance mode set to '%s'.", self.performance_mode)
self.try_verify_and_update_config()
......@@ -818,7 +816,6 @@ class VllmConfig:
"Async scheduling not supported with %s-based "
"speculative decoding and will be disabled.",
self.speculative_config.method,
scope="local",
)
self.scheduler_config.async_scheduling = False
elif (
......@@ -828,7 +825,6 @@ class VllmConfig:
logger.warning_once(
"Async scheduling is not compatible with "
"disable_padded_drafter_batch=True and will be disabled.",
scope="local",
)
self.scheduler_config.async_scheduling = False
elif not executor_supports_async_sched:
......@@ -836,7 +832,6 @@ class VllmConfig:
"Async scheduling will be disabled because it is not supported "
"with the `%s` distributed executor backend. ",
executor_backend,
scope="local",
)
self.scheduler_config.async_scheduling = False
else:
......@@ -855,7 +850,6 @@ class VllmConfig:
logger.info_once(
"Disabling NCCL for DP synchronization "
"when using async scheduling.",
scope="local",
)
self.parallel_config.disable_nccl_for_dp_synchronization = True
else:
......@@ -870,7 +864,6 @@ class VllmConfig:
logger.warning_once(
"Disabling cascade attention (not yet compatible with "
"async speculative decoding).",
scope="local",
)
self.model_config.disable_cascade_attn = True
......@@ -1231,7 +1224,6 @@ class VllmConfig:
self.model_config.disable_cascade_attn = True
logger.warning_once(
"Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.",
scope="local",
)
if self.parallel_config.use_ubatching:
......@@ -1418,7 +1410,6 @@ class VllmConfig:
" performance. Consider increasing max_num_batched_tokens to"
" accommodate the additional draft token slots, or decrease"
" num_speculative_tokens or max_num_seqs.",
scope="local",
)
max_num_scheduled_tokens = self.scheduler_config.max_num_scheduled_tokens
......
......@@ -108,9 +108,7 @@ class PyNcclCommunicator:
if self.rank == 0:
# get the unique id from NCCL
self.unique_id = self.nccl.ncclGetUniqueId()
logger.info_once(
"vLLM is using nccl==%s", self.nccl.ncclGetVersion(), scope="local"
)
logger.info_once("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
else:
# construct an empty unique id
self.unique_id = ncclUniqueId()
......
......@@ -2254,7 +2254,6 @@ class EngineArgs:
"This model does not officially support disabling chunked prefill. "
"Disabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
scope="local",
)
elif (
model_config.runner_type == "pooling"
......@@ -2265,7 +2264,6 @@ class EngineArgs:
"This model does not officially support chunked prefill. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
scope="local",
)
if self.enable_prefix_caching is None:
......@@ -2284,7 +2282,6 @@ class EngineArgs:
"This model does not officially support prefix caching. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
scope="local",
)
# Disable chunked prefill and prefix caching for:
......
......@@ -387,7 +387,6 @@ class LoRAModelManager:
"LoRA is not supported for non-gated MoE gate module."
" %s will be ignored.",
module_name,
scope="local",
)
continue
......
......@@ -332,7 +332,6 @@ class Attention(nn.Module, AttentionLayerBase):
logger.warning_once(
"Disabling prefix caching for FLASHINFER/TRITON_MLA "
"with batch invariance, as it is not yet supported.",
scope="local",
)
cache_config.enable_prefix_caching = False
......
......@@ -427,7 +427,6 @@ class MLAAttention(nn.Module, AttentionLayerBase):
logger.warning_once(
"Disabling prefix caching for TRITON_MLA / FLASHINFER "
"with batch invariance, as it is not yet supported.",
scope="local",
)
cache_config.enable_prefix_caching = False
......@@ -1523,9 +1522,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
if use_fp8:
fp8_dtype = current_platform.fp8_dtype()
logger.info_once(
"FP8 prefill attention enabled: query data type is FP8", scope="local"
)
logger.info_once("FP8 prefill attention enabled: query data type is FP8")
return fp8_dtype
elif vllm_config.attention_config.use_prefill_query_quantization:
logger.info_once(
......@@ -1533,7 +1530,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
" use_prefill_query_quantization is enabled. Please"
" ensure that --kv-cache-dtype is set to fp8 and your prefill"
" backend is compatible with FP8 attention.",
scope="local",
)
return model_dtype
elif (
......@@ -1547,7 +1543,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
"prefill latency. To enable, add: "
'--attention-config \'{"use_prefill_query_quantization"'
": true}'",
scope="local",
)
return model_dtype
......@@ -2225,21 +2220,19 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
)
if use_trtllm_ragged_deepseek_prefill():
logger.info_once(
"Using TRT-LLM ragged DeepSeek prefill for MLA", scope="local"
)
logger.info_once("Using TRT-LLM ragged DeepSeek prefill for MLA")
self._run_prefill_context_chunk = (
self._run_prefill_context_chunk_trtllm_ragged
)
self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged
self._pad_v = False
elif use_flashinfer_prefill():
logger.info_once("Using FlashInfer prefill for MLA", scope="local")
logger.info_once("Using FlashInfer prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
self._pad_v = False
elif use_cudnn_prefill():
logger.info_once("Using CUDNN prefill for MLA", scope="local")
logger.info_once("Using CUDNN prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn
self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn
self._pad_v = False
......@@ -2250,7 +2243,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
"available. Please install flash_attn or use "
"--attention-backend ROCM_AITER_MLA."
)
logger.info_once("Using FlashAttention prefill for MLA", scope="local")
logger.info_once("Using FlashAttention prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa
self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa
......
......@@ -227,9 +227,7 @@ class MMEncoderAttention(CustomOp):
if self.attn_backend == AttentionBackendEnum.FLASHINFER:
_get_flashinfer_workspace_buffer()
logger.info_once(
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
)
logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
@classmethod
def enabled(cls) -> bool:
......
......@@ -1020,7 +1020,7 @@ def override_envs_for_invariance(
"You are using a non-decode-invariant form of batch invariance. "
"This will not be invariant between prefill and decode."
)
logger.warning_once(warning, scope="local")
logger.warning_once(warning)
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
......
......@@ -369,7 +369,6 @@ class BatchedDeepGemmExperts(mk.FusedMoEExpertsModular):
logger.warning_once(
"DPMetadata unavailable. Defaulting expected_m to "
f"{max_tokens_per_expert}.",
scope="local",
)
return max_tokens_per_expert
......
......@@ -1091,7 +1091,6 @@ def get_moe_configs(
"Using default MoE config. Performance might be sub-optimal! "
"Config file not found at %s",
", ".join(config_file_paths),
scope="local",
)
return None
......
......@@ -123,7 +123,6 @@ class NixlEPPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
"NixlEPPrepareAndFinalize is setup to dispatch raw/unquantized "
f"activations despite ({fused_experts.__class__.__name__}) being able "
"to support quantized activations.",
scope="local",
)
def num_dispatchers(self) -> int:
......
......@@ -266,7 +266,7 @@ def select_fp8_moe_backend(
k_cls, config, weight_key, activation_key, activation_format
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
raise ValueError(_make_log_unsupported(backend, reason))
......@@ -337,12 +337,10 @@ def select_fp8_moe_backend(
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
else:
logger.debug_once(
_make_log_unsupported(backend, reason), scope="local"
)
logger.debug_once(_make_log_unsupported(backend, reason))
raise NotImplementedError(
"Found VLLM_USE_FLASHINFER_MOE_FP8=1, but no "
......@@ -396,10 +394,10 @@ def select_fp8_moe_backend(
activation_format,
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
else:
logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
logger.debug_once(_make_log_unsupported(backend, reason))
# TODO(rob): per discussion with TPU team, we need a way to register
# MoE backends by OOT plugins, rather than having an explicit list
......@@ -580,7 +578,7 @@ def make_fp8_moe_kernel(
)
assert prepare_finalize is not None
logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local")
logger.info_once("Using %s", prepare_finalize.__class__.__name__)
# Create Experts.
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
......
......@@ -117,7 +117,7 @@ def select_int8_moe_backend(
k_cls, config, weight_key, activation_key, activation_format
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
raise ValueError(_make_log_unsupported(backend, reason))
......@@ -138,10 +138,10 @@ def select_int8_moe_backend(
activation_format,
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
else:
logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
logger.debug_once(_make_log_unsupported(backend, reason))
raise NotImplementedError(
"No Int8 MoE backend supports the deployment configuration."
......@@ -193,7 +193,7 @@ def make_int8_moe_kernel(
)
assert prepare_finalize is not None
logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local")
logger.info_once("Using %s", prepare_finalize.__class__.__name__)
# Create Experts.
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
......
......@@ -269,7 +269,7 @@ def select_gpt_oss_mxfp4_moe_backend(
k_cls, config, weight_key, activation_key, activation_format
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
raise ValueError(_make_log_unsupported(backend, reason))
......@@ -363,10 +363,10 @@ def select_gpt_oss_mxfp4_moe_backend(
k_cls, config, kMxfp4Static, activation_key, activation_format
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
else:
logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
logger.debug_once(_make_log_unsupported(backend, reason))
if current_platform.is_xpu():
backend = Mxfp4MoeBackend.XPU
......@@ -861,7 +861,7 @@ def make_mxfp4_moe_kernel(
)
assert prepare_finalize is not None
logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local")
logger.info_once("Using %s", prepare_finalize.__class__.__name__)
# Create Experts.
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
......
......@@ -252,12 +252,10 @@ def select_nvfp4_moe_backend(
activation_format,
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
else:
logger.debug_once(
_make_log_unsupported(backend, reason), scope="local"
)
logger.debug_once(_make_log_unsupported(backend, reason))
raise NotImplementedError(
"Found VLLM_USE_FLASHINFER_MOE_FP4=1, but no "
......@@ -282,10 +280,10 @@ def select_nvfp4_moe_backend(
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
else:
logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
logger.debug_once(_make_log_unsupported(backend, reason))
raise NotImplementedError(
"No NvFp4 MoE backend supports the deployment configuration."
......
......@@ -210,7 +210,7 @@ def select_unquantized_moe_backend(
k_cls, config, None, None, activation_format
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
raise ValueError(_make_log_unsupported(backend, reason))
......@@ -271,12 +271,10 @@ def select_unquantized_moe_backend(
k_cls, moe_config, None, None, activation_format
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
else:
logger.debug_once(
_make_log_unsupported(backend, reason), scope="local"
)
logger.debug_once(_make_log_unsupported(backend, reason))
raise NotImplementedError(
"Found VLLM_USE_FLASHINFER_MOE_FP16=1, but no "
......@@ -298,10 +296,10 @@ def select_unquantized_moe_backend(
k_cls, moe_config, None, None, activation_format
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
logger.debug_once(_make_log_unsupported(backend, reason))
raise NotImplementedError(
"No Unquantized MoE backend supports the deployment configuration."
......@@ -355,7 +353,7 @@ def make_unquantized_moe_kernel(
)
assert prepare_finalize is not None
logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local")
logger.info_once("Using %s", prepare_finalize.__class__.__name__)
# Create Experts
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment