Unverified Commit 8f87eb46 authored by Wentao Ye's avatar Wentao Ye Committed by GitHub
Browse files

[Refactor] Clean up log once `scope="local"` (#40540)


Signed-off-by: default avataryewentao256 <zhyanwentao@126.com>
Signed-off-by: default avatarWentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: default avatargemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent cfa49213
...@@ -135,7 +135,6 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -135,7 +135,6 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
"DeepEPLLPrepareAndFinalize is setup to dispatch raw/unquantized " "DeepEPLLPrepareAndFinalize is setup to dispatch raw/unquantized "
f"activations despite ({fused_experts.__class__.__name__}) being able " f"activations despite ({fused_experts.__class__.__name__}) being able "
"to support quantized activations.", "to support quantized activations.",
scope="local",
) )
def num_dispatchers(self) -> int: def num_dispatchers(self) -> int:
......
...@@ -69,16 +69,14 @@ class SharedExperts: ...@@ -69,16 +69,14 @@ class SharedExperts:
# TODO: Remove this after more extensive testings with TP/DP # TODO: Remove this after more extensive testings with TP/DP
# and other execution modes # and other execution modes
if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM: if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM:
logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local") logger.debug_once("Disabling MoE shared_experts cuda stream")
self._stream = None self._stream = None
else: else:
# TODO(rob): enable shared expert overlap with non-cuda-alike. # TODO(rob): enable shared expert overlap with non-cuda-alike.
# aux_stream() returns None on non-cuda-alike platforms. # aux_stream() returns None on non-cuda-alike platforms.
self._stream = aux_stream() self._stream = aux_stream()
if self._stream is not None: if self._stream is not None:
logger.debug_once( logger.debug_once("Enabled separate cuda stream for MoE shared_experts")
"Enabled separate cuda stream for MoE shared_experts", scope="local"
)
@property @property
def _disable_shared_experts_overlap(self) -> bool: def _disable_shared_experts_overlap(self) -> bool:
......
...@@ -143,15 +143,14 @@ class ChunkGatedDeltaRule(CustomOp): ...@@ -143,15 +143,14 @@ class ChunkGatedDeltaRule(CustomOp):
use_flashinfer = supports_flashinfer use_flashinfer = supports_flashinfer
if use_flashinfer: if use_flashinfer:
logger.info_once("Using FlashInfer GDN prefill kernel", scope="local") logger.info_once("Using FlashInfer GDN prefill kernel")
logger.info_once( logger.info_once(
"FlashInfer GDN prefill kernel is JIT-compiled; first run may " "FlashInfer GDN prefill kernel is JIT-compiled; first run may "
"take a while to compile. Set `--gdn-prefill-backend triton` to " "take a while to compile. Set `--gdn-prefill-backend triton` to "
"avoid JIT compile time.", "avoid JIT compile time.",
scope="local",
) )
else: else:
logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local") logger.info_once("Using Triton/FLA GDN prefill kernel")
self._forward_method = ( self._forward_method = (
self.forward_cuda if use_flashinfer else self.forward_native self.forward_cuda if use_flashinfer else self.forward_native
......
...@@ -44,10 +44,10 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod): ...@@ -44,10 +44,10 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod):
self.use_cutlass_mxfp4 = CutlassExpertsMxfp4._supports_current_device() self.use_cutlass_mxfp4 = CutlassExpertsMxfp4._supports_current_device()
self.experts_cls: type[mk.FusedMoEExperts] self.experts_cls: type[mk.FusedMoEExperts]
if self.use_cutlass_mxfp4: if self.use_cutlass_mxfp4:
logger.info_once("Using CutlassExpertsMxfp4 for MXFP4 MoE", scope="local") logger.info_once("Using CutlassExpertsMxfp4 for MXFP4 MoE")
self.experts_cls = CutlassExpertsMxfp4 self.experts_cls = CutlassExpertsMxfp4
else: else:
logger.info_once("Using MarlinExperts for MXFP4 MoE", scope="local") logger.info_once("Using MarlinExperts for MXFP4 MoE")
self.experts_cls = MarlinExperts self.experts_cls = MarlinExperts
def create_weights( def create_weights(
......
...@@ -87,7 +87,6 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): ...@@ -87,7 +87,6 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
logger.info_once( logger.info_once(
f"Using {self.kernel_backend} backend for WNA16 MoE " f"Using {self.kernel_backend} backend for WNA16 MoE "
f"(group_size={self.group_size}, num_bits={self.num_bits})", f"(group_size={self.group_size}, num_bits={self.num_bits})",
scope="local",
) )
def get_weight_shape( def get_weight_shape(
......
...@@ -83,7 +83,6 @@ class Mxfp4Config(QuantizationConfig): ...@@ -83,7 +83,6 @@ class Mxfp4Config(QuantizationConfig):
logger.debug_once( logger.debug_once(
"MXFP4 linear layer is not implemented - falling back to " "MXFP4 linear layer is not implemented - falling back to "
"UnquantizedLinearMethod.", "UnquantizedLinearMethod.",
scope="local",
) )
return UnquantizedLinearMethod() return UnquantizedLinearMethod()
elif isinstance(layer, FusedMoE): elif isinstance(layer, FusedMoE):
...@@ -92,7 +91,6 @@ class Mxfp4Config(QuantizationConfig): ...@@ -92,7 +91,6 @@ class Mxfp4Config(QuantizationConfig):
logger.debug_once( logger.debug_once(
"MXFP4 attention layer is not implemented. " "MXFP4 attention layer is not implemented. "
"Skipping quantization for this layer.", "Skipping quantization for this layer.",
scope="local",
) )
return None return None
......
...@@ -111,7 +111,6 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend: ...@@ -111,7 +111,6 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend:
logger.info_once( logger.info_once(
"Flashinfer TRTLLM MOE backend is only supported on " "Flashinfer TRTLLM MOE backend is only supported on "
"SM100 and later, using CUTLASS backend instead", "SM100 and later, using CUTLASS backend instead",
scope="local",
) )
return FlashinferMoeBackend.CUTLASS return FlashinferMoeBackend.CUTLASS
return backend_map[flashinfer_moe_backend] return backend_map[flashinfer_moe_backend]
...@@ -239,7 +238,6 @@ def align_fp4_moe_weights_for_fi( ...@@ -239,7 +238,6 @@ def align_fp4_moe_weights_for_fi(
"Padding intermediate size from %d to %d for up/down projection weights.", "Padding intermediate size from %d to %d for up/down projection weights.",
intermediate, intermediate,
padded_intermediate, padded_intermediate,
scope="local",
) )
up_mult = 2 if is_act_and_mul else 1 up_mult = 2 if is_act_and_mul else 1
...@@ -285,7 +283,6 @@ def align_trtllm_fp4_moe_hidden_dim_for_fi( ...@@ -285,7 +283,6 @@ def align_trtllm_fp4_moe_hidden_dim_for_fi(
"performance degradation.", "performance degradation.",
hidden_size, hidden_size,
padded_hidden_size, padded_hidden_size,
scope="local",
) )
padded_w13 = w13.new_zeros((num_experts, gate_up_dim, padded_hidden_size // 2)) padded_w13 = w13.new_zeros((num_experts, gate_up_dim, padded_hidden_size // 2))
...@@ -331,7 +328,6 @@ def align_fp8_moe_weights_for_fi( ...@@ -331,7 +328,6 @@ def align_fp8_moe_weights_for_fi(
"Padding intermediate size from %d to %d for up/down projection weights.", "Padding intermediate size from %d to %d for up/down projection weights.",
intermediate, intermediate,
padded_intermediate, padded_intermediate,
scope="local",
) )
up_mult = 2 if is_act_and_mul else 1 up_mult = 2 if is_act_and_mul else 1
......
...@@ -70,7 +70,6 @@ class BaseModelLoader(ABC): ...@@ -70,7 +70,6 @@ class BaseModelLoader(ABC):
logger.debug_once( logger.debug_once(
"Peak GPU memory after loading weights: %s GiB", "Peak GPU memory after loading weights: %s GiB",
format_gib(peak_memory), format_gib(peak_memory),
scope="local",
) )
# Process weights into kernel format. Note that when using online # Process weights into kernel format. Note that when using online
......
...@@ -384,7 +384,6 @@ class DefaultModelLoader(BaseModelLoader): ...@@ -384,7 +384,6 @@ class DefaultModelLoader(BaseModelLoader):
logger.info_once( logger.info_once(
"Loading weights took %.2f seconds", "Loading weights took %.2f seconds",
self.counter_after_loading_weights - self.counter_before_loading_weights, self.counter_after_loading_weights - self.counter_before_loading_weights,
scope="local",
) )
# We only enable strict check for non-quantized models # We only enable strict check for non-quantized models
# that have loaded weights tracking currently. # that have loaded weights tracking currently.
......
...@@ -157,7 +157,6 @@ class ShardedStateLoader(BaseModelLoader): ...@@ -157,7 +157,6 @@ class ShardedStateLoader(BaseModelLoader):
logger.info_once( logger.info_once(
"Loading weights took %.2f seconds", "Loading weights took %.2f seconds",
counter_after_loading_weights - counter_before_loading_weights, counter_after_loading_weights - counter_before_loading_weights,
scope="local",
) )
if state_dict: if state_dict:
raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!") raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
......
...@@ -118,11 +118,9 @@ def set_offloader(instance: BaseOffloader) -> None: ...@@ -118,11 +118,9 @@ def set_offloader(instance: BaseOffloader) -> None:
global _instance global _instance
_instance = instance _instance = instance
if isinstance(instance, NoopOffloader): if isinstance(instance, NoopOffloader):
logger.debug_once( logger.debug_once("Offloader set to NoopOffloader (no offloading).")
"Offloader set to NoopOffloader (no offloading).", scope="local"
)
else: else:
logger.info_once("Offloader set to %s", type(instance).__name__, scope="local") logger.info_once("Offloader set to %s", type(instance).__name__)
def create_offloader(offload_config: "OffloadConfig") -> BaseOffloader: def create_offloader(offload_config: "OffloadConfig") -> BaseOffloader:
......
...@@ -369,7 +369,6 @@ class CudaPlatformBase(Platform): ...@@ -369,7 +369,6 @@ class CudaPlatformBase(Platform):
"Using %s attention backend out of potential backends: %s.", "Using %s attention backend out of potential backends: %s.",
selected_backend.name, selected_backend.name,
"[" + ", ".join(f"'{b[0].name}'" for b in valid_backends_priorities) + "]", "[" + ", ".join(f"'{b[0].name}'" for b in valid_backends_priorities) + "]",
scope="local",
) )
return selected_backend.get_path() return selected_backend.get_path()
...@@ -423,7 +422,6 @@ class CudaPlatformBase(Platform): ...@@ -423,7 +422,6 @@ class CudaPlatformBase(Platform):
if is_backend_supported: if is_backend_supported:
logger.info_once( logger.info_once(
f"Using backend {vit_attn_backend} for vit attention", f"Using backend {vit_attn_backend} for vit attention",
scope="local",
) )
return vit_attn_backend return vit_attn_backend
except ImportError: except ImportError:
......
...@@ -63,7 +63,7 @@ class WorkerProfiler(ABC): ...@@ -63,7 +63,7 @@ class WorkerProfiler(ABC):
"""Call _stop with error handling but no safeguards.""" """Call _stop with error handling but no safeguards."""
try: try:
self._stop() self._stop()
logger.info_once("Profiler stopped successfully.", scope="local") logger.info_once("Profiler stopped successfully.")
except Exception as e: except Exception as e:
logger.warning("Failed to stop profiler: %s", e) logger.warning("Failed to stop profiler: %s", e)
self._running = False # Always mark as not running, assume stop worked self._running = False # Always mark as not running, assume stop worked
...@@ -93,7 +93,7 @@ class WorkerProfiler(ABC): ...@@ -93,7 +93,7 @@ class WorkerProfiler(ABC):
and self._delay_iters > 0 and self._delay_iters > 0
and self._active_iteration_count == self._delay_iters and self._active_iteration_count == self._delay_iters
): ):
logger.info_once("Starting profiler after delay...", scope="local") logger.info_once("Starting profiler after delay...")
self._call_start() self._call_start()
# Call profiler step for schedule-based profiling # Call profiler step for schedule-based profiling
...@@ -109,9 +109,7 @@ class WorkerProfiler(ABC): ...@@ -109,9 +109,7 @@ class WorkerProfiler(ABC):
# Automatically stop the profiler after max iters # Automatically stop the profiler after max iters
# will be marked as not running, but leave as active so that stop # will be marked as not running, but leave as active so that stop
# can clean up properly # can clean up properly
logger.info_once( logger.info_once("Max profiling iterations reached. Stopping profiler...")
"Max profiling iterations reached. Stopping profiler...", scope="local"
)
self._call_stop() self._call_stop()
return return
...@@ -141,7 +139,7 @@ class WorkerProfiler(ABC): ...@@ -141,7 +139,7 @@ class WorkerProfiler(ABC):
def shutdown(self) -> None: def shutdown(self) -> None:
"""Ensure profiler is stopped when shutting down.""" """Ensure profiler is stopped when shutting down."""
logger.info_once("Shutting down profiler", scope="local") logger.info_once("Shutting down profiler")
if self._running: if self._running:
self.stop() self.stop()
...@@ -176,7 +174,6 @@ class TorchProfilerWrapper(WorkerProfiler): ...@@ -176,7 +174,6 @@ class TorchProfilerWrapper(WorkerProfiler):
logger.info_once( logger.info_once(
"Torch profiling enabled. Traces will be saved to: %s", "Torch profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir, torch_profiler_trace_dir,
scope="local",
) )
logger.debug( logger.debug(
"Profiler config: record_shapes=%s," "Profiler config: record_shapes=%s,"
...@@ -216,7 +213,6 @@ class TorchProfilerWrapper(WorkerProfiler): ...@@ -216,7 +213,6 @@ class TorchProfilerWrapper(WorkerProfiler):
profiler_config.wait_iterations, profiler_config.wait_iterations,
profiler_config.warmup_iterations, profiler_config.warmup_iterations,
profiler_config.active_iterations, profiler_config.active_iterations,
scope="local",
) )
self.profiler = torch.profiler.profile( self.profiler = torch.profiler.profile(
......
...@@ -106,16 +106,14 @@ def is_deep_gemm_e8m0_used() -> bool: ...@@ -106,16 +106,14 @@ def is_deep_gemm_e8m0_used() -> bool:
_lazy_init() _lazy_init()
if _fp8_gemm_nt_impl is None: if _fp8_gemm_nt_impl is None:
logger.info_once( logger.info_once("DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found")
"DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found", scope="local"
)
return False return False
if envs.VLLM_USE_DEEP_GEMM_E8M0: if envs.VLLM_USE_DEEP_GEMM_E8M0:
logger.info_once("DeepGEMM E8M0 enabled on current platform.", scope="local") logger.info_once("DeepGEMM E8M0 enabled on current platform.")
return True return True
logger.info_once("DeepGEMM E8M0 disabled on current configuration.", scope="local") logger.info_once("DeepGEMM E8M0 disabled on current configuration.")
return False return False
......
...@@ -66,14 +66,12 @@ def import_triton_kernels(): ...@@ -66,14 +66,12 @@ def import_triton_kernels():
logger.debug_once( logger.debug_once(
f"Loading module triton_kernels from {triton_kernels.__file__}.", f"Loading module triton_kernels from {triton_kernels.__file__}.",
scope="local",
) )
elif _has_module("vllm.third_party.triton_kernels"): elif _has_module("vllm.third_party.triton_kernels"):
import vllm.third_party.triton_kernels as triton_kernels import vllm.third_party.triton_kernels as triton_kernels
logger.debug_once( logger.debug_once(
f"Loading module triton_kernels from {triton_kernels.__file__}.", f"Loading module triton_kernels from {triton_kernels.__file__}.",
scope="local",
) )
sys.modules["triton_kernels"] = triton_kernels sys.modules["triton_kernels"] = triton_kernels
else: else:
......
...@@ -118,7 +118,6 @@ def get_flash_attn_version( ...@@ -118,7 +118,6 @@ def get_flash_attn_version(
logger.warning_once( logger.warning_once(
"Cannot use FA version 4 with batch invariance, " "Cannot use FA version 4 with batch invariance, "
"defaulting to FA version 2.", "defaulting to FA version 2.",
scope="local",
) )
fa_version = 2 fa_version = 2
......
...@@ -637,7 +637,6 @@ class FlashAttentionImpl(AttentionImpl): ...@@ -637,7 +637,6 @@ class FlashAttentionImpl(AttentionImpl):
logger.info_once( logger.info_once(
"Using FlashAttention version %s", "Using FlashAttention version %s",
self.vllm_flash_attn_version, self.vllm_flash_attn_version,
scope="local",
) )
# Cache the batch invariant result for use in forward passes # Cache the batch invariant result for use in forward passes
self.batch_invariant_enabled = envs.VLLM_BATCH_INVARIANT self.batch_invariant_enabled = envs.VLLM_BATCH_INVARIANT
......
...@@ -1334,7 +1334,7 @@ def _report_kv_cache_config( ...@@ -1334,7 +1334,7 @@ def _report_kv_cache_config(
dcp_size, dcp_size,
) )
num_tokens_str = f"{num_tokens:,}" num_tokens_str = f"{num_tokens:,}"
logger.info_once("GPU KV cache size: %s tokens", num_tokens_str, scope="local") logger.info_once("GPU KV cache size: %s tokens", num_tokens_str)
max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
max_concurrency = get_max_concurrency_for_kv_cache_config( max_concurrency = get_max_concurrency_for_kv_cache_config(
vllm_config, kv_cache_config vllm_config, kv_cache_config
...@@ -1343,7 +1343,6 @@ def _report_kv_cache_config( ...@@ -1343,7 +1343,6 @@ def _report_kv_cache_config(
"Maximum concurrency for %s tokens per request: %.2fx", "Maximum concurrency for %s tokens per request: %.2fx",
max_model_len_str, max_model_len_str,
max_concurrency, max_concurrency,
scope="local",
) )
...@@ -1445,7 +1444,6 @@ def _auto_fit_max_model_len( ...@@ -1445,7 +1444,6 @@ def _auto_fit_max_model_len(
"Auto-fit max_model_len: attention-free model, " "Auto-fit max_model_len: attention-free model, "
"using derived max_model_len=%d", "using derived max_model_len=%d",
original_max, original_max,
scope="local",
) )
return return
...@@ -1472,7 +1470,6 @@ def _auto_fit_max_model_len( ...@@ -1472,7 +1470,6 @@ def _auto_fit_max_model_len(
"Auto-fit max_model_len: full model context length %d fits in " "Auto-fit max_model_len: full model context length %d fits in "
"available GPU memory", "available GPU memory",
original_max, original_max,
scope="local",
) )
else: else:
# Need to reduce max_model_len to fit in memory # Need to reduce max_model_len to fit in memory
...@@ -1483,7 +1480,6 @@ def _auto_fit_max_model_len( ...@@ -1483,7 +1480,6 @@ def _auto_fit_max_model_len(
original_max, original_max,
auto_fit_max, auto_fit_max,
format_gib(limiting_worker_mem), format_gib(limiting_worker_mem),
scope="local",
) )
......
...@@ -293,7 +293,6 @@ class EngineCore: ...@@ -293,7 +293,6 @@ class EngineCore:
compile_time + encoder_compile_time, compile_time + encoder_compile_time,
compile_time, compile_time,
encoder_compile_time, encoder_compile_time,
scope="local",
) )
elif compile_time > 0: elif compile_time > 0:
logger.info_once( logger.info_once(
...@@ -301,13 +300,11 @@ class EngineCore: ...@@ -301,13 +300,11 @@ class EngineCore:
"%.2f s (compilation: %.2f s)", "%.2f s (compilation: %.2f s)",
elapsed, elapsed,
compile_time, compile_time,
scope="local",
) )
else: else:
logger.info_once( logger.info_once(
"init engine (profile, create kv cache, warmup model) took %.2f s", "init engine (profile, create kv cache, warmup model) took %.2f s",
elapsed, elapsed,
scope="local",
) )
return scheduler_kv_cache_config return scheduler_kv_cache_config
......
...@@ -1032,7 +1032,6 @@ def set_multiprocessing_worker_envs(): ...@@ -1032,7 +1032,6 @@ def set_multiprocessing_worker_envs():
"external environment to tune this value as needed.", "external environment to tune this value as needed.",
current_parallelism, current_parallelism,
default_omp_num_threads, default_omp_num_threads,
scope="local",
) )
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
torch.set_num_threads(default_omp_num_threads) torch.set_num_threads(default_omp_num_threads)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment