Unverified Commit 8f87eb46 authored by Wentao Ye's avatar Wentao Ye Committed by GitHub
Browse files

[Refactor] Clean up log once `scope="local"` (#40540)


Signed-off-by: default avataryewentao256 <zhyanwentao@126.com>
Signed-off-by: default avatarWentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: default avatargemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent cfa49213
......@@ -135,7 +135,6 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
"DeepEPLLPrepareAndFinalize is setup to dispatch raw/unquantized "
f"activations despite ({fused_experts.__class__.__name__}) being able "
"to support quantized activations.",
scope="local",
)
def num_dispatchers(self) -> int:
......
......@@ -69,16 +69,14 @@ class SharedExperts:
# TODO: Remove this after more extensive testings with TP/DP
# and other execution modes
if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM:
logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local")
logger.debug_once("Disabling MoE shared_experts cuda stream")
self._stream = None
else:
# TODO(rob): enable shared expert overlap with non-cuda-alike.
# aux_stream() returns None on non-cuda-alike platforms.
self._stream = aux_stream()
if self._stream is not None:
logger.debug_once(
"Enabled separate cuda stream for MoE shared_experts", scope="local"
)
logger.debug_once("Enabled separate cuda stream for MoE shared_experts")
@property
def _disable_shared_experts_overlap(self) -> bool:
......
......@@ -143,15 +143,14 @@ class ChunkGatedDeltaRule(CustomOp):
use_flashinfer = supports_flashinfer
if use_flashinfer:
logger.info_once("Using FlashInfer GDN prefill kernel", scope="local")
logger.info_once("Using FlashInfer GDN prefill kernel")
logger.info_once(
"FlashInfer GDN prefill kernel is JIT-compiled; first run may "
"take a while to compile. Set `--gdn-prefill-backend triton` to "
"avoid JIT compile time.",
scope="local",
)
else:
logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local")
logger.info_once("Using Triton/FLA GDN prefill kernel")
self._forward_method = (
self.forward_cuda if use_flashinfer else self.forward_native
......
......@@ -44,10 +44,10 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod):
self.use_cutlass_mxfp4 = CutlassExpertsMxfp4._supports_current_device()
self.experts_cls: type[mk.FusedMoEExperts]
if self.use_cutlass_mxfp4:
logger.info_once("Using CutlassExpertsMxfp4 for MXFP4 MoE", scope="local")
logger.info_once("Using CutlassExpertsMxfp4 for MXFP4 MoE")
self.experts_cls = CutlassExpertsMxfp4
else:
logger.info_once("Using MarlinExperts for MXFP4 MoE", scope="local")
logger.info_once("Using MarlinExperts for MXFP4 MoE")
self.experts_cls = MarlinExperts
def create_weights(
......
......@@ -87,7 +87,6 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
logger.info_once(
f"Using {self.kernel_backend} backend for WNA16 MoE "
f"(group_size={self.group_size}, num_bits={self.num_bits})",
scope="local",
)
def get_weight_shape(
......
......@@ -83,7 +83,6 @@ class Mxfp4Config(QuantizationConfig):
logger.debug_once(
"MXFP4 linear layer is not implemented - falling back to "
"UnquantizedLinearMethod.",
scope="local",
)
return UnquantizedLinearMethod()
elif isinstance(layer, FusedMoE):
......@@ -92,7 +91,6 @@ class Mxfp4Config(QuantizationConfig):
logger.debug_once(
"MXFP4 attention layer is not implemented. "
"Skipping quantization for this layer.",
scope="local",
)
return None
......
......@@ -111,7 +111,6 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend:
logger.info_once(
"Flashinfer TRTLLM MOE backend is only supported on "
"SM100 and later, using CUTLASS backend instead",
scope="local",
)
return FlashinferMoeBackend.CUTLASS
return backend_map[flashinfer_moe_backend]
......@@ -239,7 +238,6 @@ def align_fp4_moe_weights_for_fi(
"Padding intermediate size from %d to %d for up/down projection weights.",
intermediate,
padded_intermediate,
scope="local",
)
up_mult = 2 if is_act_and_mul else 1
......@@ -285,7 +283,6 @@ def align_trtllm_fp4_moe_hidden_dim_for_fi(
"performance degradation.",
hidden_size,
padded_hidden_size,
scope="local",
)
padded_w13 = w13.new_zeros((num_experts, gate_up_dim, padded_hidden_size // 2))
......@@ -331,7 +328,6 @@ def align_fp8_moe_weights_for_fi(
"Padding intermediate size from %d to %d for up/down projection weights.",
intermediate,
padded_intermediate,
scope="local",
)
up_mult = 2 if is_act_and_mul else 1
......
......@@ -70,7 +70,6 @@ class BaseModelLoader(ABC):
logger.debug_once(
"Peak GPU memory after loading weights: %s GiB",
format_gib(peak_memory),
scope="local",
)
# Process weights into kernel format. Note that when using online
......
......@@ -384,7 +384,6 @@ class DefaultModelLoader(BaseModelLoader):
logger.info_once(
"Loading weights took %.2f seconds",
self.counter_after_loading_weights - self.counter_before_loading_weights,
scope="local",
)
# We only enable strict check for non-quantized models
# that have loaded weights tracking currently.
......
......@@ -157,7 +157,6 @@ class ShardedStateLoader(BaseModelLoader):
logger.info_once(
"Loading weights took %.2f seconds",
counter_after_loading_weights - counter_before_loading_weights,
scope="local",
)
if state_dict:
raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
......
......@@ -118,11 +118,9 @@ def set_offloader(instance: BaseOffloader) -> None:
global _instance
_instance = instance
if isinstance(instance, NoopOffloader):
logger.debug_once(
"Offloader set to NoopOffloader (no offloading).", scope="local"
)
logger.debug_once("Offloader set to NoopOffloader (no offloading).")
else:
logger.info_once("Offloader set to %s", type(instance).__name__, scope="local")
logger.info_once("Offloader set to %s", type(instance).__name__)
def create_offloader(offload_config: "OffloadConfig") -> BaseOffloader:
......
......@@ -369,7 +369,6 @@ class CudaPlatformBase(Platform):
"Using %s attention backend out of potential backends: %s.",
selected_backend.name,
"[" + ", ".join(f"'{b[0].name}'" for b in valid_backends_priorities) + "]",
scope="local",
)
return selected_backend.get_path()
......@@ -423,7 +422,6 @@ class CudaPlatformBase(Platform):
if is_backend_supported:
logger.info_once(
f"Using backend {vit_attn_backend} for vit attention",
scope="local",
)
return vit_attn_backend
except ImportError:
......
......@@ -63,7 +63,7 @@ class WorkerProfiler(ABC):
"""Call _stop with error handling but no safeguards."""
try:
self._stop()
logger.info_once("Profiler stopped successfully.", scope="local")
logger.info_once("Profiler stopped successfully.")
except Exception as e:
logger.warning("Failed to stop profiler: %s", e)
self._running = False # Always mark as not running, assume stop worked
......@@ -93,7 +93,7 @@ class WorkerProfiler(ABC):
and self._delay_iters > 0
and self._active_iteration_count == self._delay_iters
):
logger.info_once("Starting profiler after delay...", scope="local")
logger.info_once("Starting profiler after delay...")
self._call_start()
# Call profiler step for schedule-based profiling
......@@ -109,9 +109,7 @@ class WorkerProfiler(ABC):
# Automatically stop the profiler after max iters
# will be marked as not running, but leave as active so that stop
# can clean up properly
logger.info_once(
"Max profiling iterations reached. Stopping profiler...", scope="local"
)
logger.info_once("Max profiling iterations reached. Stopping profiler...")
self._call_stop()
return
......@@ -141,7 +139,7 @@ class WorkerProfiler(ABC):
def shutdown(self) -> None:
"""Ensure profiler is stopped when shutting down."""
logger.info_once("Shutting down profiler", scope="local")
logger.info_once("Shutting down profiler")
if self._running:
self.stop()
......@@ -176,7 +174,6 @@ class TorchProfilerWrapper(WorkerProfiler):
logger.info_once(
"Torch profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir,
scope="local",
)
logger.debug(
"Profiler config: record_shapes=%s,"
......@@ -216,7 +213,6 @@ class TorchProfilerWrapper(WorkerProfiler):
profiler_config.wait_iterations,
profiler_config.warmup_iterations,
profiler_config.active_iterations,
scope="local",
)
self.profiler = torch.profiler.profile(
......
......@@ -106,16 +106,14 @@ def is_deep_gemm_e8m0_used() -> bool:
_lazy_init()
if _fp8_gemm_nt_impl is None:
logger.info_once(
"DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found", scope="local"
)
logger.info_once("DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found")
return False
if envs.VLLM_USE_DEEP_GEMM_E8M0:
logger.info_once("DeepGEMM E8M0 enabled on current platform.", scope="local")
logger.info_once("DeepGEMM E8M0 enabled on current platform.")
return True
logger.info_once("DeepGEMM E8M0 disabled on current configuration.", scope="local")
logger.info_once("DeepGEMM E8M0 disabled on current configuration.")
return False
......
......@@ -66,14 +66,12 @@ def import_triton_kernels():
logger.debug_once(
f"Loading module triton_kernels from {triton_kernels.__file__}.",
scope="local",
)
elif _has_module("vllm.third_party.triton_kernels"):
import vllm.third_party.triton_kernels as triton_kernels
logger.debug_once(
f"Loading module triton_kernels from {triton_kernels.__file__}.",
scope="local",
)
sys.modules["triton_kernels"] = triton_kernels
else:
......
......@@ -118,7 +118,6 @@ def get_flash_attn_version(
logger.warning_once(
"Cannot use FA version 4 with batch invariance, "
"defaulting to FA version 2.",
scope="local",
)
fa_version = 2
......
......@@ -637,7 +637,6 @@ class FlashAttentionImpl(AttentionImpl):
logger.info_once(
"Using FlashAttention version %s",
self.vllm_flash_attn_version,
scope="local",
)
# Cache the batch invariant result for use in forward passes
self.batch_invariant_enabled = envs.VLLM_BATCH_INVARIANT
......
......@@ -1334,7 +1334,7 @@ def _report_kv_cache_config(
dcp_size,
)
num_tokens_str = f"{num_tokens:,}"
logger.info_once("GPU KV cache size: %s tokens", num_tokens_str, scope="local")
logger.info_once("GPU KV cache size: %s tokens", num_tokens_str)
max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
max_concurrency = get_max_concurrency_for_kv_cache_config(
vllm_config, kv_cache_config
......@@ -1343,7 +1343,6 @@ def _report_kv_cache_config(
"Maximum concurrency for %s tokens per request: %.2fx",
max_model_len_str,
max_concurrency,
scope="local",
)
......@@ -1445,7 +1444,6 @@ def _auto_fit_max_model_len(
"Auto-fit max_model_len: attention-free model, "
"using derived max_model_len=%d",
original_max,
scope="local",
)
return
......@@ -1472,7 +1470,6 @@ def _auto_fit_max_model_len(
"Auto-fit max_model_len: full model context length %d fits in "
"available GPU memory",
original_max,
scope="local",
)
else:
# Need to reduce max_model_len to fit in memory
......@@ -1483,7 +1480,6 @@ def _auto_fit_max_model_len(
original_max,
auto_fit_max,
format_gib(limiting_worker_mem),
scope="local",
)
......
......@@ -293,7 +293,6 @@ class EngineCore:
compile_time + encoder_compile_time,
compile_time,
encoder_compile_time,
scope="local",
)
elif compile_time > 0:
logger.info_once(
......@@ -301,13 +300,11 @@ class EngineCore:
"%.2f s (compilation: %.2f s)",
elapsed,
compile_time,
scope="local",
)
else:
logger.info_once(
"init engine (profile, create kv cache, warmup model) took %.2f s",
elapsed,
scope="local",
)
return scheduler_kv_cache_config
......
......@@ -1032,7 +1032,6 @@ def set_multiprocessing_worker_envs():
"external environment to tune this value as needed.",
current_parallelism,
default_omp_num_threads,
scope="local",
)
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
torch.set_num_threads(default_omp_num_threads)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment