[Refactor] Clean up log once `scope="local"` (#40540)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

[Refactor] Clean up log once `scope="local"` (#40540)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
8f87eb46 · Wentao Ye · GitHub · cfa49213 · 8f87eb46 · 8f87eb46
Unverified commit 8f87eb46, authored Apr 22, 2026 by Wentao Ye; committed by GitHub on Apr 22, 2026.
20 changed files
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py
@@ -135,7 +135,6 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
                "DeepEPLLPrepareAndFinalize is setup to dispatch raw/unquantized "
                f"activations despite ({fused_experts.__class__.__name__}) being able "
                "to support quantized activations.",
-                scope="local",
            )

    def num_dispatchers(self) -> int:

--- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py
+++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py
@@ -69,16 +69,14 @@ class SharedExperts:
        # TODO: Remove this after more extensive testings with TP/DP
        # and other execution modes
        if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM:
-            logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local")
+            logger.debug_once("Disabling MoE shared_experts cuda stream")
            self._stream = None
        else:
            # TODO(rob): enable shared expert overlap with non-cuda-alike.
            # aux_stream() returns None on non-cuda-alike platforms.
            self._stream = aux_stream()
            if self._stream is not None:
-                logger.debug_once(
-                    "Enabled separate cuda stream for MoE shared_experts", scope="local"
-                )
+                logger.debug_once("Enabled separate cuda stream for MoE shared_experts")

    @property
    def _disable_shared_experts_overlap(self) -> bool:

--- a/vllm/model_executor/layers/mamba/gdn_linear_attn.py
+++ b/vllm/model_executor/layers/mamba/gdn_linear_attn.py
@@ -143,15 +143,14 @@ class ChunkGatedDeltaRule(CustomOp):
            use_flashinfer = supports_flashinfer

        if use_flashinfer:
-            logger.info_once("Using FlashInfer GDN prefill kernel", scope="local")
+            logger.info_once("Using FlashInfer GDN prefill kernel")
            logger.info_once(
                "FlashInfer GDN prefill kernel is JIT-compiled; first run may "
                "take a while to compile. Set `--gdn-prefill-backend triton` to "
                "avoid JIT compile time.",
-                scope="local",
            )
        else:
-            logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local")
+            logger.info_once("Using Triton/FLA GDN prefill kernel")

        self._forward_method = (
            self.forward_cuda if use_flashinfer else self.forward_native

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py
@@ -44,10 +44,10 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod):
        self.use_cutlass_mxfp4 = CutlassExpertsMxfp4._supports_current_device()
        self.experts_cls: type[mk.FusedMoEExperts]
        if self.use_cutlass_mxfp4:
-            logger.info_once("Using CutlassExpertsMxfp4 for MXFP4 MoE", scope="local")
+            logger.info_once("Using CutlassExpertsMxfp4 for MXFP4 MoE")
            self.experts_cls = CutlassExpertsMxfp4
        else:
-            logger.info_once("Using MarlinExperts for MXFP4 MoE", scope="local")
+            logger.info_once("Using MarlinExperts for MXFP4 MoE")
            self.experts_cls = MarlinExperts

    def create_weights(

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py
@@ -87,7 +87,6 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
        logger.info_once(
            f"Using {self.kernel_backend} backend for WNA16 MoE "
            f"(group_size={self.group_size}, num_bits={self.num_bits})",
-            scope="local",
        )

    def get_weight_shape(

--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -83,7 +83,6 @@ class Mxfp4Config(QuantizationConfig):
            logger.debug_once(
                "MXFP4 linear layer is not implemented - falling back to "
                "UnquantizedLinearMethod.",
-                scope="local",
            )
            return UnquantizedLinearMethod()
        elif isinstance(layer, FusedMoE):
@@ -92,7 +91,6 @@ class Mxfp4Config(QuantizationConfig):
            logger.debug_once(
                "MXFP4 attention layer is not implemented. "
                "Skipping quantization for this layer.",
-                scope="local",
            )
        return None


--- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -111,7 +111,6 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend:
            logger.info_once(
                "Flashinfer TRTLLM MOE backend is only supported on "
                "SM100 and later, using CUTLASS backend instead",
-                scope="local",
            )
            return FlashinferMoeBackend.CUTLASS
        return backend_map[flashinfer_moe_backend]
@@ -239,7 +238,6 @@ def align_fp4_moe_weights_for_fi(
        "Padding intermediate size from %d to %d for up/down projection weights.",
        intermediate,
        padded_intermediate,
-        scope="local",
    )

    up_mult = 2 if is_act_and_mul else 1
@@ -285,7 +283,6 @@ def align_trtllm_fp4_moe_hidden_dim_for_fi(
        "performance degradation.",
        hidden_size,
        padded_hidden_size,
-        scope="local",
    )

    padded_w13 = w13.new_zeros((num_experts, gate_up_dim, padded_hidden_size // 2))
@@ -331,7 +328,6 @@ def align_fp8_moe_weights_for_fi(
        "Padding intermediate size from %d to %d for up/down projection weights.",
        intermediate,
        padded_intermediate,
-        scope="local",
    )

    up_mult = 2 if is_act_and_mul else 1

--- a/vllm/model_executor/model_loader/base_loader.py
+++ b/vllm/model_executor/model_loader/base_loader.py
@@ -70,7 +70,6 @@ class BaseModelLoader(ABC):
                logger.debug_once(
                    "Peak GPU memory after loading weights: %s GiB",
                    format_gib(peak_memory),
-                    scope="local",
                )

            # Process weights into kernel format. Note that when using online

--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -384,7 +384,6 @@ class DefaultModelLoader(BaseModelLoader):
        logger.info_once(
            "Loading weights took %.2f seconds",
            self.counter_after_loading_weights - self.counter_before_loading_weights,
-            scope="local",
        )
        # We only enable strict check for non-quantized models
        # that have loaded weights tracking currently.

--- a/vllm/model_executor/model_loader/sharded_state_loader.py
+++ b/vllm/model_executor/model_loader/sharded_state_loader.py
@@ -157,7 +157,6 @@ class ShardedStateLoader(BaseModelLoader):
        logger.info_once(
            "Loading weights took %.2f seconds",
            counter_after_loading_weights - counter_before_loading_weights,
-            scope="local",
        )
        if state_dict:
            raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")

--- a/vllm/model_executor/offloader/base.py
+++ b/vllm/model_executor/offloader/base.py
@@ -118,11 +118,9 @@ def set_offloader(instance: BaseOffloader) -> None:
    global _instance
    _instance = instance
    if isinstance(instance, NoopOffloader):
-        logger.debug_once(
-            "Offloader set to NoopOffloader (no offloading).", scope="local"
-        )
+        logger.debug_once("Offloader set to NoopOffloader (no offloading).")
    else:
-        logger.info_once("Offloader set to %s", type(instance).__name__, scope="local")
+        logger.info_once("Offloader set to %s", type(instance).__name__)


 def create_offloader(offload_config: "OffloadConfig") -> BaseOffloader:

--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -369,7 +369,6 @@ class CudaPlatformBase(Platform):
            "Using %s attention backend out of potential backends: %s.",
            selected_backend.name,
            "[" + ", ".join(f"'{b[0].name}'" for b in valid_backends_priorities) + "]",
-            scope="local",
        )

        return selected_backend.get_path()
@@ -423,7 +422,6 @@ class CudaPlatformBase(Platform):
                if is_backend_supported:
                    logger.info_once(
                        f"Using backend {vit_attn_backend} for vit attention",
-                        scope="local",
                    )
                    return vit_attn_backend
            except ImportError:

--- a/vllm/profiler/wrapper.py
+++ b/vllm/profiler/wrapper.py
@@ -63,7 +63,7 @@ class WorkerProfiler(ABC):
        """Call _stop with error handling but no safeguards."""
        try:
            self._stop()
-            logger.info_once("Profiler stopped successfully.", scope="local")
+            logger.info_once("Profiler stopped successfully.")
        except Exception as e:
            logger.warning("Failed to stop profiler: %s", e)
        self._running = False  # Always mark as not running, assume stop worked
@@ -93,7 +93,7 @@ class WorkerProfiler(ABC):
            and self._delay_iters > 0
            and self._active_iteration_count == self._delay_iters
        ):
-            logger.info_once("Starting profiler after delay...", scope="local")
+            logger.info_once("Starting profiler after delay...")
            self._call_start()

        # Call profiler step for schedule-based profiling
@@ -109,9 +109,7 @@ class WorkerProfiler(ABC):
            # Automatically stop the profiler after max iters
            # will be marked as not running, but leave as active so that stop
            # can clean up properly
-            logger.info_once(
-                "Max profiling iterations reached. Stopping profiler...", scope="local"
-            )
+            logger.info_once("Max profiling iterations reached. Stopping profiler...")
            self._call_stop()
            return

@@ -141,7 +139,7 @@ class WorkerProfiler(ABC):

    def shutdown(self) -> None:
        """Ensure profiler is stopped when shutting down."""
-        logger.info_once("Shutting down profiler", scope="local")
+        logger.info_once("Shutting down profiler")
        if self._running:
            self.stop()

@@ -176,7 +174,6 @@ class TorchProfilerWrapper(WorkerProfiler):
            logger.info_once(
                "Torch profiling enabled. Traces will be saved to: %s",
                torch_profiler_trace_dir,
-                scope="local",
            )
            logger.debug(
                "Profiler config: record_shapes=%s,"
@@ -216,7 +213,6 @@ class TorchProfilerWrapper(WorkerProfiler):
                    profiler_config.wait_iterations,
                    profiler_config.warmup_iterations,
                    profiler_config.active_iterations,
-                    scope="local",
                )

        self.profiler = torch.profiler.profile(

--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -106,16 +106,14 @@ def is_deep_gemm_e8m0_used() -> bool:
    _lazy_init()

    if _fp8_gemm_nt_impl is None:
-        logger.info_once(
-            "DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found", scope="local"
-        )
+        logger.info_once("DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found")
        return False

    if envs.VLLM_USE_DEEP_GEMM_E8M0:
-        logger.info_once("DeepGEMM E8M0 enabled on current platform.", scope="local")
+        logger.info_once("DeepGEMM E8M0 enabled on current platform.")
        return True

-    logger.info_once("DeepGEMM E8M0 disabled on current configuration.", scope="local")
+    logger.info_once("DeepGEMM E8M0 disabled on current configuration.")
    return False



--- a/vllm/utils/import_utils.py
+++ b/vllm/utils/import_utils.py
@@ -66,14 +66,12 @@ def import_triton_kernels():

        logger.debug_once(
            f"Loading module triton_kernels from {triton_kernels.__file__}.",
-            scope="local",
        )
    elif _has_module("vllm.third_party.triton_kernels"):
        import vllm.third_party.triton_kernels as triton_kernels

        logger.debug_once(
            f"Loading module triton_kernels from {triton_kernels.__file__}.",
-            scope="local",
        )
        sys.modules["triton_kernels"] = triton_kernels
    else:

--- a/vllm/v1/attention/backends/fa_utils.py
+++ b/vllm/v1/attention/backends/fa_utils.py
@@ -118,7 +118,6 @@ def get_flash_attn_version(
            logger.warning_once(
                "Cannot use FA version 4 with batch invariance, "
                "defaulting to FA version 2.",
-                scope="local",
            )
            fa_version = 2


--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -637,7 +637,6 @@ class FlashAttentionImpl(AttentionImpl):
        logger.info_once(
            "Using FlashAttention version %s",
            self.vllm_flash_attn_version,
-            scope="local",
        )
        # Cache the batch invariant result for use in forward passes
        self.batch_invariant_enabled = envs.VLLM_BATCH_INVARIANT

--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1334,7 +1334,7 @@ def _report_kv_cache_config(
            dcp_size,
        )
    num_tokens_str = f"{num_tokens:,}"
-    logger.info_once("GPU KV cache size: %s tokens", num_tokens_str, scope="local")
+    logger.info_once("GPU KV cache size: %s tokens", num_tokens_str)
    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
    max_concurrency = get_max_concurrency_for_kv_cache_config(
        vllm_config, kv_cache_config
@@ -1343,7 +1343,6 @@ def _report_kv_cache_config(
        "Maximum concurrency for %s tokens per request: %.2fx",
        max_model_len_str,
        max_concurrency,
-        scope="local",
    )


@@ -1445,7 +1444,6 @@ def _auto_fit_max_model_len(
            "Auto-fit max_model_len: attention-free model, "
            "using derived max_model_len=%d",
            original_max,
-            scope="local",
        )
        return

@@ -1472,7 +1470,6 @@ def _auto_fit_max_model_len(
            "Auto-fit max_model_len: full model context length %d fits in "
            "available GPU memory",
            original_max,
-            scope="local",
        )
    else:
        # Need to reduce max_model_len to fit in memory
@@ -1483,7 +1480,6 @@ def _auto_fit_max_model_len(
            original_max,
            auto_fit_max,
            format_gib(limiting_worker_mem),
-            scope="local",
        )



--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -293,7 +293,6 @@ class EngineCore:
                compile_time + encoder_compile_time,
                compile_time,
                encoder_compile_time,
-                scope="local",
            )
        elif compile_time > 0:
            logger.info_once(
@@ -301,13 +300,11 @@ class EngineCore:
                "%.2f s (compilation: %.2f s)",
                elapsed,
                compile_time,
-                scope="local",
            )
        else:
            logger.info_once(
                "init engine (profile, create kv cache, warmup model) took %.2f s",
                elapsed,
-                scope="local",
            )
        return scheduler_kv_cache_config


--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -1032,7 +1032,6 @@ def set_multiprocessing_worker_envs():
                "external environment to tune this value as needed.",
                current_parallelism,
                default_omp_num_threads,
-                scope="local",
            )
            os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
            torch.set_num_threads(default_omp_num_threads)