[Refactor] Clean up log once `scope="local"` (#40540)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

[Refactor] Clean up log once `scope="local"` (#40540)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
8f87eb46 · Wentao Ye · GitHub · cfa49213 · 8f87eb46 · 8f87eb46
Unverified commit 8f87eb46, authored Apr 22, 2026 by Wentao Ye; committed by GitHub on Apr 22, 2026.
20 changed files
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -292,7 +292,6 @@ class CompilerManager:
                    "from the cache, took %.3f s",
                    str(compile_range),
                    elapsed,
-                    scope="local",
                )
            return compiled_graph

@@ -377,7 +376,6 @@ class CompilerManager:
                logger.info_once(
                    "Cache the graph of compile range %s for later use",
                    str(compile_range),
-                    scope="local",
                )
            logger.debug_once(
                "Store the %s-th graph for compile range%s from %s via handle %s",
@@ -385,7 +383,6 @@ class CompilerManager:
                str(compile_range),
                self.compiler.name,
                handle,
-                scope="local",
            )

        # after compiling the last graph, record the end time
@@ -399,7 +396,6 @@ class CompilerManager:
                "Compiling a graph for compile range %s takes %.2f s",
                str(compile_range),
                elapsed,
-                scope="local",
            )

        return compiled_graph
@@ -1072,12 +1068,11 @@ class VllmBackend:
        disable_cache = disable_cache or is_ngram_gpu_enabled

        if disable_cache:
-            logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
+            logger.info_once("vLLM's torch.compile cache is disabled.")
        else:
            logger.info_once(
                "Using cache directory: %s for vLLM's torch.compile",
                local_cache_dir,
-                scope="local",
            )

        self.compiler_manager.initialize_cache(
@@ -1134,9 +1129,7 @@ class VllmBackend:
        from .monitor import torch_compile_start_time

        dynamo_time = time.perf_counter() - torch_compile_start_time
-        logger.info_once(
-            "Dynamo bytecode transform time: %.2f s", dynamo_time, scope="local"
-        )
+        logger.info_once("Dynamo bytecode transform time: %.2f s", dynamo_time)
        if self.is_encoder:
            self.compilation_config.encoder_compilation_time += dynamo_time
        else:
@@ -1215,7 +1208,6 @@ class VllmBackend:
            logger.info_once(
                "Saved compiler manager cache in %.2f seconds.",
                elapsed,
-                scope="local",
            )

        from torch._guards import detect_fake_mode
@@ -1254,9 +1246,7 @@ class VllmBackend:
            with open(graph_path, "w") as f:
                f.write(src)

-            logger.debug_once(
-                "Computation graph saved to %s", graph_path, scope="local"
-            )
+            logger.debug_once("Computation graph saved to %s", graph_path)

        self._called = True
        graph_to_serialize = (

--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -665,7 +665,6 @@ def _support_torch_compile(
            logger.info_once(
                "saved AOT compiled function to %s",
                self._aot_compilation_path,
-                scope="local",
            )
        except Exception as e:
            logger.warning(

--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -45,7 +45,7 @@ def monitor_torch_compile(
    else:
        total_compile_time = time.perf_counter() - torch_compile_start_time
        if compilation_config.mode == CompilationMode.VLLM_COMPILE:
-            logger.info_once(message, total_compile_time, scope="local")
+            logger.info_once(message, total_compile_time)
    finally:
        if depyf_cm is not None:
            try:
@@ -76,7 +76,6 @@ def monitor_profiling_run() -> Generator[None, None, None]:
    logger.info_once(
        "Initial profiling/warmup run took %.2f s",
        elapsed,
-        scope="local",
    )



--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -239,7 +239,6 @@ class SchedulerConfig:
            logger.info_once(
                "Chunked prefill is enabled with max_num_batched_tokens=%d.",
                self.max_num_batched_tokens,
-                scope="local",
            )

        if self.max_num_partial_prefills > 1:

--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -716,9 +716,7 @@ class VllmConfig:
        self.instance_id = f"{time.time_ns()}"

        if self.performance_mode != "balanced":
-            logger.info_once(
-                "Performance mode set to '%s'.", self.performance_mode, scope="local"
-            )
+            logger.info_once("Performance mode set to '%s'.", self.performance_mode)

        self.try_verify_and_update_config()

@@ -818,7 +816,6 @@ class VllmConfig:
                    "Async scheduling not supported with %s-based "
                    "speculative decoding and will be disabled.",
                    self.speculative_config.method,
-                    scope="local",
                )
                self.scheduler_config.async_scheduling = False
            elif (
@@ -828,7 +825,6 @@ class VllmConfig:
                logger.warning_once(
                    "Async scheduling is not compatible with "
                    "disable_padded_drafter_batch=True and will be disabled.",
-                    scope="local",
                )
                self.scheduler_config.async_scheduling = False
            elif not executor_supports_async_sched:
@@ -836,7 +832,6 @@ class VllmConfig:
                    "Async scheduling will be disabled because it is not supported "
                    "with the `%s` distributed executor backend. ",
                    executor_backend,
-                    scope="local",
                )
                self.scheduler_config.async_scheduling = False
            else:
@@ -855,7 +850,6 @@ class VllmConfig:
                    logger.info_once(
                        "Disabling NCCL for DP synchronization "
                        "when using async scheduling.",
-                        scope="local",
                    )
                self.parallel_config.disable_nccl_for_dp_synchronization = True
            else:
@@ -870,7 +864,6 @@ class VllmConfig:
            logger.warning_once(
                "Disabling cascade attention (not yet compatible with "
                "async speculative decoding).",
-                scope="local",
            )
            self.model_config.disable_cascade_attn = True

@@ -1231,7 +1224,6 @@ class VllmConfig:
            self.model_config.disable_cascade_attn = True
            logger.warning_once(
                "Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.",
-                scope="local",
            )

        if self.parallel_config.use_ubatching:
@@ -1418,7 +1410,6 @@ class VllmConfig:
                    " performance. Consider increasing max_num_batched_tokens to"
                    " accommodate the additional draft token slots, or decrease"
                    " num_speculative_tokens or max_num_seqs.",
-                    scope="local",
                )

            max_num_scheduled_tokens = self.scheduler_config.max_num_scheduled_tokens

--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -108,9 +108,7 @@ class PyNcclCommunicator:
        if self.rank == 0:
            # get the unique id from NCCL
            self.unique_id = self.nccl.ncclGetUniqueId()
-            logger.info_once(
-                "vLLM is using nccl==%s", self.nccl.ncclGetVersion(), scope="local"
-            )
+            logger.info_once("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
        else:
            # construct an empty unique id
            self.unique_id = ncclUniqueId()

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -2254,7 +2254,6 @@ class EngineArgs:
                "This model does not officially support disabling chunked prefill. "
                "Disabling this manually may cause the engine to crash "
                "or produce incorrect outputs.",
-                scope="local",
            )
        elif (
            model_config.runner_type == "pooling"
@@ -2265,7 +2264,6 @@ class EngineArgs:
                "This model does not officially support chunked prefill. "
                "Enabling this manually may cause the engine to crash "
                "or produce incorrect outputs.",
-                scope="local",
            )

        if self.enable_prefix_caching is None:
@@ -2284,7 +2282,6 @@ class EngineArgs:
                "This model does not officially support prefix caching. "
                "Enabling this manually may cause the engine to crash "
                "or produce incorrect outputs.",
-                scope="local",
            )

        # Disable chunked prefill and prefix caching for:

--- a/vllm/lora/model_manager.py
+++ b/vllm/lora/model_manager.py
@@ -387,7 +387,6 @@ class LoRAModelManager:
                    "LoRA is not supported for non-gated MoE gate module."
                    " %s will be ignored.",
                    module_name,
-                    scope="local",
                )
                continue


--- a/vllm/model_executor/layers/attention/attention.py
+++ b/vllm/model_executor/layers/attention/attention.py
@@ -332,7 +332,6 @@ class Attention(nn.Module, AttentionLayerBase):
            logger.warning_once(
                "Disabling prefix caching for FLASHINFER/TRITON_MLA "
                "with batch invariance, as it is not yet supported.",
-                scope="local",
            )
            cache_config.enable_prefix_caching = False


--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -427,7 +427,6 @@ class MLAAttention(nn.Module, AttentionLayerBase):
            logger.warning_once(
                "Disabling prefix caching for TRITON_MLA / FLASHINFER "
                "with batch invariance, as it is not yet supported.",
-                scope="local",
            )
            cache_config.enable_prefix_caching = False

@@ -1523,9 +1522,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):

        if use_fp8:
            fp8_dtype = current_platform.fp8_dtype()
-            logger.info_once(
-                "FP8 prefill attention enabled: query data type is FP8", scope="local"
-            )
+            logger.info_once("FP8 prefill attention enabled: query data type is FP8")
            return fp8_dtype
        elif vllm_config.attention_config.use_prefill_query_quantization:
            logger.info_once(
@@ -1533,7 +1530,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                " use_prefill_query_quantization is enabled. Please"
                " ensure that --kv-cache-dtype is set to fp8 and your prefill"
                " backend is compatible with FP8 attention.",
-                scope="local",
            )
            return model_dtype
        elif (
@@ -1547,7 +1543,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                "prefill latency. To enable, add: "
                '--attention-config \'{"use_prefill_query_quantization"'
                ": true}'",
-                scope="local",
            )

        return model_dtype
@@ -2225,21 +2220,19 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
        )

        if use_trtllm_ragged_deepseek_prefill():
-            logger.info_once(
-                "Using TRT-LLM ragged DeepSeek prefill for MLA", scope="local"
-            )
+            logger.info_once("Using TRT-LLM ragged DeepSeek prefill for MLA")
            self._run_prefill_context_chunk = (
                self._run_prefill_context_chunk_trtllm_ragged
            )
            self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged
            self._pad_v = False
        elif use_flashinfer_prefill():
-            logger.info_once("Using FlashInfer prefill for MLA", scope="local")
+            logger.info_once("Using FlashInfer prefill for MLA")
            self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
            self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
            self._pad_v = False
        elif use_cudnn_prefill():
-            logger.info_once("Using CUDNN prefill for MLA", scope="local")
+            logger.info_once("Using CUDNN prefill for MLA")
            self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn
            self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn
            self._pad_v = False
@@ -2250,7 +2243,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
                    "available. Please install flash_attn or use "
                    "--attention-backend ROCM_AITER_MLA."
                )
-            logger.info_once("Using FlashAttention prefill for MLA", scope="local")
+            logger.info_once("Using FlashAttention prefill for MLA")
            self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa
            self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa


--- a/vllm/model_executor/layers/attention/mm_encoder_attention.py
+++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py
@@ -227,9 +227,7 @@ class MMEncoderAttention(CustomOp):
        if self.attn_backend == AttentionBackendEnum.FLASHINFER:
            _get_flashinfer_workspace_buffer()

-        logger.info_once(
-            f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
-        )
+        logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")

    @classmethod
    def enabled(cls) -> bool:

--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -1020,7 +1020,7 @@ def override_envs_for_invariance(
            "You are using a non-decode-invariant form of batch invariance. "
            "This will not be invariant between prefill and decode."
        )
-        logger.warning_once(warning, scope="local")
+        logger.warning_once(warning)
    os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"

    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

--- a/vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py
@@ -369,7 +369,6 @@ class BatchedDeepGemmExperts(mk.FusedMoEExpertsModular):
            logger.warning_once(
                "DPMetadata unavailable. Defaulting expected_m to "
                f"{max_tokens_per_expert}.",
-                scope="local",
            )
            return max_tokens_per_expert


--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1091,7 +1091,6 @@ def get_moe_configs(
        "Using default MoE config. Performance might be sub-optimal! "
        "Config file not found at %s",
        ", ".join(config_file_paths),
-        scope="local",
    )
    return None


--- a/vllm/model_executor/layers/fused_moe/nixl_ep_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/nixl_ep_prepare_finalize.py
@@ -123,7 +123,6 @@ class NixlEPPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
                "NixlEPPrepareAndFinalize is setup to dispatch raw/unquantized "
                f"activations despite ({fused_experts.__class__.__name__}) being able "
                "to support quantized activations.",
-                scope="local",
            )

    def num_dispatchers(self) -> int:

--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
@@ -266,7 +266,7 @@ def select_fp8_moe_backend(
                k_cls, config, weight_key, activation_key, activation_format
            )
            if supported:
-                logger.info_once(_make_log_backend(backend), scope="local")
+                logger.info_once(_make_log_backend(backend))
                return backend, k_cls
        raise ValueError(_make_log_unsupported(backend, reason))

@@ -337,12 +337,10 @@ def select_fp8_moe_backend(
                    )

                    if supported:
-                        logger.info_once(_make_log_backend(backend), scope="local")
+                        logger.info_once(_make_log_backend(backend))
                        return backend, k_cls
                    else:
-                        logger.debug_once(
-                            _make_log_unsupported(backend, reason), scope="local"
-                        )
+                        logger.debug_once(_make_log_unsupported(backend, reason))

            raise NotImplementedError(
                "Found VLLM_USE_FLASHINFER_MOE_FP8=1, but no "
@@ -396,10 +394,10 @@ def select_fp8_moe_backend(
                activation_format,
            )
            if supported:
-                logger.info_once(_make_log_backend(backend), scope="local")
+                logger.info_once(_make_log_backend(backend))
                return backend, k_cls
            else:
-                logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
+                logger.debug_once(_make_log_unsupported(backend, reason))

    # TODO(rob): per discussion with TPU team, we need a way to register
    # MoE backends by OOT plugins, rather than having an explicit list
@@ -580,7 +578,7 @@ def make_fp8_moe_kernel(
    )
    assert prepare_finalize is not None

-    logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local")
+    logger.info_once("Using %s", prepare_finalize.__class__.__name__)

    # Create Experts.
    if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:

--- a/vllm/model_executor/layers/fused_moe/oracle/int8.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/int8.py
@@ -117,7 +117,7 @@ def select_int8_moe_backend(
                k_cls, config, weight_key, activation_key, activation_format
            )
            if supported:
-                logger.info_once(_make_log_backend(backend), scope="local")
+                logger.info_once(_make_log_backend(backend))
                return backend, k_cls
        raise ValueError(_make_log_unsupported(backend, reason))

@@ -138,10 +138,10 @@ def select_int8_moe_backend(
                activation_format,
            )
            if supported:
-                logger.info_once(_make_log_backend(backend), scope="local")
+                logger.info_once(_make_log_backend(backend))
                return backend, k_cls
            else:
-                logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
+                logger.debug_once(_make_log_unsupported(backend, reason))

    raise NotImplementedError(
        "No Int8 MoE backend supports the deployment configuration."
@@ -193,7 +193,7 @@ def make_int8_moe_kernel(
    )
    assert prepare_finalize is not None

-    logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local")
+    logger.info_once("Using %s", prepare_finalize.__class__.__name__)

    # Create Experts.
    if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:

--- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
@@ -269,7 +269,7 @@ def select_gpt_oss_mxfp4_moe_backend(
                k_cls, config, weight_key, activation_key, activation_format
            )
            if supported:
-                logger.info_once(_make_log_backend(backend), scope="local")
+                logger.info_once(_make_log_backend(backend))
                return backend, k_cls
        raise ValueError(_make_log_unsupported(backend, reason))

@@ -363,10 +363,10 @@ def select_gpt_oss_mxfp4_moe_backend(
                k_cls, config, kMxfp4Static, activation_key, activation_format
            )
            if supported:
-                logger.info_once(_make_log_backend(backend), scope="local")
+                logger.info_once(_make_log_backend(backend))
                return backend, k_cls
            else:
-                logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
+                logger.debug_once(_make_log_unsupported(backend, reason))

    if current_platform.is_xpu():
        backend = Mxfp4MoeBackend.XPU
@@ -861,7 +861,7 @@ def make_mxfp4_moe_kernel(
    )
    assert prepare_finalize is not None

-    logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local")
+    logger.info_once("Using %s", prepare_finalize.__class__.__name__)

    # Create Experts.
    if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:

--- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
@@ -252,12 +252,10 @@ def select_nvfp4_moe_backend(
                        activation_format,
                    )
                    if supported:
-                        logger.info_once(_make_log_backend(backend), scope="local")
+                        logger.info_once(_make_log_backend(backend))
                        return backend, k_cls
                    else:
-                        logger.debug_once(
-                            _make_log_unsupported(backend, reason), scope="local"
-                        )
+                        logger.debug_once(_make_log_unsupported(backend, reason))

            raise NotImplementedError(
                "Found VLLM_USE_FLASHINFER_MOE_FP4=1, but no "
@@ -282,10 +280,10 @@ def select_nvfp4_moe_backend(
            )

            if supported:
-                logger.info_once(_make_log_backend(backend), scope="local")
+                logger.info_once(_make_log_backend(backend))
                return backend, k_cls
            else:
-                logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
+                logger.debug_once(_make_log_unsupported(backend, reason))

    raise NotImplementedError(
        "No NvFp4 MoE backend supports the deployment configuration."

--- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
@@ -210,7 +210,7 @@ def select_unquantized_moe_backend(
            k_cls, config, None, None, activation_format
        )
        if supported:
-            logger.info_once(_make_log_backend(backend), scope="local")
+            logger.info_once(_make_log_backend(backend))
            return backend, k_cls
        raise ValueError(_make_log_unsupported(backend, reason))

@@ -271,12 +271,10 @@ def select_unquantized_moe_backend(
                    k_cls, moe_config, None, None, activation_format
                )
                if supported:
-                    logger.info_once(_make_log_backend(backend), scope="local")
+                    logger.info_once(_make_log_backend(backend))
                    return backend, k_cls
                else:
-                    logger.debug_once(
-                        _make_log_unsupported(backend, reason), scope="local"
-                    )
+                    logger.debug_once(_make_log_unsupported(backend, reason))

            raise NotImplementedError(
                "Found VLLM_USE_FLASHINFER_MOE_FP16=1, but no "
@@ -298,10 +296,10 @@ def select_unquantized_moe_backend(
            k_cls, moe_config, None, None, activation_format
        )
        if supported:
-            logger.info_once(_make_log_backend(backend), scope="local")
+            logger.info_once(_make_log_backend(backend))
            return backend, k_cls

-        logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
+        logger.debug_once(_make_log_unsupported(backend, reason))

    raise NotImplementedError(
        "No Unquantized MoE backend supports the deployment configuration."
@@ -355,7 +353,7 @@ def make_unquantized_moe_kernel(
    )
    assert prepare_finalize is not None

-    logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local")
+    logger.info_once("Using %s", prepare_finalize.__class__.__name__)

    # Create Experts
    if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: