Unverified commit 8f87eb46, authored by Wentao Ye, committed by GitHub
Browse files

[Refactor] Clean up log once `scope="local"` (#40540)


Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent cfa49213
...@@ -292,7 +292,6 @@ class CompilerManager: ...@@ -292,7 +292,6 @@ class CompilerManager:
"from the cache, took %.3f s", "from the cache, took %.3f s",
str(compile_range), str(compile_range),
elapsed, elapsed,
scope="local",
) )
return compiled_graph return compiled_graph
...@@ -377,7 +376,6 @@ class CompilerManager: ...@@ -377,7 +376,6 @@ class CompilerManager:
logger.info_once( logger.info_once(
"Cache the graph of compile range %s for later use", "Cache the graph of compile range %s for later use",
str(compile_range), str(compile_range),
scope="local",
) )
logger.debug_once( logger.debug_once(
"Store the %s-th graph for compile range%s from %s via handle %s", "Store the %s-th graph for compile range%s from %s via handle %s",
...@@ -385,7 +383,6 @@ class CompilerManager: ...@@ -385,7 +383,6 @@ class CompilerManager:
str(compile_range), str(compile_range),
self.compiler.name, self.compiler.name,
handle, handle,
scope="local",
) )
# after compiling the last graph, record the end time # after compiling the last graph, record the end time
...@@ -399,7 +396,6 @@ class CompilerManager: ...@@ -399,7 +396,6 @@ class CompilerManager:
"Compiling a graph for compile range %s takes %.2f s", "Compiling a graph for compile range %s takes %.2f s",
str(compile_range), str(compile_range),
elapsed, elapsed,
scope="local",
) )
return compiled_graph return compiled_graph
...@@ -1072,12 +1068,11 @@ class VllmBackend: ...@@ -1072,12 +1068,11 @@ class VllmBackend:
disable_cache = disable_cache or is_ngram_gpu_enabled disable_cache = disable_cache or is_ngram_gpu_enabled
if disable_cache: if disable_cache:
logger.info_once("vLLM's torch.compile cache is disabled.", scope="local") logger.info_once("vLLM's torch.compile cache is disabled.")
else: else:
logger.info_once( logger.info_once(
"Using cache directory: %s for vLLM's torch.compile", "Using cache directory: %s for vLLM's torch.compile",
local_cache_dir, local_cache_dir,
scope="local",
) )
self.compiler_manager.initialize_cache( self.compiler_manager.initialize_cache(
...@@ -1134,9 +1129,7 @@ class VllmBackend: ...@@ -1134,9 +1129,7 @@ class VllmBackend:
from .monitor import torch_compile_start_time from .monitor import torch_compile_start_time
dynamo_time = time.perf_counter() - torch_compile_start_time dynamo_time = time.perf_counter() - torch_compile_start_time
logger.info_once( logger.info_once("Dynamo bytecode transform time: %.2f s", dynamo_time)
"Dynamo bytecode transform time: %.2f s", dynamo_time, scope="local"
)
if self.is_encoder: if self.is_encoder:
self.compilation_config.encoder_compilation_time += dynamo_time self.compilation_config.encoder_compilation_time += dynamo_time
else: else:
...@@ -1215,7 +1208,6 @@ class VllmBackend: ...@@ -1215,7 +1208,6 @@ class VllmBackend:
logger.info_once( logger.info_once(
"Saved compiler manager cache in %.2f seconds.", "Saved compiler manager cache in %.2f seconds.",
elapsed, elapsed,
scope="local",
) )
from torch._guards import detect_fake_mode from torch._guards import detect_fake_mode
...@@ -1254,9 +1246,7 @@ class VllmBackend: ...@@ -1254,9 +1246,7 @@ class VllmBackend:
with open(graph_path, "w") as f: with open(graph_path, "w") as f:
f.write(src) f.write(src)
logger.debug_once( logger.debug_once("Computation graph saved to %s", graph_path)
"Computation graph saved to %s", graph_path, scope="local"
)
self._called = True self._called = True
graph_to_serialize = ( graph_to_serialize = (
......
...@@ -665,7 +665,6 @@ def _support_torch_compile( ...@@ -665,7 +665,6 @@ def _support_torch_compile(
logger.info_once( logger.info_once(
"saved AOT compiled function to %s", "saved AOT compiled function to %s",
self._aot_compilation_path, self._aot_compilation_path,
scope="local",
) )
except Exception as e: except Exception as e:
logger.warning( logger.warning(
......
...@@ -45,7 +45,7 @@ def monitor_torch_compile( ...@@ -45,7 +45,7 @@ def monitor_torch_compile(
else: else:
total_compile_time = time.perf_counter() - torch_compile_start_time total_compile_time = time.perf_counter() - torch_compile_start_time
if compilation_config.mode == CompilationMode.VLLM_COMPILE: if compilation_config.mode == CompilationMode.VLLM_COMPILE:
logger.info_once(message, total_compile_time, scope="local") logger.info_once(message, total_compile_time)
finally: finally:
if depyf_cm is not None: if depyf_cm is not None:
try: try:
...@@ -76,7 +76,6 @@ def monitor_profiling_run() -> Generator[None, None, None]: ...@@ -76,7 +76,6 @@ def monitor_profiling_run() -> Generator[None, None, None]:
logger.info_once( logger.info_once(
"Initial profiling/warmup run took %.2f s", "Initial profiling/warmup run took %.2f s",
elapsed, elapsed,
scope="local",
) )
......
...@@ -239,7 +239,6 @@ class SchedulerConfig: ...@@ -239,7 +239,6 @@ class SchedulerConfig:
logger.info_once( logger.info_once(
"Chunked prefill is enabled with max_num_batched_tokens=%d.", "Chunked prefill is enabled with max_num_batched_tokens=%d.",
self.max_num_batched_tokens, self.max_num_batched_tokens,
scope="local",
) )
if self.max_num_partial_prefills > 1: if self.max_num_partial_prefills > 1:
......
...@@ -716,9 +716,7 @@ class VllmConfig: ...@@ -716,9 +716,7 @@ class VllmConfig:
self.instance_id = f"{time.time_ns()}" self.instance_id = f"{time.time_ns()}"
if self.performance_mode != "balanced": if self.performance_mode != "balanced":
logger.info_once( logger.info_once("Performance mode set to '%s'.", self.performance_mode)
"Performance mode set to '%s'.", self.performance_mode, scope="local"
)
self.try_verify_and_update_config() self.try_verify_and_update_config()
...@@ -818,7 +816,6 @@ class VllmConfig: ...@@ -818,7 +816,6 @@ class VllmConfig:
"Async scheduling not supported with %s-based " "Async scheduling not supported with %s-based "
"speculative decoding and will be disabled.", "speculative decoding and will be disabled.",
self.speculative_config.method, self.speculative_config.method,
scope="local",
) )
self.scheduler_config.async_scheduling = False self.scheduler_config.async_scheduling = False
elif ( elif (
...@@ -828,7 +825,6 @@ class VllmConfig: ...@@ -828,7 +825,6 @@ class VllmConfig:
logger.warning_once( logger.warning_once(
"Async scheduling is not compatible with " "Async scheduling is not compatible with "
"disable_padded_drafter_batch=True and will be disabled.", "disable_padded_drafter_batch=True and will be disabled.",
scope="local",
) )
self.scheduler_config.async_scheduling = False self.scheduler_config.async_scheduling = False
elif not executor_supports_async_sched: elif not executor_supports_async_sched:
...@@ -836,7 +832,6 @@ class VllmConfig: ...@@ -836,7 +832,6 @@ class VllmConfig:
"Async scheduling will be disabled because it is not supported " "Async scheduling will be disabled because it is not supported "
"with the `%s` distributed executor backend. ", "with the `%s` distributed executor backend. ",
executor_backend, executor_backend,
scope="local",
) )
self.scheduler_config.async_scheduling = False self.scheduler_config.async_scheduling = False
else: else:
...@@ -855,7 +850,6 @@ class VllmConfig: ...@@ -855,7 +850,6 @@ class VllmConfig:
logger.info_once( logger.info_once(
"Disabling NCCL for DP synchronization " "Disabling NCCL for DP synchronization "
"when using async scheduling.", "when using async scheduling.",
scope="local",
) )
self.parallel_config.disable_nccl_for_dp_synchronization = True self.parallel_config.disable_nccl_for_dp_synchronization = True
else: else:
...@@ -870,7 +864,6 @@ class VllmConfig: ...@@ -870,7 +864,6 @@ class VllmConfig:
logger.warning_once( logger.warning_once(
"Disabling cascade attention (not yet compatible with " "Disabling cascade attention (not yet compatible with "
"async speculative decoding).", "async speculative decoding).",
scope="local",
) )
self.model_config.disable_cascade_attn = True self.model_config.disable_cascade_attn = True
...@@ -1231,7 +1224,6 @@ class VllmConfig: ...@@ -1231,7 +1224,6 @@ class VllmConfig:
self.model_config.disable_cascade_attn = True self.model_config.disable_cascade_attn = True
logger.warning_once( logger.warning_once(
"Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.", "Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.",
scope="local",
) )
if self.parallel_config.use_ubatching: if self.parallel_config.use_ubatching:
...@@ -1418,7 +1410,6 @@ class VllmConfig: ...@@ -1418,7 +1410,6 @@ class VllmConfig:
" performance. Consider increasing max_num_batched_tokens to" " performance. Consider increasing max_num_batched_tokens to"
" accommodate the additional draft token slots, or decrease" " accommodate the additional draft token slots, or decrease"
" num_speculative_tokens or max_num_seqs.", " num_speculative_tokens or max_num_seqs.",
scope="local",
) )
max_num_scheduled_tokens = self.scheduler_config.max_num_scheduled_tokens max_num_scheduled_tokens = self.scheduler_config.max_num_scheduled_tokens
......
...@@ -108,9 +108,7 @@ class PyNcclCommunicator: ...@@ -108,9 +108,7 @@ class PyNcclCommunicator:
if self.rank == 0: if self.rank == 0:
# get the unique id from NCCL # get the unique id from NCCL
self.unique_id = self.nccl.ncclGetUniqueId() self.unique_id = self.nccl.ncclGetUniqueId()
logger.info_once( logger.info_once("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
"vLLM is using nccl==%s", self.nccl.ncclGetVersion(), scope="local"
)
else: else:
# construct an empty unique id # construct an empty unique id
self.unique_id = ncclUniqueId() self.unique_id = ncclUniqueId()
......
...@@ -2254,7 +2254,6 @@ class EngineArgs: ...@@ -2254,7 +2254,6 @@ class EngineArgs:
"This model does not officially support disabling chunked prefill. " "This model does not officially support disabling chunked prefill. "
"Disabling this manually may cause the engine to crash " "Disabling this manually may cause the engine to crash "
"or produce incorrect outputs.", "or produce incorrect outputs.",
scope="local",
) )
elif ( elif (
model_config.runner_type == "pooling" model_config.runner_type == "pooling"
...@@ -2265,7 +2264,6 @@ class EngineArgs: ...@@ -2265,7 +2264,6 @@ class EngineArgs:
"This model does not officially support chunked prefill. " "This model does not officially support chunked prefill. "
"Enabling this manually may cause the engine to crash " "Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.", "or produce incorrect outputs.",
scope="local",
) )
if self.enable_prefix_caching is None: if self.enable_prefix_caching is None:
...@@ -2284,7 +2282,6 @@ class EngineArgs: ...@@ -2284,7 +2282,6 @@ class EngineArgs:
"This model does not officially support prefix caching. " "This model does not officially support prefix caching. "
"Enabling this manually may cause the engine to crash " "Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.", "or produce incorrect outputs.",
scope="local",
) )
# Disable chunked prefill and prefix caching for: # Disable chunked prefill and prefix caching for:
......
...@@ -387,7 +387,6 @@ class LoRAModelManager: ...@@ -387,7 +387,6 @@ class LoRAModelManager:
"LoRA is not supported for non-gated MoE gate module." "LoRA is not supported for non-gated MoE gate module."
" %s will be ignored.", " %s will be ignored.",
module_name, module_name,
scope="local",
) )
continue continue
......
...@@ -332,7 +332,6 @@ class Attention(nn.Module, AttentionLayerBase): ...@@ -332,7 +332,6 @@ class Attention(nn.Module, AttentionLayerBase):
logger.warning_once( logger.warning_once(
"Disabling prefix caching for FLASHINFER/TRITON_MLA " "Disabling prefix caching for FLASHINFER/TRITON_MLA "
"with batch invariance, as it is not yet supported.", "with batch invariance, as it is not yet supported.",
scope="local",
) )
cache_config.enable_prefix_caching = False cache_config.enable_prefix_caching = False
......
...@@ -427,7 +427,6 @@ class MLAAttention(nn.Module, AttentionLayerBase): ...@@ -427,7 +427,6 @@ class MLAAttention(nn.Module, AttentionLayerBase):
logger.warning_once( logger.warning_once(
"Disabling prefix caching for TRITON_MLA / FLASHINFER " "Disabling prefix caching for TRITON_MLA / FLASHINFER "
"with batch invariance, as it is not yet supported.", "with batch invariance, as it is not yet supported.",
scope="local",
) )
cache_config.enable_prefix_caching = False cache_config.enable_prefix_caching = False
...@@ -1523,9 +1522,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): ...@@ -1523,9 +1522,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
if use_fp8: if use_fp8:
fp8_dtype = current_platform.fp8_dtype() fp8_dtype = current_platform.fp8_dtype()
logger.info_once( logger.info_once("FP8 prefill attention enabled: query data type is FP8")
"FP8 prefill attention enabled: query data type is FP8", scope="local"
)
return fp8_dtype return fp8_dtype
elif vllm_config.attention_config.use_prefill_query_quantization: elif vllm_config.attention_config.use_prefill_query_quantization:
logger.info_once( logger.info_once(
...@@ -1533,7 +1530,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): ...@@ -1533,7 +1530,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
" use_prefill_query_quantization is enabled. Please" " use_prefill_query_quantization is enabled. Please"
" ensure that --kv-cache-dtype is set to fp8 and your prefill" " ensure that --kv-cache-dtype is set to fp8 and your prefill"
" backend is compatible with FP8 attention.", " backend is compatible with FP8 attention.",
scope="local",
) )
return model_dtype return model_dtype
elif ( elif (
...@@ -1547,7 +1543,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): ...@@ -1547,7 +1543,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
"prefill latency. To enable, add: " "prefill latency. To enable, add: "
'--attention-config \'{"use_prefill_query_quantization"' '--attention-config \'{"use_prefill_query_quantization"'
": true}'", ": true}'",
scope="local",
) )
return model_dtype return model_dtype
...@@ -2225,21 +2220,19 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): ...@@ -2225,21 +2220,19 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
) )
if use_trtllm_ragged_deepseek_prefill(): if use_trtllm_ragged_deepseek_prefill():
logger.info_once( logger.info_once("Using TRT-LLM ragged DeepSeek prefill for MLA")
"Using TRT-LLM ragged DeepSeek prefill for MLA", scope="local"
)
self._run_prefill_context_chunk = ( self._run_prefill_context_chunk = (
self._run_prefill_context_chunk_trtllm_ragged self._run_prefill_context_chunk_trtllm_ragged
) )
self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged
self._pad_v = False self._pad_v = False
elif use_flashinfer_prefill(): elif use_flashinfer_prefill():
logger.info_once("Using FlashInfer prefill for MLA", scope="local") logger.info_once("Using FlashInfer prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
self._pad_v = False self._pad_v = False
elif use_cudnn_prefill(): elif use_cudnn_prefill():
logger.info_once("Using CUDNN prefill for MLA", scope="local") logger.info_once("Using CUDNN prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn
self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn
self._pad_v = False self._pad_v = False
...@@ -2250,7 +2243,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): ...@@ -2250,7 +2243,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
"available. Please install flash_attn or use " "available. Please install flash_attn or use "
"--attention-backend ROCM_AITER_MLA." "--attention-backend ROCM_AITER_MLA."
) )
logger.info_once("Using FlashAttention prefill for MLA", scope="local") logger.info_once("Using FlashAttention prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa
self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa
......
...@@ -227,9 +227,7 @@ class MMEncoderAttention(CustomOp): ...@@ -227,9 +227,7 @@ class MMEncoderAttention(CustomOp):
if self.attn_backend == AttentionBackendEnum.FLASHINFER: if self.attn_backend == AttentionBackendEnum.FLASHINFER:
_get_flashinfer_workspace_buffer() _get_flashinfer_workspace_buffer()
logger.info_once( logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
)
@classmethod @classmethod
def enabled(cls) -> bool: def enabled(cls) -> bool:
......
...@@ -1020,7 +1020,7 @@ def override_envs_for_invariance( ...@@ -1020,7 +1020,7 @@ def override_envs_for_invariance(
"You are using a non-decode-invariant form of batch invariance. " "You are using a non-decode-invariant form of batch invariance. "
"This will not be invariant between prefill and decode." "This will not be invariant between prefill and decode."
) )
logger.warning_once(warning, scope="local") logger.warning_once(warning)
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0" os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
......
...@@ -369,7 +369,6 @@ class BatchedDeepGemmExperts(mk.FusedMoEExpertsModular): ...@@ -369,7 +369,6 @@ class BatchedDeepGemmExperts(mk.FusedMoEExpertsModular):
logger.warning_once( logger.warning_once(
"DPMetadata unavailable. Defaulting expected_m to " "DPMetadata unavailable. Defaulting expected_m to "
f"{max_tokens_per_expert}.", f"{max_tokens_per_expert}.",
scope="local",
) )
return max_tokens_per_expert return max_tokens_per_expert
......
...@@ -1091,7 +1091,6 @@ def get_moe_configs( ...@@ -1091,7 +1091,6 @@ def get_moe_configs(
"Using default MoE config. Performance might be sub-optimal! " "Using default MoE config. Performance might be sub-optimal! "
"Config file not found at %s", "Config file not found at %s",
", ".join(config_file_paths), ", ".join(config_file_paths),
scope="local",
) )
return None return None
......
...@@ -123,7 +123,6 @@ class NixlEPPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -123,7 +123,6 @@ class NixlEPPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
"NixlEPPrepareAndFinalize is setup to dispatch raw/unquantized " "NixlEPPrepareAndFinalize is setup to dispatch raw/unquantized "
f"activations despite ({fused_experts.__class__.__name__}) being able " f"activations despite ({fused_experts.__class__.__name__}) being able "
"to support quantized activations.", "to support quantized activations.",
scope="local",
) )
def num_dispatchers(self) -> int: def num_dispatchers(self) -> int:
......
...@@ -266,7 +266,7 @@ def select_fp8_moe_backend( ...@@ -266,7 +266,7 @@ def select_fp8_moe_backend(
k_cls, config, weight_key, activation_key, activation_format k_cls, config, weight_key, activation_key, activation_format
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
raise ValueError(_make_log_unsupported(backend, reason)) raise ValueError(_make_log_unsupported(backend, reason))
...@@ -337,12 +337,10 @@ def select_fp8_moe_backend( ...@@ -337,12 +337,10 @@ def select_fp8_moe_backend(
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
else: else:
logger.debug_once( logger.debug_once(_make_log_unsupported(backend, reason))
_make_log_unsupported(backend, reason), scope="local"
)
raise NotImplementedError( raise NotImplementedError(
"Found VLLM_USE_FLASHINFER_MOE_FP8=1, but no " "Found VLLM_USE_FLASHINFER_MOE_FP8=1, but no "
...@@ -396,10 +394,10 @@ def select_fp8_moe_backend( ...@@ -396,10 +394,10 @@ def select_fp8_moe_backend(
activation_format, activation_format,
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
else: else:
logger.debug_once(_make_log_unsupported(backend, reason), scope="local") logger.debug_once(_make_log_unsupported(backend, reason))
# TODO(rob): per discussion with TPU team, we need a way to register # TODO(rob): per discussion with TPU team, we need a way to register
# MoE backends by OOT plugins, rather than having an explicit list # MoE backends by OOT plugins, rather than having an explicit list
...@@ -580,7 +578,7 @@ def make_fp8_moe_kernel( ...@@ -580,7 +578,7 @@ def make_fp8_moe_kernel(
) )
assert prepare_finalize is not None assert prepare_finalize is not None
logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local") logger.info_once("Using %s", prepare_finalize.__class__.__name__)
# Create Experts. # Create Experts.
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
......
...@@ -117,7 +117,7 @@ def select_int8_moe_backend( ...@@ -117,7 +117,7 @@ def select_int8_moe_backend(
k_cls, config, weight_key, activation_key, activation_format k_cls, config, weight_key, activation_key, activation_format
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
raise ValueError(_make_log_unsupported(backend, reason)) raise ValueError(_make_log_unsupported(backend, reason))
...@@ -138,10 +138,10 @@ def select_int8_moe_backend( ...@@ -138,10 +138,10 @@ def select_int8_moe_backend(
activation_format, activation_format,
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
else: else:
logger.debug_once(_make_log_unsupported(backend, reason), scope="local") logger.debug_once(_make_log_unsupported(backend, reason))
raise NotImplementedError( raise NotImplementedError(
"No Int8 MoE backend supports the deployment configuration." "No Int8 MoE backend supports the deployment configuration."
...@@ -193,7 +193,7 @@ def make_int8_moe_kernel( ...@@ -193,7 +193,7 @@ def make_int8_moe_kernel(
) )
assert prepare_finalize is not None assert prepare_finalize is not None
logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local") logger.info_once("Using %s", prepare_finalize.__class__.__name__)
# Create Experts. # Create Experts.
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
......
...@@ -269,7 +269,7 @@ def select_gpt_oss_mxfp4_moe_backend( ...@@ -269,7 +269,7 @@ def select_gpt_oss_mxfp4_moe_backend(
k_cls, config, weight_key, activation_key, activation_format k_cls, config, weight_key, activation_key, activation_format
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
raise ValueError(_make_log_unsupported(backend, reason)) raise ValueError(_make_log_unsupported(backend, reason))
...@@ -363,10 +363,10 @@ def select_gpt_oss_mxfp4_moe_backend( ...@@ -363,10 +363,10 @@ def select_gpt_oss_mxfp4_moe_backend(
k_cls, config, kMxfp4Static, activation_key, activation_format k_cls, config, kMxfp4Static, activation_key, activation_format
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
else: else:
logger.debug_once(_make_log_unsupported(backend, reason), scope="local") logger.debug_once(_make_log_unsupported(backend, reason))
if current_platform.is_xpu(): if current_platform.is_xpu():
backend = Mxfp4MoeBackend.XPU backend = Mxfp4MoeBackend.XPU
...@@ -861,7 +861,7 @@ def make_mxfp4_moe_kernel( ...@@ -861,7 +861,7 @@ def make_mxfp4_moe_kernel(
) )
assert prepare_finalize is not None assert prepare_finalize is not None
logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local") logger.info_once("Using %s", prepare_finalize.__class__.__name__)
# Create Experts. # Create Experts.
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
......
...@@ -252,12 +252,10 @@ def select_nvfp4_moe_backend( ...@@ -252,12 +252,10 @@ def select_nvfp4_moe_backend(
activation_format, activation_format,
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
else: else:
logger.debug_once( logger.debug_once(_make_log_unsupported(backend, reason))
_make_log_unsupported(backend, reason), scope="local"
)
raise NotImplementedError( raise NotImplementedError(
"Found VLLM_USE_FLASHINFER_MOE_FP4=1, but no " "Found VLLM_USE_FLASHINFER_MOE_FP4=1, but no "
...@@ -282,10 +280,10 @@ def select_nvfp4_moe_backend( ...@@ -282,10 +280,10 @@ def select_nvfp4_moe_backend(
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
else: else:
logger.debug_once(_make_log_unsupported(backend, reason), scope="local") logger.debug_once(_make_log_unsupported(backend, reason))
raise NotImplementedError( raise NotImplementedError(
"No NvFp4 MoE backend supports the deployment configuration." "No NvFp4 MoE backend supports the deployment configuration."
......
...@@ -210,7 +210,7 @@ def select_unquantized_moe_backend( ...@@ -210,7 +210,7 @@ def select_unquantized_moe_backend(
k_cls, config, None, None, activation_format k_cls, config, None, None, activation_format
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
raise ValueError(_make_log_unsupported(backend, reason)) raise ValueError(_make_log_unsupported(backend, reason))
...@@ -271,12 +271,10 @@ def select_unquantized_moe_backend( ...@@ -271,12 +271,10 @@ def select_unquantized_moe_backend(
k_cls, moe_config, None, None, activation_format k_cls, moe_config, None, None, activation_format
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
else: else:
logger.debug_once( logger.debug_once(_make_log_unsupported(backend, reason))
_make_log_unsupported(backend, reason), scope="local"
)
raise NotImplementedError( raise NotImplementedError(
"Found VLLM_USE_FLASHINFER_MOE_FP16=1, but no " "Found VLLM_USE_FLASHINFER_MOE_FP16=1, but no "
...@@ -298,10 +296,10 @@ def select_unquantized_moe_backend( ...@@ -298,10 +296,10 @@ def select_unquantized_moe_backend(
k_cls, moe_config, None, None, activation_format k_cls, moe_config, None, None, activation_format
) )
if supported: if supported:
logger.info_once(_make_log_backend(backend), scope="local") logger.info_once(_make_log_backend(backend))
return backend, k_cls return backend, k_cls
logger.debug_once(_make_log_unsupported(backend, reason), scope="local") logger.debug_once(_make_log_unsupported(backend, reason))
raise NotImplementedError( raise NotImplementedError(
"No Unquantized MoE backend supports the deployment configuration." "No Unquantized MoE backend supports the deployment configuration."
...@@ -355,7 +353,7 @@ def make_unquantized_moe_kernel( ...@@ -355,7 +353,7 @@ def make_unquantized_moe_kernel(
) )
assert prepare_finalize is not None assert prepare_finalize is not None
logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local") logger.info_once("Using %s", prepare_finalize.__class__.__name__)
# Create Experts # Create Experts
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment