Unverified commit da4e7687, authored by Aaron Pham and committed by GitHub
Browse files

[Fix] Support passing args to logger (#17425)


Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
parent 39317cf4
...@@ -278,7 +278,7 @@ class ModelConfig: ...@@ -278,7 +278,7 @@ class ModelConfig:
max_model_len: int = None # type: ignore max_model_len: int = None # type: ignore
"""Model context length (prompt and output). If unspecified, will be """Model context length (prompt and output). If unspecified, will be
automatically derived from the model config. automatically derived from the model config.
When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
format. Examples:\n format. Examples:\n
- 1k -> 1000\n - 1k -> 1000\n
...@@ -518,11 +518,11 @@ class ModelConfig: ...@@ -518,11 +518,11 @@ class ModelConfig:
self.hf_text_config.sliding_window) self.hf_text_config.sliding_window)
logger.warning_once( logger.warning_once(
f"{self.hf_text_config.model_type} has interleaved " "%s has interleaved attention, which is currently not supported by the %s backend. Disabling sliding window and capping the max length to the sliding window size (%d).", # noqa: E501
"attention, which is currently not supported by the " self.hf_text_config.model_type,
f"{backend} backend. Disabling sliding window and capping " backend,
"the max length to the sliding window size " sliding_window_len_min,
f"({sliding_window_len_min}).") )
self.disable_sliding_window = True self.disable_sliding_window = True
else: else:
# for a model with interleaved attention, # for a model with interleaved attention,
......
...@@ -5,6 +5,7 @@ import json ...@@ -5,6 +5,7 @@ import json
import logging import logging
import os import os
import sys import sys
from collections.abc import Hashable
from functools import lru_cache, partial from functools import lru_cache, partial
from logging import Logger from logging import Logger
from logging.config import dictConfig from logging.config import dictConfig
...@@ -52,15 +53,15 @@ DEFAULT_LOGGING_CONFIG = { ...@@ -52,15 +53,15 @@ DEFAULT_LOGGING_CONFIG = {
@lru_cache @lru_cache
def _print_info_once(logger: Logger, msg: str) -> None: def _print_info_once(logger: Logger, msg: str, *args: Hashable) -> None:
# Set the stacklevel to 2 to print the original caller's line info # Set the stacklevel to 2 to print the original caller's line info
logger.info(msg, stacklevel=2) logger.info(msg, *args, stacklevel=2)
@lru_cache @lru_cache
def _print_warning_once(logger: Logger, msg: str) -> None: def _print_warning_once(logger: Logger, msg: str, *args: Hashable) -> None:
# Set the stacklevel to 2 to print the original caller's line info # Set the stacklevel to 2 to print the original caller's line info
logger.warning(msg, stacklevel=2) logger.warning(msg, *args, stacklevel=2)
class _VllmLogger(Logger): class _VllmLogger(Logger):
...@@ -72,19 +73,19 @@ class _VllmLogger(Logger): ...@@ -72,19 +73,19 @@ class _VllmLogger(Logger):
`intel_extension_for_pytorch.utils._logger`. `intel_extension_for_pytorch.utils._logger`.
""" """
def info_once(self, msg: str) -> None: def info_once(self, msg: str, *args: Hashable) -> None:
""" """
As :meth:`info`, but subsequent calls with the same message As :meth:`info`, but subsequent calls with the same message
are silently dropped. are silently dropped.
""" """
_print_info_once(self, msg) _print_info_once(self, msg, *args)
def warning_once(self, msg: str) -> None: def warning_once(self, msg: str, *args: Hashable) -> None:
""" """
As :meth:`warning`, but subsequent calls with the same message As :meth:`warning`, but subsequent calls with the same message
are silently dropped. are silently dropped.
""" """
_print_warning_once(self, msg) _print_warning_once(self, msg, *args)
def _configure_vllm_root_logger() -> None: def _configure_vllm_root_logger() -> None:
......
...@@ -15,6 +15,5 @@ def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: ...@@ -15,6 +15,5 @@ def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
punica_wrapper = punica_wrapper_cls(*args, **kwargs) punica_wrapper = punica_wrapper_cls(*args, **kwargs)
assert punica_wrapper is not None, \ assert punica_wrapper is not None, \
"the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong." "the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong."
logger.info_once("Using " + punica_wrapper_qualname.rsplit(".", 1)[1] + logger.info_once("Using %s.", punica_wrapper_qualname.rsplit(".", 1)[1])
".")
return punica_wrapper return punica_wrapper
...@@ -107,9 +107,9 @@ class CustomOp(nn.Module): ...@@ -107,9 +107,9 @@ class CustomOp(nn.Module):
custom_ops = compilation_config.custom_ops custom_ops = compilation_config.custom_ops
if not hasattr(cls, "name"): if not hasattr(cls, "name"):
logger.warning_once( logger.warning_once(
f"Custom op {cls.__name__} was not registered, " "Custom op %s was not registered, which means it won't appear in the op registry. It will be enabled/disabled based on the global settings.", # noqa: E501
f"which means it won't appear in the op registry. " cls.__name__,
f"It will be enabled/disabled based on the global settings.") )
return CustomOp.default_on() return CustomOp.default_on()
enabled = f"+{cls.name}" in custom_ops enabled = f"+{cls.name}" in custom_ops
......
...@@ -191,9 +191,9 @@ class GrammarConfig: ...@@ -191,9 +191,9 @@ class GrammarConfig:
if model_with_warn is not None and any_whitespace: if model_with_warn is not None and any_whitespace:
logger.info_once( logger.info_once(
f"{model_with_warn} model detected, consider setting " "%s model detected, consider setting `disable_any_whitespace` to prevent runaway generation of whitespaces.", # noqa: E501
"`disable_any_whitespace` to prevent runaway generation " model_with_warn,
"of whitespaces.") )
# Validate the schema and raise ValueError here if it is invalid. # Validate the schema and raise ValueError here if it is invalid.
# This is to avoid exceptions in model execution, which will crash # This is to avoid exceptions in model execution, which will crash
# the engine worker process. # the engine worker process.
......
...@@ -130,8 +130,9 @@ class AWQMarlinConfig(QuantizationConfig): ...@@ -130,8 +130,9 @@ class AWQMarlinConfig(QuantizationConfig):
# Check if the layer is supported by AWQMarlin. # Check if the layer is supported by AWQMarlin.
if not check_marlin_supports_layer(layer, self.group_size): if not check_marlin_supports_layer(layer, self.group_size):
logger.warning_once( logger.warning_once(
f"Layer '{prefix}' is not supported by AWQMarlin. " "Layer '%s' is not supported by AWQMarlin. Falling back to unoptimized AWQ kernels.", # noqa: E501
"Falling back to unoptimized AWQ kernels.") prefix,
)
return AWQConfig.from_config( return AWQConfig.from_config(
self.full_config).get_quant_method(layer, prefix) self.full_config).get_quant_method(layer, prefix)
return AWQMarlinLinearMethod(self) return AWQMarlinLinearMethod(self)
......
...@@ -464,7 +464,7 @@ def fastsafetensors_weights_iterator( ...@@ -464,7 +464,7 @@ def fastsafetensors_weights_iterator(
hf_weights_files: List[str], hf_weights_files: List[str],
use_tqdm_on_load: bool, use_tqdm_on_load: bool,
) -> Generator[Tuple[str, torch.Tensor], None, None]: ) -> Generator[Tuple[str, torch.Tensor], None, None]:
"""Iterate over the weights in the model safetensor files """Iterate over the weights in the model safetensor files
using fastsafetensor library.""" using fastsafetensor library."""
if torch.distributed.is_initialized(): if torch.distributed.is_initialized():
pg = torch.distributed.group.WORLD pg = torch.distributed.group.WORLD
...@@ -716,10 +716,10 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: ...@@ -716,10 +716,10 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
remapped_name = name.replace(".kv_scale", ".attn.k_scale") remapped_name = name.replace(".kv_scale", ".attn.k_scale")
if remapped_name not in params_dict: if remapped_name not in params_dict:
logger.warning_once( logger.warning_once(
f"Found kv_scale in the checkpoint (e.g. {name}), " "Found kv_scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv_scale is not loaded.", # noqa: E501
"but not found the expected name in the model " name,
f"(e.g. {remapped_name}). kv_scale is " remapped_name,
"not loaded.") )
return None return None
return remapped_name return remapped_name
...@@ -738,10 +738,12 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: ...@@ -738,10 +738,12 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
remapped_name = name.replace(scale_name, f".attn{scale_name}") remapped_name = name.replace(scale_name, f".attn{scale_name}")
if remapped_name not in params_dict: if remapped_name not in params_dict:
logger.warning_once( logger.warning_once(
f"Found {scale_name} in the checkpoint (e.g. {name}), " "Found %s in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). %s is not loaded.", # noqa: E501
"but not found the expected name in the model " scale_name,
f"(e.g. {remapped_name}). {scale_name} is " name,
"not loaded.") remapped_name,
scale_name,
)
return None return None
return remapped_name return remapped_name
......
...@@ -1111,10 +1111,10 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1111,10 +1111,10 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
".kv_scale", ".attn.kv_scale") ".kv_scale", ".attn.kv_scale")
if remapped_kv_scale_name not in params_dict: if remapped_kv_scale_name not in params_dict:
logger.warning_once( logger.warning_once(
"Found kv scale in the checkpoint (e.g. " "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.", # noqa: E501
f"{name}), but not found the expected name in " name,
f"the model (e.g. {remapped_kv_scale_name}). " remapped_kv_scale_name,
"kv-scale is not loaded.") )
continue continue
else: else:
name = remapped_kv_scale_name name = remapped_kv_scale_name
......
...@@ -385,11 +385,10 @@ class OlmoeModel(nn.Module): ...@@ -385,11 +385,10 @@ class OlmoeModel(nn.Module):
".kv_scale", ".attn.kv_scale") ".kv_scale", ".attn.kv_scale")
if remapped_kv_scale_name not in params_dict: if remapped_kv_scale_name not in params_dict:
logger.warning_once( logger.warning_once(
"Found kv scale in the checkpoint " "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.", # noqa: E501
f"(e.g. {name}), but not found the expected " name,
f"name in the model " remapped_kv_scale_name,
f"(e.g. {remapped_kv_scale_name}). " )
"kv-scale is not loaded.")
continue continue
else: else:
name = remapped_kv_scale_name name = remapped_kv_scale_name
......
...@@ -462,11 +462,10 @@ class Qwen2MoeModel(nn.Module): ...@@ -462,11 +462,10 @@ class Qwen2MoeModel(nn.Module):
".kv_scale", ".attn.kv_scale") ".kv_scale", ".attn.kv_scale")
if remapped_kv_scale_name not in params_dict: if remapped_kv_scale_name not in params_dict:
logger.warning_once( logger.warning_once(
"Found kv scale in the checkpoint " "Found kv_scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv_scale is not loaded.", # noqa: E501
f"(e.g. {name}), but not found the expected " name,
f"name in the model " remapped_kv_scale_name,
f"(e.g. {remapped_kv_scale_name}). " )
"kv-scale is not loaded.")
continue continue
else: else:
name = remapped_kv_scale_name name = remapped_kv_scale_name
......
...@@ -459,11 +459,10 @@ class Qwen3MoeModel(nn.Module): ...@@ -459,11 +459,10 @@ class Qwen3MoeModel(nn.Module):
".kv_scale", ".attn.kv_scale") ".kv_scale", ".attn.kv_scale")
if remapped_kv_scale_name not in params_dict: if remapped_kv_scale_name not in params_dict:
logger.warning_once( logger.warning_once(
"Found kv scale in the checkpoint " "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.", # noqa: E501
f"(e.g. {name}), but not found the expected " name,
f"name in the model " remapped_kv_scale_name,
f"(e.g. {remapped_kv_scale_name}). " )
"kv-scale is not loaded.")
continue continue
else: else:
name = remapped_kv_scale_name name = remapped_kv_scale_name
......
...@@ -215,17 +215,14 @@ class MultiModalProfiler(Generic[_I]): ...@@ -215,17 +215,14 @@ class MultiModalProfiler(Generic[_I]):
elif total_len > seq_len and not envs.VLLM_USE_V1: elif total_len > seq_len and not envs.VLLM_USE_V1:
# `max_num_batched_tokens` is defined by `SchedulerConfig` # `max_num_batched_tokens` is defined by `SchedulerConfig`
logger.warning_once( logger.warning_once(
"The encoder sequence length used for profiling (" "The encoder sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) " # noqa: E501
f"max_num_batched_tokens / max_num_seqs = {seq_len}) " "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). " # noqa: E501
" is too short " "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. " # noqa: E501
"to hold the multi-modal embeddings in the worst case " "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.", # noqa: E501
f"({total_len} tokens in total, out of which " seq_len,
f"{self._get_mm_num_tokens(mm_inputs)} are reserved for " total_len,
"multi-modal embeddings). This may cause certain " str(self._get_mm_num_tokens(mm_inputs)),
"multi-modal inputs to fail during inference, even when " )
"the input text is short. To avoid this, you should "
"increase `max_model_len`, reduce `max_num_seqs`, "
"and/or reduce `mm_counts`.")
return DummyEncoderData(encoder_prompt_token_ids) return DummyEncoderData(encoder_prompt_token_ids)
...@@ -243,17 +240,14 @@ class MultiModalProfiler(Generic[_I]): ...@@ -243,17 +240,14 @@ class MultiModalProfiler(Generic[_I]):
if total_len > seq_len and not envs.VLLM_USE_V1: if total_len > seq_len and not envs.VLLM_USE_V1:
# `max_num_batched_tokens` is defined by `SchedulerConfig` # `max_num_batched_tokens` is defined by `SchedulerConfig`
logger.warning_once( logger.warning_once(
"The sequence length used for profiling (" "The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) " # noqa: E501
f"max_num_batched_tokens / max_num_seqs = {seq_len}) " "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). " # noqa: E501
"is too short " "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. " # noqa: E501
"to hold the multi-modal embeddings in the worst case " "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.", # noqa: E501
f"({total_len} tokens in total, out of which " seq_len,
f"{self._get_mm_num_tokens(mm_inputs)} are reserved for " total_len,
"multi-modal embeddings). This may cause certain " str(self._get_mm_num_tokens(mm_inputs)),
"multi-modal inputs to fail during inference, even when " )
"the input text is short. To avoid this, you should "
"increase `max_model_len`, reduce `max_num_seqs`, "
"and/or reduce `mm_counts`.")
if total_len < seq_len: if total_len < seq_len:
prompt_token_ids.extend([0] * (seq_len - total_len)) prompt_token_ids.extend([0] * (seq_len - total_len))
......
...@@ -100,7 +100,7 @@ class MultiModalRegistry: ...@@ -100,7 +100,7 @@ class MultiModalRegistry:
model_config: "ModelConfig", model_config: "ModelConfig",
) -> Mapping[str, int]: ) -> Mapping[str, int]:
""" """
Get the maximum number of tokens per data item from each modality based Get the maximum number of tokens per data item from each modality based
on underlying model configuration. on underlying model configuration.
""" """
if not model_config.is_multimodal_model: if not model_config.is_multimodal_model:
...@@ -126,11 +126,11 @@ class MultiModalRegistry: ...@@ -126,11 +126,11 @@ class MultiModalRegistry:
) -> Mapping[str, int]: ) -> Mapping[str, int]:
""" """
Get the maximum number of tokens per data item from each modality based Get the maximum number of tokens per data item from each modality based
on underlying model configuration, excluding modalities that user on underlying model configuration, excluding modalities that user
explicitly disabled via `limit_mm_per_prompt`. explicitly disabled via `limit_mm_per_prompt`.
Note: Note:
This is currently directly used only in V1 for profiling the memory This is currently directly used only in V1 for profiling the memory
usage of a model. usage of a model.
""" """
mm_limits = self.get_mm_limits_per_prompt(model_config) mm_limits = self.get_mm_limits_per_prompt(model_config)
...@@ -316,7 +316,9 @@ class MultiModalRegistry: ...@@ -316,7 +316,9 @@ class MultiModalRegistry:
token_ids = dummy_data.prompt_token_ids token_ids = dummy_data.prompt_token_ids
if len(token_ids) < seq_len: if len(token_ids) < seq_len:
logger.warning_once( logger.warning_once(
f"Expected at least {seq_len} dummy encoder tokens for " "Expected at least %d dummy encoder tokens for profiling, but found %d tokens instead.", # noqa: E501
f"profiling, but found {len(token_ids)} tokens instead.") seq_len,
len(token_ids),
)
return dummy_data return dummy_data
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment