Unverified commit 458c1a4b, authored by Nick Hill and committed by GitHub
Browse files

[Frontend] Reduce chat template warmup logging levels (#37062)


Signed-off-by: Nick Hill <nickhill123@gmail.com>
parent 821fde2d
...@@ -179,17 +179,17 @@ class BaseRenderer(ABC, Generic[_T]): ...@@ -179,17 +179,17 @@ class BaseRenderer(ABC, Generic[_T]):
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
try: try:
logger.info("Warming up chat template processing...") logger.debug("Warming up chat template processing...")
start_time = time.perf_counter() start_time = time.perf_counter()
self.render_chat([[{"role": "user", "content": "warmup"}]], chat_params) self.render_chat([[{"role": "user", "content": "warmup"}]], chat_params)
elapsed = time.perf_counter() - start_time elapsed = time.perf_counter() - start_time
logger.info("Chat template warmup completed in %.3fs", elapsed) logger.debug("Chat template warmup completed in %.3fs", elapsed)
except ChatTemplateResolutionError: except ChatTemplateResolutionError:
logger.info("This model does not support chat template.") logger.debug("This model does not support chat template.")
except Exception: except Exception:
logger.exception("Chat template warmup failed") logger.warning("Chat template warmup failed", exc_info=True)
if self.mm_processor: if self.mm_processor:
from vllm.multimodal.processing import TimingContext from vllm.multimodal.processing import TimingContext
...@@ -200,7 +200,7 @@ class BaseRenderer(ABC, Generic[_T]): ...@@ -200,7 +200,7 @@ class BaseRenderer(ABC, Generic[_T]):
mm_limits = processor.info.allowed_mm_limits mm_limits = processor.info.allowed_mm_limits
try: try:
logger.info("Warming up multi-modal processing...") logger.debug("Warming up multi-modal processing...")
start_time = time.perf_counter() start_time = time.perf_counter()
processor_inputs = processor.dummy_inputs.get_dummy_processor_inputs( processor_inputs = processor.dummy_inputs.get_dummy_processor_inputs(
...@@ -209,14 +209,13 @@ class BaseRenderer(ABC, Generic[_T]): ...@@ -209,14 +209,13 @@ class BaseRenderer(ABC, Generic[_T]):
mm_options=mm_config.limit_per_prompt, mm_options=mm_config.limit_per_prompt,
) )
_ = processor.apply( _ = processor.apply(
processor_inputs, processor_inputs, timing_ctx=TimingContext(enabled=False)
timing_ctx=TimingContext(enabled=False),
) )
elapsed = time.perf_counter() - start_time elapsed = time.perf_counter() - start_time
logger.info("Multi-modal warmup completed in %.3fs", elapsed) logger.info("Multi-modal warmup completed in %.3fs", elapsed)
except Exception: except Exception:
logger.exception("Multi-modal warmup failed") logger.warning("Multi-modal warmup failed")
finally: finally:
self.clear_mm_cache() self.clear_mm_cache()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment