feat(api): Eager chat template warmup to eliminate first-request latency (#30700)

Signed-off-by: Nathan Price <nathan@abridge.com>

feat(api): Eager chat template warmup to eliminate first-request latency (#30700)
Signed-off-by: Nathan Price <nathan@abridge.com>
05a83dc6 · Nathan Price · GitHub · e3fc374a · 05a83dc6 · 05a83dc6
Unverified Commit 05a83dc6 authored Dec 17, 2025 by Nathan Price Committed by GitHub Dec 18, 2025
Show whitespace changes
Inline Side-by-side

Showing with 52 additions and 0 deletions

vllm/entrypoints/openai/api_server.py vllm/entrypoints/openai/api_server.py +3 -0

vllm/entrypoints/openai/serving_chat.py vllm/entrypoints/openai/serving_chat.py +49 -0

No files found.
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1082,6 +1082,9 @@ async def init_app_state(
        if "generate" in supported_tasks
        else None
    )
+    # Warm up chat template processing to avoid first-request latency
+    if state.openai_serving_chat is not None:
+        await state.openai_serving_chat.warmup()
    state.openai_serving_completion = (
        OpenAIServingCompletion(
            engine_client,

--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -162,6 +162,55 @@ class OpenAIServingChat(OpenAIServing):
        self.supports_code_interpreter = False
        self.python_tool = None
+    async def warmup(self) -> None:
+        """
+        Warm up the chat template processing to avoid first-request latency.
+        This method triggers Jinja2 template compilation and content format
+        detection that would otherwise happen on the first real request,
+        causing increased latency on the first request.
+        """
+        logger.info("Warming up chat template processing...")
+        start_time = time.perf_counter()
+        try:
+            # Get the tokenizer from the engine
+            tokenizer = await self.engine_client.get_tokenizer()
+            # Create a minimal dummy request
+            dummy_request = ChatCompletionRequest(
+                messages=[{"role": "user", "content": "warmup"}],
+                model=None,
+                max_completion_tokens=1,
+            )
+            # Call _preprocess_chat to trigger template compilation
+            # This forces:
+            # 1. Chat template content format detection
+            # 2. Jinja2 template compilation
+            # 3. Tokenizer initialization for chat
+            await self._preprocess_chat(
+                dummy_request,
+                tokenizer,
+                dummy_request.messages,
+                chat_template=self.chat_template,
+                chat_template_content_format=self.chat_template_content_format,
+                add_generation_prompt=True,
+                continue_final_message=False,
+                tool_dicts=None,
+                documents=None,
+                chat_template_kwargs=None,
+                tool_parser=None,
+                add_special_tokens=False,
+            )
+            elapsed = (time.perf_counter() - start_time) * 1000
+            logger.info("Chat template warmup completed in %.1fms", elapsed)
+        except Exception:
+            # Log but don't fail server startup if warmup fails
+            logger.exception("Chat template warmup failed")
    async def create_chat_completion(
        self,
        request: ChatCompletionRequest,