Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
05a83dc6
Unverified
Commit
05a83dc6
authored
Dec 17, 2025
by
Nathan Price
Committed by
GitHub
Dec 18, 2025
Browse files
feat(api): Eager chat template warmup to eliminate first-request latency (#30700)
Signed-off-by:
Nathan Price
<
nathan@abridge.com
>
parent
e3fc374a
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
0 deletions
+52
-0
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+3
-0
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+49
-0
No files found.
vllm/entrypoints/openai/api_server.py
View file @
05a83dc6
...
@@ -1082,6 +1082,9 @@ async def init_app_state(
...
@@ -1082,6 +1082,9 @@ async def init_app_state(
if
"generate"
in
supported_tasks
if
"generate"
in
supported_tasks
else
None
else
None
)
)
# Warm up chat template processing to avoid first-request latency
if
state
.
openai_serving_chat
is
not
None
:
await
state
.
openai_serving_chat
.
warmup
()
state
.
openai_serving_completion
=
(
state
.
openai_serving_completion
=
(
OpenAIServingCompletion
(
OpenAIServingCompletion
(
engine_client
,
engine_client
,
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
05a83dc6
...
@@ -162,6 +162,55 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -162,6 +162,55 @@ class OpenAIServingChat(OpenAIServing):
self
.
supports_code_interpreter
=
False
self
.
supports_code_interpreter
=
False
self
.
python_tool
=
None
self
.
python_tool
=
None
async def warmup(self) -> None:
    """Run one throwaway chat preprocessing pass at startup.

    A minimal single-message request is built and pushed through
    ``_preprocess_chat`` so that one-time chat template work (content
    format detection and Jinja2 template compilation, per the original
    author's notes) is paid here instead of on the first real request.

    This is best-effort: any exception raised along the way is logged
    and swallowed, so a failed warmup never blocks server startup.
    """
    logger.info("Warming up chat template processing...")
    began_at = time.perf_counter()

    try:
        # The tokenizer is owned by the engine; fetch it first.
        engine_tokenizer = await self.engine_client.get_tokenizer()

        # Smallest request that still goes through the chat template path.
        probe_request = ChatCompletionRequest(
            messages=[{"role": "user", "content": "warmup"}],
            model=None,
            max_completion_tokens=1,
        )

        # The return value is intentionally discarded: only the work
        # performed inside _preprocess_chat matters here.
        await self._preprocess_chat(
            probe_request,
            engine_tokenizer,
            probe_request.messages,
            chat_template=self.chat_template,
            chat_template_content_format=self.chat_template_content_format,
            add_generation_prompt=True,
            continue_final_message=False,
            tool_dicts=None,
            documents=None,
            chat_template_kwargs=None,
            tool_parser=None,
            add_special_tokens=False,
        )

        # perf_counter() returns seconds; report milliseconds.
        elapsed_ms = 1000 * (time.perf_counter() - began_at)
        logger.info("Chat template warmup completed in %.1fms", elapsed_ms)
    except Exception:
        # Deliberately broad: warmup is an optimization, not a requirement.
        logger.exception("Chat template warmup failed")
async
def
create_chat_completion
(
async
def
create_chat_completion
(
self
,
self
,
request
:
ChatCompletionRequest
,
request
:
ChatCompletionRequest
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment