Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a42d2df7
Unverified
Commit
a42d2df7
authored
Oct 04, 2025
by
Isotr0py
Committed by
GitHub
Oct 04, 2025
Browse files
[Frontend] Cache chat template kwargs resolution (#26227)
Signed-off-by:
Isotr0py
<
mozf@mail2.sysu.edu.cn
>
parent
5c057e06
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
81 additions
and
18 deletions
+81
-18
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+17
-7
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+3
-0
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+9
-11
vllm/entrypoints/openai/serving_embedding.py
vllm/entrypoints/openai/serving_embedding.py
+16
-0
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+16
-0
vllm/entrypoints/openai/serving_pooling.py
vllm/entrypoints/openai/serving_pooling.py
+10
-0
vllm/entrypoints/openai/serving_tokenization.py
vllm/entrypoints/openai/serving_tokenization.py
+10
-0
No files found.
vllm/entrypoints/chat_utils.py
View file @
a42d2df7
...
...
@@ -1572,6 +1572,22 @@ class AssistantTracker(jinja2.ext.Extension):
return
call_block
.
set_lineno
(
lineno
)
def _resolve_chat_template_kwargs(
    chat_template: str,
):
    """Return the variable names ``chat_template`` uses but never declares.

    The template is only parsed, not rendered. Parsing happens in an
    immutable sandboxed Jinja2 environment set up with ``trim_blocks``,
    ``lstrip_blocks`` and the ``AssistantTracker`` / ``loopcontrols``
    extensions, and the resulting AST is handed to
    ``jinja2.meta.find_undeclared_variables``.

    Args:
        chat_template: Jinja2 source text of the chat template.

    Returns:
        Whatever ``jinja2.meta.find_undeclared_variables`` reports for the
        parsed template.
    """
    sandbox = jinja2.sandbox.ImmutableSandboxedEnvironment(
        extensions=[AssistantTracker, jinja2.ext.loopcontrols],
        lstrip_blocks=True,
        trim_blocks=True,
    )
    return jinja2.meta.find_undeclared_variables(sandbox.parse(chat_template))


# Memoized variant keyed on the template string, so an already-seen template
# is not parsed again. Applying ``lru_cache`` directly to the function uses
# its default bounded size.
# NOTE(review): the cached return value is shared between callers; treat it
# as read-only.
_cached_resolve_chat_template_kwargs = lru_cache(_resolve_chat_template_kwargs)
def
resolve_chat_template_kwargs
(
tokenizer
:
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
],
chat_template
:
str
,
...
...
@@ -1582,13 +1598,7 @@ def resolve_chat_template_kwargs(
if
supports_kw
(
tokenizer
.
apply_chat_template
,
k
,
allow_var_kwargs
=
False
)
}
env
=
jinja2
.
sandbox
.
ImmutableSandboxedEnvironment
(
trim_blocks
=
True
,
lstrip_blocks
=
True
,
extensions
=
[
AssistantTracker
,
jinja2
.
ext
.
loopcontrols
],
)
parsed_content
=
env
.
parse
(
chat_template
)
template_vars
=
jinja2
.
meta
.
find_undeclared_variables
(
parsed_content
)
template_vars
=
_cached_resolve_chat_template_kwargs
(
chat_template
)
# We exclude chat_template from kwargs here, because
# chat template has been already resolved at this stage
...
...
vllm/entrypoints/openai/api_server.py
View file @
a42d2df7
...
...
@@ -1745,6 +1745,7 @@ async def init_app_state(
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
trust_request_chat_template
=
args
.
trust_request_chat_template
,
log_error_stack
=
args
.
log_error_stack
,
)
if
"encode"
in
supported_tasks
else
None
state
.
openai_serving_embedding
=
OpenAIServingEmbedding
(
...
...
@@ -1754,6 +1755,7 @@ async def init_app_state(
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
trust_request_chat_template
=
args
.
trust_request_chat_template
,
log_error_stack
=
args
.
log_error_stack
,
)
if
"embed"
in
supported_tasks
else
None
state
.
openai_serving_classification
=
ServingClassification
(
...
...
@@ -1777,6 +1779,7 @@ async def init_app_state(
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
trust_request_chat_template
=
args
.
trust_request_chat_template
,
log_error_stack
=
args
.
log_error_stack
,
)
state
.
openai_serving_transcription
=
OpenAIServingTranscription
(
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
a42d2df7
...
...
@@ -222,16 +222,14 @@ class OpenAIServingChat(OpenAIServing):
if
not
self
.
use_harmony
:
# Common case.
request_chat_template
=
request
.
chat_template
chat_template_kwargs
=
request
.
chat_template_kwargs
if
not
self
.
trust_request_chat_template
and
(
request_chat_template
is
not
None
or
(
chat_template_kwargs
and
chat_template_kwargs
.
get
(
"chat_template"
)
is
not
None
)):
return
self
.
create_error_response
(
"Chat template is passed with request, but "
"--trust-request-chat-template is not set. "
"Refused request with untrusted chat template."
)
error_check_ret
=
self
.
_validate_chat_template
(
request_chat_template
=
request
.
chat_template
,
chat_template_kwargs
=
request
.
chat_template_kwargs
,
trust_request_chat_template
=
self
.
trust_request_chat_template
,
)
if
error_check_ret
is
not
None
:
return
error_check_ret
(
conversation
,
request_prompts
,
...
...
@@ -240,7 +238,7 @@ class OpenAIServingChat(OpenAIServing):
request
,
tokenizer
,
request
.
messages
,
chat_template
=
request
_
chat_template
or
self
.
chat_template
,
chat_template
=
request
.
chat_template
or
self
.
chat_template
,
chat_template_content_format
=
self
.
chat_template_content_format
,
add_generation_prompt
=
request
.
add_generation_prompt
,
...
...
vllm/entrypoints/openai/serving_embedding.py
View file @
a42d2df7
...
...
@@ -576,6 +576,7 @@ class OpenAIServingEmbedding(EmbeddingMixin):
request_logger
:
Optional
[
RequestLogger
],
chat_template
:
Optional
[
str
],
chat_template_content_format
:
ChatTemplateContentFormatOption
,
trust_request_chat_template
:
bool
=
False
,
log_error_stack
:
bool
=
False
,
)
->
None
:
super
().
__init__
(
engine_client
=
engine_client
,
...
...
@@ -586,6 +587,7 @@ class OpenAIServingEmbedding(EmbeddingMixin):
self
.
chat_template
=
chat_template
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
self
.
trust_request_chat_template
=
trust_request_chat_template
async
def
create_embedding
(
self
,
...
...
@@ -629,3 +631,17 @@ class OpenAIServingEmbedding(EmbeddingMixin):
return
self
.
create_error_response
(
str
(
e
))
return
pooling_params
async def _preprocess(
    self,
    ctx: ServeContext,
) -> Optional[ErrorResponse]:
    """Check the request's chat template, then run the inherited step.

    For an ``EmbeddingChatRequest``, the request's ``chat_template`` and
    ``chat_template_kwargs`` are passed to ``self._validate_chat_template``
    together with ``self.trust_request_chat_template``. A non-``None``
    result from that check is returned as-is and the inherited
    preprocessing is not run.

    Args:
        ctx: Serving context whose ``request`` attribute is inspected.

    Returns:
        The validation result when it is not ``None``; otherwise the
        awaited result of the parent class's ``_preprocess``.
    """
    request = ctx.request
    if isinstance(request, EmbeddingChatRequest):
        rejection = self._validate_chat_template(
            request_chat_template=request.chat_template,
            chat_template_kwargs=request.chat_template_kwargs,
            trust_request_chat_template=self.trust_request_chat_template,
        )
        if rejection is not None:
            # Stop here: the template check produced a response to return.
            return rejection

    return await super()._preprocess(ctx)
vllm/entrypoints/openai/serving_engine.py
View file @
a42d2df7
...
...
@@ -751,6 +751,22 @@ class OpenAIServing:
tokenizer
=
tokenizer
,
)
def _validate_chat_template(
    self,
    request_chat_template: Optional[str],
    chat_template_kwargs: Optional[dict[str, Any]],
    trust_request_chat_template: bool,
) -> Optional[ErrorResponse]:
    """Refuse a request-supplied chat template unless it is trusted.

    A template counts as request-supplied when ``request_chat_template``
    is not ``None``, or when ``chat_template_kwargs`` is non-empty and its
    ``"chat_template"`` entry is not ``None``.

    Args:
        request_chat_template: Chat template given on the request, if any.
        chat_template_kwargs: Extra template kwargs given on the request,
            if any.
        trust_request_chat_template: When true, no check is performed.

    Returns:
        ``None`` when the request is acceptable; otherwise the value
        produced by ``self.create_error_response``.
    """
    if trust_request_chat_template:
        return None

    # Only consult the kwargs when no template was passed directly; this
    # keeps the original short-circuit evaluation order.
    if request_chat_template is None:
        if not chat_template_kwargs:
            return None
        if chat_template_kwargs.get("chat_template") is None:
            return None

    return self.create_error_response(
        "Chat template is passed with request, but "
        "--trust-request-chat-template is not set. "
        "Refused request with untrusted chat template.")
async
def
_preprocess_chat
(
self
,
request
:
Union
[
ChatLikeRequest
,
ResponsesRequest
],
...
...
vllm/entrypoints/openai/serving_pooling.py
View file @
a42d2df7
...
...
@@ -65,6 +65,7 @@ class OpenAIServingPooling(OpenAIServing):
request_logger
:
Optional
[
RequestLogger
],
chat_template
:
Optional
[
str
],
chat_template_content_format
:
ChatTemplateContentFormatOption
,
trust_request_chat_template
:
bool
=
False
,
log_error_stack
:
bool
=
False
,
)
->
None
:
super
().
__init__
(
engine_client
=
engine_client
,
...
...
@@ -75,6 +76,7 @@ class OpenAIServingPooling(OpenAIServing):
self
.
chat_template
=
chat_template
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
self
.
trust_request_chat_template
=
trust_request_chat_template
io_processor_plugin
=
self
.
model_config
.
io_processor_plugin
self
.
io_processor
=
get_io_processor
(
vllm_config
,
io_processor_plugin
)
...
...
@@ -129,6 +131,14 @@ class OpenAIServingPooling(OpenAIServing):
prompt
=
validated_prompt
,
request_id
=
request_id
)
elif
isinstance
(
request
,
PoolingChatRequest
):
error_check_ret
=
self
.
_validate_chat_template
(
request_chat_template
=
request
.
chat_template
,
chat_template_kwargs
=
request
.
chat_template_kwargs
,
trust_request_chat_template
=
self
.
trust_request_chat_template
,
)
if
error_check_ret
is
not
None
:
return
error_check_ret
(
_
,
_
,
...
...
vllm/entrypoints/openai/serving_tokenization.py
View file @
a42d2df7
...
...
@@ -40,6 +40,7 @@ class OpenAIServingTokenization(OpenAIServing):
request_logger
:
Optional
[
RequestLogger
],
chat_template
:
Optional
[
str
],
chat_template_content_format
:
ChatTemplateContentFormatOption
,
trust_request_chat_template
:
bool
=
False
,
log_error_stack
:
bool
=
False
,
)
->
None
:
super
().
__init__
(
engine_client
=
engine_client
,
...
...
@@ -50,6 +51,7 @@ class OpenAIServingTokenization(OpenAIServing):
self
.
chat_template
=
chat_template
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
self
.
trust_request_chat_template
=
trust_request_chat_template
async
def
create_tokenize
(
self
,
...
...
@@ -71,6 +73,14 @@ class OpenAIServingTokenization(OpenAIServing):
if
isinstance
(
request
,
TokenizeChatRequest
):
tool_dicts
=
(
None
if
request
.
tools
is
None
else
[
tool
.
model_dump
()
for
tool
in
request
.
tools
])
error_check_ret
=
self
.
_validate_chat_template
(
request_chat_template
=
request
.
chat_template
,
chat_template_kwargs
=
request
.
chat_template_kwargs
,
trust_request_chat_template
=
self
.
trust_request_chat_template
,
)
if
error_check_ret
is
not
None
:
return
error_check_ret
(
_
,
_
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment