Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
edb359cc
Unverified
Commit
edb359cc
authored
Feb 07, 2026
by
Cyrus Leung
Committed by
GitHub
Feb 07, 2026
Browse files
[Renderer] Define `render_cmpl` and `render_chat` (#34039)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
6ed5eda3
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
150 additions
and
75 deletions
+150
-75
tests/models/multimodal/generation/test_phi4mm.py
tests/models/multimodal/generation/test_phi4mm.py
+0
-2
tests/models/multimodal/generation/test_qwen2_vl.py
tests/models/multimodal/generation/test_qwen2_vl.py
+0
-1
tests/models/quantization/test_awq.py
tests/models/quantization/test_awq.py
+0
-2
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+14
-28
vllm/entrypoints/openai/engine/serving.py
vllm/entrypoints/openai/engine/serving.py
+19
-32
vllm/renderers/inputs/preprocess.py
vllm/renderers/inputs/preprocess.py
+10
-10
vllm/renderers/protocol.py
vllm/renderers/protocol.py
+107
-0
No files found.
tests/models/multimodal/generation/test_phi4mm.py
View file @
edb359cc
...
...
@@ -170,8 +170,6 @@ def run_test(
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
# No image
[],
# Single-scale
[
1.0
],
# Single-scale, batched
...
...
tests/models/multimodal/generation/test_qwen2_vl.py
View file @
edb359cc
...
...
@@ -375,7 +375,6 @@ def test_qwen2_vl_image_embeddings_input(
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
[],
# Single-scale
[
0.5
],
# Single-scale, batched
...
...
tests/models/quantization/test_awq.py
View file @
edb359cc
...
...
@@ -100,8 +100,6 @@ def run_awq_test(
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
# No image
[],
# Single-scale
[
1.0
],
# Single-scale, batched
...
...
vllm/entrypoints/llm.py
View file @
edb359cc
...
...
@@ -73,7 +73,7 @@ from vllm.outputs import (
from
vllm.platforms
import
current_platform
from
vllm.pooling_params
import
PoolingParams
from
vllm.renderers
import
ChatParams
,
TokenizeParams
,
merge_kwargs
from
vllm.renderers.inputs
import
DictPrompt
,
SingletonDictPrompt
,
TokPrompt
from
vllm.renderers.inputs
import
DictPrompt
,
TokPrompt
from
vllm.renderers.inputs.preprocess
import
(
conversation_to_seq
,
extract_prompt_components
,
...
...
@@ -805,7 +805,7 @@ class LLM:
self
,
prompts
:
Sequence
[
PromptType
],
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
list
[
DictPrompt
|
TokPrompt
]:
)
->
Sequence
[
DictPrompt
|
TokPrompt
]:
"""
Convert prompt inputs from LLM APIs (other than [LLM.chat][]) into
a format that can be passed to `_add_request`.
...
...
@@ -819,22 +819,12 @@ class LLM:
renderer
=
self
.
llm_engine
.
renderer
model_config
=
self
.
model_config
parsed_prompts
=
[
parse_model_prompt
(
model_config
,
prompt
)
for
prompt
in
prompts
]
tok_params
=
self
.
_get_cmpl_tok_params
(
tokenization_kwargs
)
engine_prompts
=
list
[
DictPrompt
|
TokPrompt
]()
for
prompt
in
prompts
:
parsed_prompt
=
parse_model_prompt
(
model_config
,
prompt
)
in_prompt
=
renderer
.
render_prompt
(
parsed_prompt
)
# Some MM models have non-default `add_special_tokens`
# TODO: Move multi-modal processor into tokenization
engine_prompts
.
append
(
in_prompt
if
model_config
.
is_multimodal_model
else
renderer
.
tokenize_prompt
(
in_prompt
,
tok_params
)
)
return
engine_prompts
return
renderer
.
render_cmpl
(
parsed_prompts
,
tok_params
)
def
_get_chat_tok_params
(
self
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
):
model_config
=
self
.
model_config
...
...
@@ -857,7 +847,7 @@ class LLM:
tools
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
list
[
DictPrompt
|
TokPrompt
]:
)
->
Sequence
[
TokPrompt
]:
"""
Convert a list of conversations into prompts so that they can then
be used as input for other LLM APIs.
...
...
@@ -885,16 +875,12 @@ class LLM:
)
tok_params
=
self
.
_get_chat_tok_params
(
tokenization_kwargs
)
engine_prompts
=
list
[
DictPrompt
|
TokPrompt
]()
for
conversation
in
conversations
:
_
,
in_prompt
=
renderer
.
render_messages
(
conversation
,
chat_params
)
if
mm_processor_kwargs
is
not
None
:
target_prompt
:
SingletonDictPrompt
=
in_prompt
.
get
(
# type: ignore
"encoder_prompt"
,
in_prompt
_
,
engine_prompts
=
renderer
.
render_chat
(
conversations
,
chat_params
,
tok_params
,
prompt_extras
=
{
"mm_processor_kwargs"
:
mm_processor_kwargs
},
)
target_prompt
[
"mm_processor_kwargs"
]
=
mm_processor_kwargs
# type: ignore
engine_prompts
.
append
(
renderer
.
tokenize_prompt
(
in_prompt
,
tok_params
))
return
engine_prompts
...
...
@@ -1743,7 +1729,7 @@ class LLM:
# TODO: Remove this after deprecating `param.truncate_prompt_tokens`
# Then, move the code from the `else` block to the top and let
# `self._preprocess_completion` handle prompt normalization
engine_prompts
=
[
engine_prompts
:
Sequence
[
DictPrompt
|
TokPrompt
]
=
[
engine_prompt
for
prompt
,
param
in
zip
(
seq_prompts
,
seq_params
)
for
engine_prompt
in
self
.
_preprocess_completion
(
...
...
vllm/entrypoints/openai/engine/serving.py
View file @
edb359cc
...
...
@@ -106,7 +106,6 @@ from vllm.pooling_params import PoolingParams
from
vllm.renderers
import
ChatParams
,
TokenizeParams
,
merge_kwargs
from
vllm.renderers.inputs
import
TokPrompt
from
vllm.renderers.inputs.preprocess
import
(
SingletonDictPrompt
,
extract_prompt_components
,
extract_prompt_len
,
parse_model_prompt
,
...
...
@@ -963,8 +962,6 @@ class OpenAIServing:
renderer
=
self
.
renderer
model_config
=
self
.
model_config
tok_params
=
request
.
build_tok_params
(
model_config
)
prompts
=
list
[
SingletonPrompt
|
bytes
]()
if
prompt_embeds
is
not
None
:
# embeds take higher priority
prompts
.
extend
(
prompt_to_seq
(
prompt_embeds
))
...
...
@@ -979,22 +976,17 @@ class OpenAIServing:
)
for
prompt
in
prompts
]
in_prompts
=
await
renderer
.
render_prompts_async
(
parsed_prompts
)
tok_params
=
request
.
build_tok_params
(
model_config
)
extra_items
=
{
return
await
renderer
.
render_cmpl_async
(
parsed_prompts
,
tok_params
,
prompt_extras
=
{
k
:
v
for
k
in
(
"mm_processor_kwargs"
,
"cache_salt"
)
if
(
v
:
=
getattr
(
request
,
k
,
None
))
is
not
None
}
for
in_prompt
in
in_prompts
:
target_prompt
:
SingletonDictPrompt
=
in_prompt
.
get
(
# type: ignore
"encoder_prompt"
,
in_prompt
},
)
target_prompt
.
update
(
extra_items
)
# type: ignore
engine_prompts
=
await
renderer
.
tokenize_prompts_async
(
in_prompts
,
tok_params
)
return
engine_prompts
async
def
_preprocess_chat
(
self
,
...
...
@@ -1023,21 +1015,16 @@ class OpenAIServing:
default_template
,
default_template_content_format
).
with_defaults
(
default_template_kwargs
)
conversation
,
in_prompt
=
await
renderer
.
render_messages_async
(
messages
,
chat_params
)
target_prompt
:
SingletonDictPrompt
=
in_prompt
.
get
(
# type: ignore
"encoder_prompt"
,
in_prompt
)
extra_items
=
{
(
conversation
,),
(
engine_prompt
,)
=
await
renderer
.
render_chat_async
(
[
messages
],
chat_params
,
tok_params
,
prompt_extras
=
{
k
:
v
for
k
in
(
"mm_processor_kwargs"
,
"cache_salt"
)
if
(
v
:
=
getattr
(
request
,
k
,
None
))
is
not
None
}
target_prompt
.
update
(
extra_items
)
# type: ignore
engine_prompt
=
await
renderer
.
tokenize_prompt_async
(
target_prompt
,
tok_params
)
},
)
# tool parsing is done only if a tool_parser has been set and if
# tool_choice is not "none" (if tool_choice is "none" but a tool_parser
...
...
vllm/renderers/inputs/preprocess.py
View file @
edb359cc
...
...
@@ -225,16 +225,20 @@ class PromptComponents(NamedTuple):
embeds
:
"torch.Tensor | None"
=
None
def
extract_prompt_components
(
model_config
:
"ModelConfig"
,
prompt
:
object
,
)
->
PromptComponents
:
target_prompt
=
(
def extract_target_prompt(model_config: "ModelConfig", prompt: object):
    """
    Select the part of `prompt` that the other helpers in this module
    inspect.

    For encoder-decoder models (`model_config.is_encoder_decoder`), the
    prompt is parsed with `parse_enc_dec_prompt` and its `"encoder_prompt"`
    entry is returned. Otherwise, the result of `parse_dec_only_prompt`
    is returned as-is.
    """
    if model_config.is_encoder_decoder:
        return parse_enc_dec_prompt(prompt)["encoder_prompt"]

    return parse_dec_only_prompt(prompt)
def
extract_prompt_components
(
model_config
:
"ModelConfig"
,
prompt
:
object
,
)
->
PromptComponents
:
target_prompt
=
extract_target_prompt
(
model_config
,
prompt
)
return
PromptComponents
(
text
=
target_prompt
.
get
(
"prompt"
),
token_ids
=
target_prompt
.
get
(
"prompt_token_ids"
),
# type: ignore[arg-type]
...
...
@@ -243,11 +247,7 @@ def extract_prompt_components(
def
extract_prompt_len
(
model_config
:
"ModelConfig"
,
prompt
:
object
):
target_prompt
=
(
parse_enc_dec_prompt
(
prompt
)[
"encoder_prompt"
]
if
model_config
.
is_encoder_decoder
else
parse_dec_only_prompt
(
prompt
)
)
target_prompt
=
extract_target_prompt
(
model_config
,
prompt
)
return
length_from_prompt_token_ids_or_embeds
(
target_prompt
.
get
(
"prompt_token_ids"
),
# type: ignore[arg-type]
...
...
vllm/renderers/protocol.py
View file @
edb359cc
...
...
@@ -16,6 +16,7 @@ from .inputs import (
EncoderDecoderTokPrompt
,
TokPrompt
,
)
from
.inputs.preprocess
import
extract_target_prompt
from
.params
import
ChatParams
,
TokenizeParams
if
TYPE_CHECKING
:
...
...
@@ -277,3 +278,109 @@ class BaseRenderer(ABC):
return
await
asyncio
.
gather
(
*
(
self
.
tokenize_prompt_async
(
prompt
,
params
)
for
prompt
in
prompts
)
)
# Step 3: Add extra keys to the prompts
def _apply_prompt_extras(
    self,
    prompts: Sequence[DictPrompt | TokPrompt],
    prompt_extras: dict[str, Any] | None,
):
    """
    Merge `prompt_extras` into the target prompt of every item in `prompts`.

    The target prompt is whatever `extract_target_prompt` returns for this
    renderer's config. The update happens in place via `dict.update`;
    nothing is returned. A `None` or empty `prompt_extras` is a no-op.
    """
    if prompt_extras:
        for item in prompts:
            target = extract_target_prompt(self.config, item)
            target.update(prompt_extras)  # type: ignore[arg-type]
# Top-level methods
def render_cmpl(
    self,
    prompts: Sequence[DictPrompt | bytes],
    tok_params: TokenizeParams,
    *,
    prompt_extras: dict[str, Any] | None = None,
):
    """
    Render completion-style prompts, tokenize them (except for multi-modal
    models), and attach `prompt_extras` to each resulting prompt.

    Args:
        prompts: Parsed prompts to pass to `render_prompts`.
        tok_params: Parameters forwarded to `tokenize_prompts`.
        prompt_extras: Optional extra keys merged into each prompt's
            target prompt via `_apply_prompt_extras`.

    Returns:
        The output of `tokenize_prompts`, or the untokenized output of
        `render_prompts` when `self.config.is_multimodal_model` is truthy.
    """
    rendered = self.render_prompts(prompts)

    if not self.config.is_multimodal_model:
        tokenized = self.tokenize_prompts(rendered, tok_params)
        self._apply_prompt_extras(tokenized, prompt_extras)

        # TODO: Apply multi-modal processor
        return tokenized

    # NOTE: Some MM models have non-default `add_special_tokens`
    # so we handle tokenization in multi-modal processor
    self._apply_prompt_extras(rendered, prompt_extras)
    return rendered
async def render_cmpl_async(
    self,
    prompts: Sequence[DictPrompt | bytes],
    tok_params: TokenizeParams,
    *,
    prompt_extras: dict[str, Any] | None = None,
):
    """
    Asynchronously render and tokenize completion-style prompts, then
    attach `prompt_extras` to each tokenized prompt.

    Args:
        prompts: Parsed prompts to pass to `render_prompts_async`.
        tok_params: Parameters forwarded to `tokenize_prompts_async`.
        prompt_extras: Optional extra keys merged into each prompt's
            target prompt via `_apply_prompt_extras`.

    Returns:
        The output of `tokenize_prompts_async`.
    """
    rendered = await self.render_prompts_async(prompts)

    # NOTE: MM data cannot be passed to online Completions API
    # so we don't have the special case that is in the offline version
    tokenized = await self.tokenize_prompts_async(rendered, tok_params)
    self._apply_prompt_extras(tokenized, prompt_extras)

    # TODO: Apply multi-modal processor
    return tokenized
def render_chat(
    self,
    conversations: Sequence[list["ChatCompletionMessageParam"]],
    chat_params: ChatParams,
    tok_params: TokenizeParams,
    *,
    prompt_extras: dict[str, Any] | None = None,
):
    """
    Render each conversation into a prompt, tokenize the prompts, and
    attach `prompt_extras` to each tokenized prompt.

    Args:
        conversations: One message list per conversation; each is passed
            to `render_messages` together with `chat_params`.
        chat_params: Parameters forwarded to `render_messages`.
        tok_params: Parameters forwarded to `tokenize_prompts`.
        prompt_extras: Optional extra keys merged into each prompt's
            target prompt via `_apply_prompt_extras`.

    Returns:
        A 2-tuple `(out_conversations, tok_prompts)`: the list of
        conversations returned by `render_messages` (in input order) and
        the output of `tokenize_prompts`.
    """
    # Each `render_messages` call yields a `(conversation, prompt)` pair
    pairs = [
        self.render_messages(messages, chat_params) for messages in conversations
    ]
    out_conversations: list[list["ConversationMessage"]] = [
        conv for conv, _ in pairs
    ]
    dict_prompts: list[DictPrompt] = [prompt for _, prompt in pairs]

    tok_prompts = self.tokenize_prompts(dict_prompts, tok_params)
    self._apply_prompt_extras(tok_prompts, prompt_extras)

    # TODO: Apply multi-modal processor
    return out_conversations, tok_prompts
async def render_chat_async(
    self,
    conversations: Sequence[list["ChatCompletionMessageParam"]],
    chat_params: ChatParams,
    tok_params: TokenizeParams,
    *,
    prompt_extras: dict[str, Any] | None = None,
):
    """
    Asynchronously render each conversation into a prompt, tokenize the
    prompts, and attach `prompt_extras` to each tokenized prompt.

    Args:
        conversations: One message list per conversation; each is passed
            to `render_messages_async` together with `chat_params`.
        chat_params: Parameters forwarded to `render_messages_async`.
        tok_params: Parameters forwarded to `tokenize_prompts_async`.
        prompt_extras: Optional extra keys merged into each prompt's
            target prompt via `_apply_prompt_extras`.

    Returns:
        A 2-tuple `(out_conversations, tok_prompts)`: the list of
        conversations returned by `render_messages_async` (in input order)
        and the output of `tokenize_prompts_async`.
    """
    # All render coroutines are created up front and awaited together;
    # `asyncio.gather` returns results in the order they were passed in.
    pairs = await asyncio.gather(
        *(
            self.render_messages_async(messages, chat_params)
            for messages in conversations
        )
    )
    out_conversations: list[list["ConversationMessage"]] = [
        conv for conv, _ in pairs
    ]
    dict_prompts: list[DictPrompt] = [prompt for _, prompt in pairs]

    tok_prompts = await self.tokenize_prompts_async(dict_prompts, tok_params)
    self._apply_prompt_extras(tok_prompts, prompt_extras)

    # TODO: Apply multi-modal processor
    return out_conversations, tok_prompts
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment