Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
574fe752
Unverified
Commit
574fe752
authored
Feb 17, 2026
by
Cyrus Leung
Committed by
GitHub
Feb 17, 2026
Browse files
[Renderer] Move InputPreprocessor into Renderer (2/2) (#34560)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
c61a98f5
Changes
31
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
559 additions
and
493 deletions
+559
-493
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+6
-9
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+3
-2
tests/renderers/test_process_multi_modal_uuids.py
tests/renderers/test_process_multi_modal_uuids.py
+165
-0
tests/samplers/test_beam_search.py
tests/samplers/test_beam_search.py
+0
-2
vllm/beam_search.py
vllm/beam_search.py
+29
-8
vllm/engine/protocol.py
vllm/engine/protocol.py
+6
-6
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+203
-221
vllm/entrypoints/openai/chat_completion/serving.py
vllm/entrypoints/openai/chat_completion/serving.py
+12
-25
vllm/entrypoints/openai/completion/serving.py
vllm/entrypoints/openai/completion/serving.py
+4
-22
vllm/entrypoints/openai/engine/serving.py
vllm/entrypoints/openai/engine/serving.py
+41
-87
vllm/entrypoints/openai/realtime/serving.py
vllm/entrypoints/openai/realtime/serving.py
+8
-2
vllm/entrypoints/openai/responses/context.py
vllm/entrypoints/openai/responses/context.py
+2
-2
vllm/entrypoints/openai/responses/serving.py
vllm/entrypoints/openai/responses/serving.py
+3
-6
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
+49
-56
vllm/entrypoints/pooling/embed/serving.py
vllm/entrypoints/pooling/embed/serving.py
+3
-12
vllm/entrypoints/pooling/pooling/serving.py
vllm/entrypoints/pooling/pooling/serving.py
+2
-7
vllm/entrypoints/pooling/score/serving.py
vllm/entrypoints/pooling/score/serving.py
+13
-7
vllm/entrypoints/serve/disagg/serving.py
vllm/entrypoints/serve/disagg/serving.py
+2
-17
vllm/entrypoints/serve/tokenize/serving.py
vllm/entrypoints/serve/tokenize/serving.py
+2
-2
vllm/inputs/data.py
vllm/inputs/data.py
+6
-0
No files found.
tests/entrypoints/llm/test_chat.py
View file @
574fe752
...
...
@@ -195,18 +195,15 @@ def test_chat_batch_failure_cleanup(llm_for_failure_test):
valid_msg
=
[{
"role"
:
"user"
,
"content"
:
"Hello"
}]
long_text
=
"This is a very long text to test the error "
*
50
invalid_msg
=
[{
"role"
:
"user"
,
"content"
:
long_text
}]
batch_1
=
[
valid_msg
,
valid_msg
,
invalid_msg
,
]
batch_2
=
[
valid_msg
,
valid_msg
,
]
batch_1
=
[
valid_msg
,
valid_msg
,
invalid_msg
]
batch_2
=
[
valid_msg
,
valid_msg
]
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
10
)
with
pytest
.
raises
(
ValueError
,
match
=
"context length is only"
):
llm
.
chat
(
batch_1
,
sampling_params
=
sampling_params
)
assert
llm
.
llm_engine
.
get_num_unfinished_requests
()
==
0
outputs_2
=
llm
.
chat
(
batch_2
,
sampling_params
=
sampling_params
)
assert
len
(
outputs_2
)
==
len
(
batch_2
)
assert
llm
.
llm_engine
.
get_num_unfinished_requests
()
==
0
tests/models/multimodal/processing/test_common.py
View file @
574fe752
...
...
@@ -489,8 +489,9 @@ def _assert_inputs_equal(
if
ignore_mm_keys
is
None
:
ignore_mm_keys
=
set
()
a_rest
=
{
k
:
v
for
k
,
v
in
a
.
items
()
if
k
!=
"mm_kwargs"
}
b_rest
=
{
k
:
v
for
k
,
v
in
b
.
items
()
if
k
!=
"mm_kwargs"
}
ignore_prompt_keys
=
(
"prompt"
,
"mm_kwargs"
)
a_rest
=
{
k
:
v
for
k
,
v
in
a
.
items
()
if
k
not
in
ignore_prompt_keys
}
b_rest
=
{
k
:
v
for
k
,
v
in
b
.
items
()
if
k
not
in
ignore_prompt_keys
}
assert
a_rest
==
b_rest
,
msg
...
...
tests/
v1/engine
/test_process_multi_modal_uuids.py
→
tests/
renderers
/test_process_multi_modal_uuids.py
View file @
574fe752
...
...
@@ -6,18 +6,17 @@ import pytest
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
CacheConfig
,
ModelConfig
,
VllmConfig
from
vllm.multimodal
import
MultiModalUUIDDict
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine.input_processor
import
InputProcessor
from
vllm.renderers.hf
import
HfRenderer
from
vllm.tokenizers.registry
import
tokenizer_args_from_config
cherry_pil_image
=
ImageAsset
(
"cherry_blossom"
).
pil_image
stop_pil_image
=
ImageAsset
(
"stop_sign"
).
pil_image
baby_reading_np_ndarrays
=
VideoAsset
(
"baby_reading"
).
np_ndarrays
def
_build_
input_processo
r
(
def
_build_
rendere
r
(
*
,
mm_cache_gb
:
float
=
4.0
,
enable_prefix_caching
:
bool
=
True
)
->
InputProcesso
r
:
)
->
HfRendere
r
:
model_config
=
ModelConfig
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
max_model_len
=
128
,
...
...
@@ -29,47 +28,45 @@ def _build_input_processor(
cache_config
=
CacheConfig
(
enable_prefix_caching
=
enable_prefix_caching
),
)
return
InputProcessor
(
vllm_config
)
_
,
tokenizer_name
,
_
,
kwargs
=
tokenizer_args_from_config
(
model_config
)
return
HfRenderer
.
from_config
(
vllm_config
,
tokenizer_kwargs
=
{
**
kwargs
,
"tokenizer_name"
:
tokenizer_name
},
)
def
test_multi_modal_uuids_length_mismatch_raises
():
input_processor
=
_build_input_processo
r
()
renderer
=
_build_rendere
r
()
prompt
=
{
"prompt"
:
"USER: <image>
\n
Describe
\n
ASSISTANT:"
,
"multi_modal_data"
:
{
"image"
:
[
cherry_pil_image
,
stop_pil_image
]},
# Mismatch: 2 items but only 1 uuid provided
"multi_modal_uuids"
:
{
"image"
:
[
"hash_cherry"
]},
}
mm_data
=
{
"image"
:
[
cherry_pil_image
,
stop_pil_image
]}
# Mismatch: 2 items but only 1 uuid provided
mm_uuids
=
{
"image"
:
[
"hash_cherry"
]}
mm_processor
=
renderer
.
get_mm_processor
()
mm_items
=
mm_processor
.
info
.
parse_mm_data
(
mm_data
)
with
pytest
.
raises
(
ValueError
,
match
=
"must have same length as"
):
input_processor
.
process_inputs
(
request_id
=
"req-1"
,
prompt
=
prompt
,
# type: ignore[arg-type]
params
=
SamplingParams
(),
)
renderer
.
_process_mm_uuids
(
mm_data
,
mm_items
,
mm_uuids
,
"req-1"
)
def
test_multi_modal_uuids_missing_modality_raises
():
input_processor
=
_build_input_processor
()
prompt
=
{
"prompt"
:
"USER: <image><video>
\n
Describe
\n
ASSISTANT:"
,
# Two modalities provided in data
"multi_modal_data"
:
{
"image"
:
[
cherry_pil_image
],
"video"
:
None
,
},
# Only image uuids provided; video missing should raise
"multi_modal_uuids"
:
{
"image"
:
[
"hash_cherry"
]},
renderer
=
_build_renderer
()
mm_data
=
{
"image"
:
[
cherry_pil_image
],
"video"
:
None
,
}
# Only image uuids provided; video missing should raise
mm_uuids
=
{
"image"
:
[
"hash_cherry"
]}
mm_processor
=
renderer
.
get_mm_processor
()
mm_items
=
mm_processor
.
info
.
parse_mm_data
(
mm_data
)
with
pytest
.
raises
(
ValueError
,
match
=
"is empty but .* is missing"
):
input_processor
.
process_inputs
(
request_id
=
"req-2"
,
prompt
=
prompt
,
# type: ignore[arg-type]
params
=
SamplingParams
(),
)
renderer
.
_process_mm_uuids
(
mm_data
,
mm_items
,
mm_uuids
,
"req-2"
)
@
pytest
.
mark
.
parametrize
(
...
...
@@ -83,92 +80,86 @@ def test_multi_modal_uuids_missing_modality_raises():
def
test_multi_modal_uuids_accepts_none_and_passes_through
(
monkeypatch
,
mm_cache_gb
:
float
,
enable_prefix_caching
:
bool
):
input_processor
=
_build_input_processo
r
(
renderer
=
_build_rendere
r
(
mm_cache_gb
=
mm_cache_gb
,
enable_prefix_caching
=
enable_prefix_caching
,
)
# Capture the overrides passed to InputPreprocessor.preprocess
captured
:
dict
[
str
,
object
]
=
{}
def
fake_preprocess
(
prompt
,
*
,
tokenization_kwargs
=
None
,
lora_request
=
None
,
mm_uuids
=
None
):
captured
[
"mm_uuids"
]
=
mm_uuids
# Minimal processed inputs for decoder-only flow
return
{
"type"
:
"token"
,
"prompt_token_ids"
:
[
1
]}
# Monkeypatch only the bound preprocess method on this instance
monkeypatch
.
setattr
(
input_processor
.
input_preprocessor
,
"preprocess"
,
fake_preprocess
,
raising
=
True
)
mm_data
=
{
"image"
:
[
cherry_pil_image
,
stop_pil_image
],
"video"
:
baby_reading_np_ndarrays
,
}
# Use a consistent two-image scenario across all configurations
mm_uuids
=
{
"image"
:
[
None
,
"hash_stop"
],
"video"
:
None
}
prompt
=
{
"prompt"
:
"USER: <image><image>
\n
Two images
\n
ASSISTANT:"
,
"multi_modal_data"
:
{
"image"
:
[
cherry_pil_image
,
stop_pil_image
],
"video"
:
baby_reading_np_ndarrays
,
},
"multi_modal_uuids"
:
mm_uuids
,
}
input
_processor
.
process_inputs
(
request_id
=
"req-3"
,
pro
mpt
=
prompt
,
# type: ignore[arg-type]
params
=
SamplingParams
(),
mm
_processor
=
renderer
.
get_mm_processor
()
mm_items
=
mm_processor
.
info
.
parse_mm_data
(
mm_data
)
pro
cessed_mm_uuids
=
renderer
.
_process_mm_uuids
(
mm_data
,
mm_items
,
mm_uuids
,
"req-3"
)
assert
captured
[
"
mm_uuids
"
]
==
mm_uuids
assert
processed_
mm_uuids
==
mm_uuids
def
test_multi_modal_uuids_ignored_when_caching_disabled
(
monkeypatch
):
# When both processor cache is 0 and prefix caching disabled, the
# processor builds overrides from request id instead of using user UUIDs.
input_processor
=
_build_input_processor
(
mm_cache_gb
=
0.0
,
enable_prefix_caching
=
False
@
pytest
.
mark
.
parametrize
(
"mm_cache_gb, enable_prefix_caching"
,
[
(
4.0
,
True
),
# default behavior
(
4.0
,
False
),
# prefix caching disabled
(
0.0
,
True
),
# processor cache disabled
],
)
def
test_multi_modal_uuids_accepts_empty
(
monkeypatch
,
mm_cache_gb
:
float
,
enable_prefix_caching
:
bool
):
renderer
=
_build_renderer
(
mm_cache_gb
=
mm_cache_gb
,
enable_prefix_caching
=
enable_prefix_caching
,
)
captured
:
dict
[
str
,
MultiModalUUIDDict
]
=
{}
# While None means cached multi-modal input requiring UUIDs
# an empty list means no multi-modal input
mm_data
=
{
"image"
:
[],
"video"
:
[]}
# type: ignore[var-annotated]
mm_uuids
=
{
"image"
:
[],
"video"
:
None
}
# type: ignore[var-annotated]
def
fake_preprocess
(
prompt
,
*
,
tokenization_kwargs
=
None
,
lora_request
=
None
,
mm_uuids
=
None
):
captured
[
"mm_uuids"
]
=
mm_uuids
return
{
"type"
:
"token"
,
"prompt_token_ids"
:
[
1
]}
monkeypatch
.
setattr
(
input_processor
.
input_preprocessor
,
"preprocess"
,
fake_preprocess
,
raising
=
True
mm_processor
=
renderer
.
get_mm_processor
()
mm_items
=
mm_processor
.
info
.
parse_mm_data
(
mm_data
)
processed_mm_uuids
=
renderer
.
_process_mm_uuids
(
mm_data
,
mm_items
,
mm_uuids
,
"req-4"
)
assert
processed_mm_uuids
==
mm_uuids
def
test_multi_modal_uuids_ignored_when_caching_disabled
(
monkeypatch
):
# When both processor cache is 0 and prefix caching disabled, the
# processor builds overrides from request id instead of using user UUIDs.
renderer
=
_build_renderer
(
mm_cache_gb
=
0.0
,
enable_prefix_caching
=
False
)
request_id
=
"req-42"
mm_uuids
=
{
"image"
:
[
"hash_cherry"
,
"hash_stop"
],
"video"
:
[
"hash_video"
]}
prompt
=
{
"prompt"
:
"USER: <image><image><video>
\n
Describe
\n
ASSISTANT:"
,
"multi_modal_data"
:
{
"image"
:
[
cherry_pil_image
,
stop_pil_image
],
"video"
:
[
baby_reading_np_ndarrays
],
},
"multi_modal_uuids"
:
mm_uuids
,
mm_data
=
{
"image"
:
[
cherry_pil_image
,
stop_pil_image
],
"video"
:
baby_reading_np_ndarrays
,
}
mm_uuids
=
{
"image"
:
[
"hash_cherry"
,
"hash_stop"
],
"video"
:
[
"hash_video"
]}
input
_processor
.
process_inputs
(
request_id
=
request_id
,
pro
mpt
=
prompt
,
# type: ignore[arg-type]
params
=
SamplingParams
(),
mm
_processor
=
renderer
.
get_mm_processor
()
mm_items
=
mm_processor
.
info
.
parse_mm_data
(
mm_data
)
pro
cessed_mm_uuids
=
renderer
.
_process_mm_uuids
(
mm_data
,
mm_items
,
mm_uuids
,
request_id
)
# Expect request-id-based overrides are passed through
assert
set
(
mm_uuids
.
keys
())
==
{
"image"
,
"video"
}
assert
len
(
mm_uuids
[
"image"
])
==
2
assert
len
(
mm_uuids
[
"video"
])
==
1
assert
captured
[
"
mm_uuids
"
]
[
"image"
][
0
].
startswith
(
assert
processed_
mm_uuids
[
"image"
][
0
].
startswith
(
f
"
{
request_id
}
-image-"
)
and
captured
[
"
mm_uuids
"
]
[
"image"
][
0
].
endswith
(
"-0"
)
assert
captured
[
"
mm_uuids
"
]
[
"image"
][
1
].
startswith
(
)
and
processed_
mm_uuids
[
"image"
][
0
].
endswith
(
"-0"
)
assert
processed_
mm_uuids
[
"image"
][
1
].
startswith
(
f
"
{
request_id
}
-image-"
)
and
captured
[
"
mm_uuids
"
]
[
"image"
][
1
].
endswith
(
"-1"
)
assert
captured
[
"
mm_uuids
"
]
[
"video"
][
0
].
startswith
(
)
and
processed_
mm_uuids
[
"image"
][
1
].
endswith
(
"-1"
)
assert
processed_
mm_uuids
[
"video"
][
0
].
startswith
(
f
"
{
request_id
}
-video-"
)
and
captured
[
"
mm_uuids
"
]
[
"video"
][
0
].
endswith
(
"-0"
)
)
and
processed_
mm_uuids
[
"video"
][
0
].
endswith
(
"-0"
)
tests/samplers/test_beam_search.py
View file @
574fe752
...
...
@@ -20,7 +20,6 @@ MM_BEAM_WIDTHS = [2]
MODELS
=
[
"TinyLlama/TinyLlama-1.1B-Chat-v1.0"
]
@
pytest
.
mark
.
skip_v1
# V1 engine does not yet support beam search
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
MAX_TOKENS
)
...
...
@@ -62,7 +61,6 @@ def test_beam_search_single_input(
)
@
pytest
.
mark
.
skip_v1
# V1 engine does not yet support beam search
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
MAX_TOKENS
)
...
...
vllm/beam_search.py
View file @
574fe752
...
...
@@ -2,13 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Any
from
vllm.inputs
import
TokenInputs
,
token_inputs
from
vllm.logprobs
import
Logprob
from
vllm.lora.request
import
LoRARequest
if
TYPE_CHECKING
:
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.multimodal.inputs
import
MultiModalInputs
,
mm_inputs
@
dataclass
...
...
@@ -19,6 +17,8 @@ class BeamSearchSequence:
about to be returned to the user.
"""
orig_prompt
:
TokenInputs
|
MultiModalInputs
# The tokens include the prompt.
tokens
:
list
[
int
]
logprobs
:
list
[
dict
[
int
,
Logprob
]]
...
...
@@ -27,8 +27,28 @@ class BeamSearchSequence:
text
:
str
|
None
=
None
finish_reason
:
str
|
None
=
None
stop_reason
:
int
|
str
|
None
=
None
multi_modal_data
:
"MultiModalDataDict | None"
=
None
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
def
get_prompt
(
self
):
prompt
=
self
.
orig_prompt
prompt_text
=
prompt
.
get
(
"prompt"
)
cache_salt
=
prompt
.
get
(
"cache_salt"
)
if
prompt
[
"type"
]
==
"token"
:
return
token_inputs
(
self
.
tokens
,
prompt
=
prompt_text
,
cache_salt
=
cache_salt
,
)
return
mm_inputs
(
prompt_token_ids
=
self
.
tokens
,
mm_kwargs
=
prompt
[
"mm_kwargs"
],
mm_hashes
=
prompt
[
"mm_hashes"
],
mm_placeholders
=
prompt
[
"mm_placeholders"
],
prompt
=
prompt_text
,
cache_salt
=
cache_salt
,
)
@
dataclass
...
...
@@ -44,14 +64,15 @@ class BeamSearchOutput:
class
BeamSearchInstance
:
def
__init__
(
self
,
prompt
_t
oken
s
:
list
[
int
]
,
prompt
:
T
oken
Inputs
|
MultiModalInputs
,
lora_request
:
LoRARequest
|
None
=
None
,
logprobs
:
list
[
dict
[
int
,
Logprob
]]
|
None
=
None
,
**
kwargs
,
):
self
.
beams
:
list
[
BeamSearchSequence
]
=
[
BeamSearchSequence
(
tokens
=
prompt_tokens
,
orig_prompt
=
prompt
,
tokens
=
prompt
[
"prompt_token_ids"
],
logprobs
=
[]
if
logprobs
is
None
else
list
(
logprobs
),
lora_request
=
lora_request
,
**
kwargs
,
...
...
vllm/engine/protocol.py
View file @
574fe752
...
...
@@ -11,13 +11,12 @@ from vllm.distributed.weight_transfer.base import (
WeightTransferInitRequest
,
WeightTransferUpdateRequest
,
)
from
vllm.inputs.data
import
PromptType
from
vllm.inputs.data
import
ProcessorInputs
,
PromptType
from
vllm.lora.request
import
LoRARequest
from
vllm.outputs
import
PoolingRequestOutput
,
RequestOutput
from
vllm.plugins.io_processors
import
IOProcessor
from
vllm.pooling_params
import
PoolingParams
from
vllm.renderers
import
BaseRenderer
from
vllm.renderers.inputs
import
DictPrompt
,
TokPrompt
from
vllm.sampling_params
import
SamplingParams
from
vllm.tasks
import
SupportedTask
from
vllm.v1.engine
import
EngineCoreRequest
...
...
@@ -35,7 +34,7 @@ class StreamingInput:
where inputs are provided via an async generator.
"""
prompt
:
Pro
mptType
prompt
:
Pro
cessorInputs
sampling_params
:
SamplingParams
|
None
=
None
...
...
@@ -69,8 +68,7 @@ class EngineClient(ABC):
self
,
prompt
:
EngineCoreRequest
|
PromptType
|
DictPrompt
|
TokPrompt
|
ProcessorInputs
|
AsyncGenerator
[
StreamingInput
,
None
],
sampling_params
:
SamplingParams
,
request_id
:
str
,
...
...
@@ -81,6 +79,7 @@ class EngineClient(ABC):
trace_headers
:
Mapping
[
str
,
str
]
|
None
=
None
,
priority
:
int
=
0
,
data_parallel_rank
:
int
|
None
=
None
,
reasoning_ended
:
bool
|
None
=
None
,
)
->
AsyncGenerator
[
RequestOutput
,
None
]:
"""Generate outputs for a request."""
...
...
...
@@ -88,13 +87,14 @@ class EngineClient(ABC):
@
abstractmethod
def
encode
(
self
,
prompt
:
PromptType
|
DictPrompt
|
TokPrompt
,
prompt
:
PromptType
|
ProcessorInputs
,
pooling_params
:
PoolingParams
,
request_id
:
str
,
lora_request
:
LoRARequest
|
None
=
None
,
trace_headers
:
Mapping
[
str
,
str
]
|
None
=
None
,
priority
:
int
=
0
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
reasoning_ended
:
bool
|
None
=
None
,
)
->
AsyncGenerator
[
PoolingRequestOutput
,
None
]:
"""Generate outputs for a request from a pooling model."""
...
...
...
vllm/entrypoints/llm.py
View file @
574fe752
...
...
@@ -3,8 +3,8 @@
import
itertools
import
warnings
from
collections.abc
import
Callable
,
Sequence
from
typing
import
TYPE_CHECKING
,
Any
,
cast
from
collections.abc
import
Callable
,
Iterable
,
Sequence
from
typing
import
TYPE_CHECKING
,
Any
import
cloudpickle
import
torch.nn
as
nn
...
...
@@ -55,6 +55,7 @@ from vllm.entrypoints.pooling.score.utils import (
from
vllm.entrypoints.utils
import
log_non_default_args
from
vllm.inputs.data
import
(
DataPrompt
,
ProcessorInputs
,
PromptType
,
SingletonPrompt
,
TextPrompt
,
...
...
@@ -73,10 +74,8 @@ from vllm.outputs import (
from
vllm.platforms
import
current_platform
from
vllm.pooling_params
import
PoolingParams
from
vllm.renderers
import
ChatParams
,
merge_kwargs
from
vllm.renderers.inputs
import
DictPrompt
,
TokPrompt
from
vllm.renderers.inputs.preprocess
import
(
conversation_to_seq
,
extract_prompt_components
,
parse_model_prompt
,
prompt_to_seq
,
)
...
...
@@ -86,6 +85,7 @@ from vllm.tokenizers import TokenizerLike
from
vllm.tokenizers.mistral
import
MistralTokenizer
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils.counter
import
Counter
from
vllm.utils.tqdm_utils
import
maybe_tqdm
from
vllm.v1.engine.llm_engine
import
LLMEngine
from
vllm.v1.sample.logits_processor
import
LogitsProcessor
...
...
@@ -400,7 +400,7 @@ class LLM:
sampling_params
:
SamplingParams
|
Sequence
[
SamplingParams
]
|
None
=
None
,
*
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
]
=
True
,
lora_request
:
list
[
LoRARequest
]
|
LoRARequest
|
None
=
None
,
lora_request
:
Sequence
[
LoRARequest
]
|
LoRARequest
|
None
=
None
,
priority
:
list
[
int
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
list
[
RequestOutput
]:
...
...
@@ -462,7 +462,7 @@ class LLM:
self
,
prompts
:
PromptType
|
Sequence
[
PromptType
],
sampling_params
:
SamplingParams
|
Sequence
[
SamplingParams
]
|
None
=
None
,
lora_request
:
list
[
LoRARequest
]
|
LoRARequest
|
None
=
None
,
lora_request
:
Sequence
[
LoRARequest
]
|
LoRARequest
|
None
=
None
,
priority
:
list
[
int
]
|
None
=
None
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
]
=
True
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
...
...
@@ -495,34 +495,32 @@ class LLM:
# Use the same preprocessing as _run_completion
seq_prompts
=
prompt_to_seq
(
prompts
)
seq_params
=
self
.
_params_to_seq
(
sampling_params
,
len
(
seq_prompts
))
if
any
(
param
.
truncate_prompt_tokens
is
not
None
for
param
in
seq_params
):
engine_prompts
:
Sequence
[
DictPrompt
|
TokPrompt
]
=
[
engine_prompt
for
prompt
,
param
in
zip
(
seq_prompts
,
seq_params
)
for
engine_prompt
in
self
.
_preprocess_cmpl
(
[
prompt
],
tokenization_kwargs
=
merge_kwargs
(
tokenization_kwargs
,
dict
(
truncate_prompt_tokens
=
param
.
truncate_prompt_tokens
),
seq_lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
len
(
seq_prompts
))
seq_tok_kwargs
=
[
merge_kwargs
(
tokenization_kwargs
,
dict
(
truncate_prompt_tokens
=
param
.
truncate_prompt_tokens
),
)
for
param
in
seq_params
]
seq_priority
=
self
.
_priority_to_seq
(
priority
,
len
(
prompts
))
request_ids
=
self
.
_render_and_add_requests
(
prompts
=
(
self
.
_preprocess_cmpl_one
(
prompt
,
tok_kwargs
)
for
prompt
,
tok_kwargs
in
zip
(
maybe_tqdm
(
seq_prompts
,
use_tqdm
=
use_tqdm
,
desc
=
"Rendering prompts"
,
),
seq_tok_kwargs
,
)
]
else
:
engine_prompts
=
self
.
_preprocess_cmpl
(
seq_prompts
,
tokenization_kwargs
=
tokenization_kwargs
,
)
request_ids
=
self
.
_validate_and_add_requests
(
prompts
=
engine_prompts
,
params
=
seq_params
,
use_tqdm
=
use_tqdm
,
lora_request
=
self
.
_get_modality_specific_lora_reqs
(
engine_prompts
,
lora_request
),
params
=
seq_params
,
lora_requests
=
seq_lora_requests
,
tokenization_kwargs
=
tokenization_kwargs
,
priorit
y
=
priority
,
priorit
ies
=
seq_
priority
,
)
return
request_ids
...
...
@@ -545,53 +543,41 @@ class LLM:
outputs
=
self
.
_run_engine
(
use_tqdm
=
use_tqdm
)
return
self
.
engine_class
.
validate_outputs
(
outputs
,
RequestOutput
)
def
_
get_modality_specific
_lora_reqs
(
def
_
resolve
_lora_reqs
(
self
,
prompts
:
Sequence
[
DictPrompt
|
TokPrompt
],
lora_request
:
list
[
LoRARequest
]
|
LoRARequest
|
None
,
prompts
:
Sequence
[
ProcessorInputs
],
lora_request
:
Sequence
[
LoRARequest
|
None
]
|
LoRARequest
|
None
,
):
# Grab the lora config off the vllm config on the engine,
# since this is the same for both v0 & v1.
lora_config
=
self
.
llm_engine
.
vllm_config
.
lora_config
seq_lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
len
(
prompts
))
# If there's no lora config / default_mm_loras, or the model
# isn't multimodal, leave the lora as is.
if
(
lora_config
is
None
or
not
self
.
model_config
.
is_multimodal_model
or
(
lora_config
and
lora_config
.
default_mm_loras
is
None
)
):
return
lora_request
optional_loras
=
(
[
lora_request
]
*
len
(
prompts
)
if
not
isinstance
(
lora_request
,
Sequence
)
else
lora_request
)
return
seq_lora_requests
return
[
self
.
_resolve_single_prompt_mm_lora
(
prompt
,
opt_
lora_req
,
lora_req
,
lora_config
.
default_mm_loras
,
)
for
prompt
,
opt_
lora_req
in
zip
(
prompts
,
optional_lora
s
)
for
prompt
,
lora_req
in
zip
(
prompts
,
seq_lora_request
s
)
]
def
_resolve_single_prompt_mm_lora
(
self
,
prompt
:
DictPrompt
|
TokPrompt
,
prompt
:
ProcessorInputs
,
lora_request
:
LoRARequest
|
None
,
default_mm_loras
:
dict
[
str
,
str
]
|
None
,
):
if
not
default_mm_loras
or
not
(
mm_data
:
=
prompt
.
get
(
"multi_modal_data"
)
or
{}
):
if
not
default_mm_loras
or
prompt
[
"type"
]
!=
"multimodal"
:
return
lora_request
intersection
=
set
(
mm_data
.
keys
()
# type: ignore
).
intersection
(
default_mm_loras
.
keys
())
prompt_modalities
=
prompt
[
"mm_placeholders"
].
keys
()
intersection
=
set
(
prompt_modalities
).
intersection
(
default_mm_loras
.
keys
())
if
not
intersection
:
return
lora_request
if
len
(
intersection
)
>
1
:
...
...
@@ -674,22 +660,6 @@ class LLM:
"""
return
self
.
llm_engine
.
apply_model
(
func
)
def
_get_beam_search_lora_requests
(
self
,
lora_request
:
list
[
LoRARequest
]
|
LoRARequest
|
None
,
prompts
:
list
[
TokensPrompt
|
TextPrompt
],
)
->
list
[
LoRARequest
|
None
]:
"""Get the optional lora request corresponding to each prompt."""
if
isinstance
(
lora_request
,
Sequence
)
and
len
(
lora_request
)
!=
len
(
prompts
):
raise
ValueError
(
"Lora request list should be the same length as the prompts"
)
if
lora_request
is
None
or
isinstance
(
lora_request
,
LoRARequest
):
return
[
lora_request
]
*
len
(
prompts
)
raise
TypeError
(
f
"Invalid lora_request type
{
type
(
lora_request
)
}
"
)
def
beam_search
(
self
,
prompts
:
list
[
TokensPrompt
|
TextPrompt
],
...
...
@@ -718,13 +688,12 @@ class LLM:
ignore_eos
=
params
.
ignore_eos
length_penalty
=
params
.
length_penalty
lora_requests
=
self
.
_get_beam_search_lora_requests
(
lora_request
,
prompts
)
tokenizer
=
self
.
renderer
.
get_tokenizer
()
eos_token_id
=
tokenizer
.
eos_token_id
sort_beams_key
=
create_sort_beams_key_function
(
eos_token_id
,
length_penalty
)
tokenizer
=
self
.
get_tokenizer
()
sort_beams_key
=
create_sort_beams_key_function
(
tokenizer
.
eos_token_id
,
length_penalty
,
)
engine_prompts
=
self
.
_preprocess_cmpl
(
prompts
)
lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
len
(
engine_prompts
))
if
use_tqdm
and
concurrency_limit
is
not
None
:
logger
.
warning
(
...
...
@@ -734,21 +703,12 @@ class LLM:
use_tqdm
=
False
if
concurrency_limit
is
None
:
concurrency_limit
=
len
(
prompts
)
def
create_tokens_prompt_from_beam
(
beam
:
BeamSearchSequence
)
->
TokensPrompt
:
token_prompt_kwargs
:
TokensPrompt
=
{
"prompt_token_ids"
:
beam
.
tokens
}
if
beam
.
multi_modal_data
is
not
None
:
token_prompt_kwargs
[
"multi_modal_data"
]
=
beam
.
multi_modal_data
if
beam
.
mm_processor_kwargs
is
not
None
:
token_prompt_kwargs
[
"mm_processor_kwargs"
]
=
beam
.
mm_processor_kwargs
return
TokensPrompt
(
**
token_prompt_kwargs
)
concurrency_limit
=
len
(
engine_prompts
)
# generate 2 * beam_width candidates at each step
# following the huggingface transformers implementation
# at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa
beam_search
_params
=
SamplingParams
(
sampling
_params
=
SamplingParams
(
logprobs
=
2
*
beam_width
,
max_tokens
=
1
,
temperature
=
temperature
,
...
...
@@ -756,30 +716,25 @@ class LLM:
)
instances
:
list
[
BeamSearchInstance
]
=
[]
for
lora_req
,
prompt
in
zip
(
lora_requests
,
prompts
):
# Add multimodal processor kwargs & data
mm_kwargs
=
{}
if
"multi_modal_data"
in
prompt
:
mm_kwargs
[
"multi_modal_data"
]
=
prompt
[
"multi_modal_data"
]
if
"mm_processor_kwargs"
in
prompt
:
mm_kwargs
[
"mm_processor_kwargs"
]
=
prompt
[
"mm_processor_kwargs"
]
if
"prompt_token_ids"
in
prompt
:
prompt
=
cast
(
TokensPrompt
,
prompt
)
# Needed for mypy
prompt_tokens
=
prompt
[
"prompt_token_ids"
]
else
:
prompt_tokens
=
tokenizer
.
encode
(
prompt
[
"prompt"
])
for
lora_req
,
prompt
in
zip
(
lora_requests
,
engine_prompts
):
if
prompt
[
"type"
]
==
"embeds"
:
raise
NotImplementedError
(
"Embedding prompt not supported for beam search"
)
if
prompt
[
"type"
]
==
"enc_dec"
:
raise
NotImplementedError
(
"Encoder-decoder prompt not supported for beam search"
)
instances
.
append
(
BeamSearchInstance
(
prompt
_tokens
,
prompt
,
lora_request
=
lora_req
,
logprobs
=
None
,
**
mm_kwargs
,
),
)
for
prompt_start
in
range
(
0
,
len
(
prompt
s
),
concurrency_limit
):
for
prompt_start
in
range
(
0
,
len
(
instance
s
),
concurrency_limit
):
instances_batch
=
instances
[
prompt_start
:
prompt_start
+
concurrency_limit
]
token_iter
=
range
(
max_tokens
)
...
...
@@ -808,22 +763,15 @@ class LLM:
if
len
(
all_beams
)
==
0
:
break
# create corresponding batch entries for prompt & optional lora
prompts_batch
,
lora_req_batch
=
zip
(
*
[
(
create_tokens_prompt_from_beam
(
beam
),
beam
.
lora_request
)
for
beam
in
all_beams
]
)
# only runs for one step
# we don't need to use tqdm here
output
=
self
.
generate
(
prompts_batch
,
sampling_params
=
beam_search_params
,
raw_output
=
self
.
_render_and_run_requests
(
prompts
=
(
beam
.
get_prompt
()
for
beam
in
all_beams
),
params
=
self
.
_params_to_seq
(
sampling_params
,
len
(
all_beams
)),
lora_requests
=
[
beam
.
lora_request
for
beam
in
all_beams
],
use_tqdm
=
False
,
lora_request
=
lora_req_batch
,
)
output
=
self
.
engine_class
.
validate_outputs
(
raw_output
,
RequestOutput
)
for
(
start
,
end
),
instance
in
zip
(
instance_start_and_end
,
instances_batch
...
...
@@ -841,19 +789,15 @@ class LLM:
logprobs
=
result
.
outputs
[
0
].
logprobs
[
0
]
for
token_id
,
logprob_obj
in
logprobs
.
items
():
new_beam
=
BeamSearchSequence
(
current_beam
.
orig_prompt
,
tokens
=
current_beam
.
tokens
+
[
token_id
],
logprobs
=
current_beam
.
logprobs
+
[
logprobs
],
lora_request
=
current_beam
.
lora_request
,
cum_logprob
=
current_beam
.
cum_logprob
+
logprob_obj
.
logprob
,
multi_modal_data
=
current_beam
.
multi_modal_data
,
mm_processor_kwargs
=
current_beam
.
mm_processor_kwargs
,
)
if
(
token_id
==
tokenizer
.
eos_token_id
and
not
ignore_eos
):
if
token_id
==
eos_token_id
and
not
ignore_eos
:
instance
.
completed
.
append
(
new_beam
)
else
:
instance_new_beams
.
append
(
new_beam
)
...
...
@@ -872,6 +816,7 @@ class LLM:
for
beam
in
best_beams
:
beam
.
text
=
tokenizer
.
decode
(
beam
.
tokens
)
outputs
.
append
(
BeamSearchOutput
(
sequences
=
best_beams
))
return
outputs
...
...
@@ -880,7 +825,7 @@ class LLM:
self
,
prompts
:
Sequence
[
PromptType
],
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Sequence
[
DictPrompt
|
TokPrompt
]:
)
->
Sequence
[
ProcessorInputs
]:
"""
Convert prompt inputs from LLM APIs (other than [LLM.chat][]) into
a format that can be passed to `_add_request`.
...
...
@@ -888,8 +833,7 @@ class LLM:
Refer to [LLM.generate][] for a complete description of the arguments.
Returns:
A list of `TokPrompt` objects containing the tokenized prompt
after chat template interpolation, and the raw multi-modal inputs.
A list of `ProcessorInputs` objects ready to be passed into LLMEngine.
"""
renderer
=
self
.
renderer
model_config
=
self
.
model_config
...
...
@@ -903,6 +847,14 @@ class LLM:
return
renderer
.
render_cmpl
(
parsed_prompts
,
tok_params
)
def
_preprocess_cmpl_one
(
self
,
prompt
:
PromptType
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
ProcessorInputs
:
(
engine_prompt
,)
=
self
.
_preprocess_cmpl
([
prompt
],
tokenization_kwargs
)
return
engine_prompt
def
_preprocess_chat
(
self
,
conversations
:
Sequence
[
list
[
ChatCompletionMessageParam
]],
...
...
@@ -914,7 +866,7 @@ class LLM:
tools
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Sequence
[
TokPrompt
]:
)
->
Sequence
[
ProcessorInputs
]:
"""
Convert a list of conversations into prompts so that they can then
be used as input for other LLM APIs.
...
...
@@ -922,8 +874,7 @@ class LLM:
Refer to [LLM.chat][] for a complete description of the arguments.
Returns:
A list of `TokPrompt` objects containing the tokenized prompt
after chat template interpolation, and the raw multi-modal inputs.
A list of `ProcessorInputs` objects ready to be passed into LLMEngine.
"""
renderer
=
self
.
renderer
...
...
@@ -953,13 +904,39 @@ class LLM:
return
engine_prompts
def
_preprocess_chat_one
(
self
,
conversation
:
list
[
ChatCompletionMessageParam
],
chat_template
:
str
|
None
=
None
,
chat_template_content_format
:
ChatTemplateContentFormatOption
=
"auto"
,
chat_template_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
add_generation_prompt
:
bool
=
True
,
continue_final_message
:
bool
=
False
,
tools
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
ProcessorInputs
:
(
engine_prompt
,)
=
self
.
_preprocess_chat
(
[
conversation
],
chat_template
=
chat_template
,
chat_template_content_format
=
chat_template_content_format
,
chat_template_kwargs
=
chat_template_kwargs
,
add_generation_prompt
=
add_generation_prompt
,
continue_final_message
=
continue_final_message
,
tools
=
tools
,
tokenization_kwargs
=
tokenization_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
return
engine_prompt
def
chat
(
self
,
messages
:
list
[
ChatCompletionMessageParam
]
|
Sequence
[
list
[
ChatCompletionMessageParam
]],
sampling_params
:
SamplingParams
|
Sequence
[
SamplingParams
]
|
None
=
None
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
]
=
True
,
lora_request
:
LoRARequest
|
None
=
None
,
lora_request
:
Sequence
[
LoRARequest
]
|
LoRARequest
|
None
=
None
,
chat_template
:
str
|
None
=
None
,
chat_template_content_format
:
ChatTemplateContentFormatOption
=
"auto"
,
add_generation_prompt
:
bool
=
True
,
...
...
@@ -1805,47 +1782,41 @@ class LLM:
|
Sequence
[
SamplingParams
|
PoolingParams
],
*
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
]
=
True
,
lora_request
:
list
[
LoRARequest
]
|
LoRARequest
|
None
=
None
,
lora_request
:
Sequence
[
LoRARequest
]
|
LoRARequest
|
None
=
None
,
priority
:
list
[
int
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
):
seq_prompts
=
prompt_to_seq
(
prompts
)
seq_params
=
self
.
_params_to_seq
(
params
,
len
(
seq_prompts
))
if
any
(
param
.
truncate_prompt_tokens
is
not
None
for
param
in
seq_params
):
# TODO: Remove this after deprecating `param.truncate_prompt_tokens`
# Then, move the code from the `else` block to the top and let
# `self._preprocess_cmpl` handle prompt normalization
engine_prompts
:
Sequence
[
DictPrompt
|
TokPrompt
]
=
[
engine_prompt
for
prompt
,
param
in
zip
(
seq_prompts
,
seq_params
)
for
engine_prompt
in
self
.
_preprocess_cmpl
(
[
prompt
],
tokenization_kwargs
=
merge_kwargs
(
tokenization_kwargs
,
dict
(
truncate_prompt_tokens
=
param
.
truncate_prompt_tokens
),
seq_lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
len
(
seq_prompts
))
seq_tok_kwargs
=
[
merge_kwargs
(
tokenization_kwargs
,
dict
(
truncate_prompt_tokens
=
param
.
truncate_prompt_tokens
),
)
for
param
in
seq_params
]
seq_priority
=
self
.
_priority_to_seq
(
priority
,
len
(
prompts
))
return
self
.
_render_and_run_requests
(
prompts
=
(
self
.
_preprocess_cmpl_one
(
prompt
,
tok_kwargs
)
for
prompt
,
tok_kwargs
in
zip
(
maybe_tqdm
(
seq_prompts
,
use_tqdm
=
use_tqdm
,
desc
=
"Rendering prompts"
,
),
seq_tok_kwargs
,
)
]
else
:
engine_prompts
=
self
.
_preprocess_cmpl
(
seq_prompts
,
tokenization_kwargs
=
tokenization_kwargs
,
)
self
.
_validate_and_add_requests
(
prompts
=
engine_prompts
,
),
params
=
seq_params
,
use_tqdm
=
use_tqdm
,
lora_request
=
self
.
_get_modality_specific_lora_reqs
(
engine_prompts
,
lora_request
),
lora_requests
=
seq_lora_requests
,
tokenization_kwargs
=
tokenization_kwargs
,
priorit
y
=
priority
,
priorit
ies
=
seq_
priority
,
)
return
self
.
_run_engine
(
use_tqdm
=
use_tqdm
)
def
_run_chat
(
self
,
messages
:
list
[
ChatCompletionMessageParam
]
...
...
@@ -1855,7 +1826,7 @@ class LLM:
|
Sequence
[
SamplingParams
|
PoolingParams
],
*
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
]
=
True
,
lora_request
:
LoRARequest
|
None
=
None
,
lora_request
:
Sequence
[
LoRARequest
]
|
LoRARequest
|
None
=
None
,
chat_template
:
str
|
None
=
None
,
chat_template_content_format
:
ChatTemplateContentFormatOption
=
"auto"
,
add_generation_prompt
:
bool
=
True
,
...
...
@@ -1865,68 +1836,94 @@ class LLM:
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
):
engine_prompts
=
self
.
_preprocess_chat
(
conversation_to_seq
(
messages
),
chat_template
=
chat_template
,
chat_template_content_format
=
chat_template_content_format
,
chat_template_kwargs
=
chat_template_kwargs
,
add_generation_prompt
=
add_generation_prompt
,
continue_final_message
=
continue_final_message
,
tools
=
tools
,
seq_convs
=
conversation_to_seq
(
messages
)
seq_params
=
self
.
_params_to_seq
(
params
,
len
(
seq_convs
))
seq_lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
len
(
seq_convs
))
seq_tok_kwargs
=
[
merge_kwargs
(
tokenization_kwargs
,
dict
(
truncate_prompt_tokens
=
param
.
truncate_prompt_tokens
),
)
for
param
in
seq_params
]
return
self
.
_render_and_run_requests
(
prompts
=
(
self
.
_preprocess_chat_one
(
conversation
,
chat_template
=
chat_template
,
chat_template_content_format
=
chat_template_content_format
,
chat_template_kwargs
=
chat_template_kwargs
,
add_generation_prompt
=
add_generation_prompt
,
continue_final_message
=
continue_final_message
,
tools
=
tools
,
tokenization_kwargs
=
tok_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
for
conversation
,
tok_kwargs
in
zip
(
maybe_tqdm
(
seq_convs
,
use_tqdm
=
use_tqdm
,
desc
=
"Rendering conversations"
,
),
seq_tok_kwargs
,
)
),
params
=
seq_params
,
lora_requests
=
seq_lora_requests
,
use_tqdm
=
use_tqdm
,
tokenization_kwargs
=
tokenization_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
self
.
_validate_and_add_requests
(
prompts
=
engine_prompts
,
def
_render_and_run_requests
(
self
,
prompts
:
Iterable
[
ProcessorInputs
],
params
:
Sequence
[
SamplingParams
|
PoolingParams
],
*
,
lora_requests
:
Sequence
[
LoRARequest
|
None
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
priorities
:
Sequence
[
int
]
|
None
=
None
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
]
=
True
,
):
if
isinstance
(
prompts
,
(
list
,
tuple
)):
logger
.
warning_once
(
"Rendering all prompts before adding them to the engine "
"is less efficient than performing both on the same prompt "
"before processing the next prompt. You should instead pass "
"a generator that renders one prompt per iteration, as that allows "
"engine execution to begin for the first prompt while processing "
"the next prompt."
)
self
.
_render_and_add_requests
(
prompts
=
prompts
,
params
=
params
,
use_tqdm
=
use_tqdm
,
lora_request
=
self
.
_get_modality_specific_lora_reqs
(
engine_prompts
,
lora_request
),
lora_requests
=
lora_requests
,
tokenization_kwargs
=
tokenization_kwargs
,
priorities
=
priorities
,
)
return
self
.
_run_engine
(
use_tqdm
=
use_tqdm
)
def
_
validate
_and_add_requests
(
def
_
render
_and_add_requests
(
self
,
prompts
:
Sequence
[
DictPrompt
|
TokPrompt
],
params
:
SamplingParams
|
PoolingParams
|
Sequence
[
SamplingParams
|
PoolingParams
],
prompts
:
Iterable
[
ProcessorInputs
],
params
:
Sequence
[
SamplingParams
|
PoolingParams
],
*
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
]
=
True
,
lora_request
:
Sequence
[
LoRARequest
|
None
]
|
LoRARequest
|
None
,
lora_requests
:
Sequence
[
LoRARequest
|
None
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
priorit
y
:
list
[
int
]
|
None
=
None
,
priorit
ies
:
Sequence
[
int
]
|
None
=
None
,
)
->
list
[
str
]:
num_requests
=
len
(
prompts
)
seq_params
=
self
.
_params_to_seq
(
params
,
num_requests
)
seq_lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
num_requests
)
seq_priority
=
self
.
_priority_to_seq
(
priority
,
num_requests
)
for
sp
in
seq_params
:
if
isinstance
(
sp
,
SamplingParams
):
# We only care about the final output
sp
.
output_kind
=
RequestOutputKind
.
FINAL_ONLY
# Add requests to the engine.
it
=
prompts
if
use_tqdm
:
tqdm_func
=
use_tqdm
if
callable
(
use_tqdm
)
else
tqdm
it
=
tqdm_func
(
it
,
desc
=
"Adding requests"
)
added_request_ids
:
list
[
str
]
=
[]
try
:
for
i
,
prompt
in
enumerate
(
it
):
for
i
,
prompt
in
enumerate
(
prompts
):
request_id
=
self
.
_add_request
(
prompt
,
seq_
params
[
i
],
lora_request
=
seq_
lora_requests
[
i
],
params
[
i
],
lora_request
=
None
if
lora_requests
is
None
else
lora_requests
[
i
],
tokenization_kwargs
=
tokenization_kwargs
,
priority
=
seq_
priorit
y
[
i
],
priority
=
0
if
priorities
is
None
else
priorit
ies
[
i
],
)
added_request_ids
.
append
(
request_id
)
except
Exception
as
e
:
...
...
@@ -1938,13 +1935,16 @@ class LLM:
def
_add_request
(
self
,
prompt
:
Pro
mptType
|
DictPrompt
|
TokPrompt
,
prompt
:
Pro
cessorInputs
,
params
:
SamplingParams
|
PoolingParams
,
lora_request
:
LoRARequest
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
priority
:
int
=
0
,
)
->
str
:
prompt_text
,
_
,
_
=
extract_prompt_components
(
self
.
model_config
,
prompt
)
if
isinstance
(
params
,
SamplingParams
):
# We only care about the final output
params
.
output_kind
=
RequestOutputKind
.
FINAL_ONLY
request_id
=
str
(
next
(
self
.
request_counter
))
if
params
.
truncate_prompt_tokens
is
not
None
:
...
...
@@ -1962,32 +1962,14 @@ class LLM:
dict
(
truncate_prompt_tokens
=
params
.
truncate_prompt_tokens
),
)
renderer
=
self
.
renderer
tok_params
=
renderer
.
default_cmpl_tok_params
.
with_kwargs
(
**
(
tokenization_kwargs
or
{})
)
tokenization_kwargs
=
tok_params
.
get_encode_kwargs
()
engine_request
=
self
.
input_processor
.
process_inputs
(
return
self
.
llm_engine
.
add_request
(
request_id
,
prompt
,
params
,
lora_request
=
lora_request
,
tokenization_kwargs
=
tokenization_kwargs
,
priority
=
priority
,
supported_tasks
=
self
.
supported_tasks
,
)
self
.
llm_engine
.
add_request
(
request_id
,
engine_request
,
params
,
lora_request
=
lora_request
,
tokenization_kwargs
=
tokenization_kwargs
,
priority
=
priority
,
prompt_text
=
prompt_text
,
)
return
engine_request
.
request_id
def
_run_engine
(
self
,
...
...
vllm/entrypoints/openai/chat_completion/serving.py
View file @
574fe752
...
...
@@ -67,13 +67,12 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
)
from
vllm.entrypoints.openai.utils
import
maybe_filter_parallel_tool_calls
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.inputs.data
import
TokensPrompt
from
vllm.inputs.data
import
ProcessorInputs
,
TokensPrompt
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.parser
import
ParserManager
from
vllm.reasoning
import
ReasoningParser
from
vllm.renderers.inputs
import
TokPrompt
from
vllm.sampling_params
import
BeamSearchParams
,
SamplingParams
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers.mistral
import
(
...
...
@@ -221,7 +220,7 @@ class OpenAIServingChat(OpenAIServing):
async
def
render_chat_request
(
self
,
request
:
ChatCompletionRequest
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
TokPrompt
]]
|
ErrorResponse
:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]
|
ErrorResponse
:
"""
render chat request by validating and preprocessing inputs.
...
...
@@ -380,7 +379,9 @@ class OpenAIServingChat(OpenAIServing):
generators
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
try
:
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
prompt_text
=
self
.
_extract_prompt_text
(
engine_prompt
)
prompt_token_ids
=
self
.
_extract_prompt_components
(
engine_prompt
).
token_ids
# If we are creating sub requests for multiple prompts, ensure that they
# have unique request ids.
...
...
@@ -431,35 +432,21 @@ class OpenAIServingChat(OpenAIServing):
trace_headers
=
trace_headers
,
)
else
:
tok_params
=
request
.
build_tok_params
(
self
.
model_config
)
tokenization_kwargs
=
tok_params
.
get_encode_kwargs
()
engine_request
=
self
.
input_processor
.
process_inputs
(
sub_request_id
,
engine_prompt
,
sampling_params
,
lora_request
=
lora_request
,
tokenization_kwargs
=
tokenization_kwargs
,
trace_headers
=
trace_headers
,
priority
=
request
.
priority
,
data_parallel_rank
=
data_parallel_rank
,
reasoning_ended
=
(
reasoning_parser
.
is_reasoning_end
(
prompt_token_ids
or
[])
if
reasoning_parser
else
None
)
reasoning_ended
=
None
if
reasoning_parser
:
reasoning_ended
=
reasoning_parser
.
is_reasoning_end
(
engine_request
.
prompt_token_ids
or
[]
# type: ignore[attr-defined]
)
engine_request
.
reasoning_ended
=
reasoning_ended
generator
=
self
.
engine_client
.
generate
(
engine_
reques
t
,
engine_
promp
t
,
sampling_params
,
sub_request_id
,
lora_request
=
lora_request
,
trace_headers
=
trace_headers
,
priority
=
request
.
priority
,
prompt_text
=
prompt_text
,
tokenization_kwargs
=
tokenization_kwargs
,
data_parallel_rank
=
data_parallel_rank
,
reasoning_ended
=
reasoning_ended
,
)
generators
.
append
(
generator
)
...
...
vllm/entrypoints/openai/completion/serving.py
View file @
574fe752
...
...
@@ -34,10 +34,10 @@ from vllm.entrypoints.openai.engine.serving import (
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs.data
import
ProcessorInputs
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
RequestOutput
from
vllm.renderers.inputs
import
TokPrompt
from
vllm.sampling_params
import
BeamSearchParams
,
SamplingParams
from
vllm.tokenizers
import
TokenizerLike
from
vllm.utils.async_utils
import
merge_async_iterators
...
...
@@ -80,7 +80,7 @@ class OpenAIServingCompletion(OpenAIServing):
async
def
render_completion_request
(
self
,
request
:
CompletionRequest
,
)
->
list
[
TokPrompt
]
|
ErrorResponse
:
)
->
list
[
ProcessorInputs
]
|
ErrorResponse
:
"""
render completion request by validating and preprocessing inputs.
...
...
@@ -163,8 +163,6 @@ class OpenAIServingCompletion(OpenAIServing):
generators
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
try
:
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
prompt_text
=
self
.
_extract_prompt_text
(
engine_prompt
)
max_tokens
=
get_max_tokens
(
max_model_len
,
request
.
max_tokens
,
...
...
@@ -208,29 +206,13 @@ class OpenAIServingCompletion(OpenAIServing):
trace_headers
=
trace_headers
,
)
else
:
tok_params
=
request
.
build_tok_params
(
self
.
model_config
)
tokenization_kwargs
=
tok_params
.
get_encode_kwargs
()
engine_request
=
self
.
input_processor
.
process_inputs
(
request_id_item
,
engine_prompt
,
sampling_params
,
lora_request
=
lora_request
,
tokenization_kwargs
=
tokenization_kwargs
,
trace_headers
=
trace_headers
,
priority
=
request
.
priority
,
data_parallel_rank
=
data_parallel_rank
,
)
generator
=
self
.
engine_client
.
generate
(
engine_
reques
t
,
engine_
promp
t
,
sampling_params
,
request_id_item
,
lora_request
=
lora_request
,
trace_headers
=
trace_headers
,
priority
=
request
.
priority
,
prompt_text
=
prompt_text
,
tokenization_kwargs
=
tokenization_kwargs
,
data_parallel_rank
=
data_parallel_rank
,
)
...
...
@@ -312,7 +294,7 @@ class OpenAIServingCompletion(OpenAIServing):
async
def
completion_stream_generator
(
self
,
request
:
CompletionRequest
,
engine_prompts
:
list
[
TokPrompt
],
engine_prompts
:
list
[
ProcessorInputs
],
result_generator
:
AsyncIterator
[
tuple
[
int
,
RequestOutput
]],
request_id
:
str
,
created_time
:
int
,
...
...
vllm/entrypoints/openai/engine/serving.py
View file @
574fe752
...
...
@@ -96,15 +96,19 @@ from vllm.entrypoints.serve.tokenize.protocol import (
)
from
vllm.entrypoints.utils
import
get_max_tokens
,
sanitize_message
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs.data
import
PromptType
,
SingletonPrompt
,
TokensPrompt
from
vllm.inputs.data
import
(
ProcessorInputs
,
PromptType
,
SingletonPrompt
,
TokensPrompt
,
token_inputs
,
)
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
,
PromptLogprobs
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.outputs
import
CompletionOutput
,
PoolingRequestOutput
,
RequestOutput
from
vllm.pooling_params
import
PoolingParams
from
vllm.renderers
import
ChatParams
,
TokenizeParams
,
merge_kwargs
from
vllm.renderers.inputs
import
TokPrompt
from
vllm.renderers.inputs.preprocess
import
(
extract_prompt_components
,
extract_prompt_len
,
...
...
@@ -206,7 +210,7 @@ class ServeContext(Generic[RequestT]):
request_id
:
str
created_time
:
int
=
field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
lora_request
:
LoRARequest
|
None
=
None
engine_prompts
:
list
[
TokPrompt
]
|
None
=
None
engine_prompts
:
list
[
ProcessorInputs
]
|
None
=
None
result_generator
:
AsyncGenerator
[
tuple
[
int
,
PoolingRequestOutput
],
None
]
|
None
=
(
None
...
...
@@ -249,7 +253,7 @@ class OpenAIServing:
async
def
beam_search
(
self
,
prompt
:
TokPrompt
,
prompt
:
ProcessorInputs
,
request_id
:
str
,
params
:
BeamSearchParams
,
lora_request
:
LoRARequest
|
None
=
None
,
...
...
@@ -262,86 +266,53 @@ class OpenAIServing:
length_penalty
=
params
.
length_penalty
include_stop_str_in_output
=
params
.
include_stop_str_in_output
input_processor
=
self
.
input_processor
tokenizer
=
input_processor
.
tokenizer
if
tokenizer
is
None
:
raise
VLLMValidationError
(
"You cannot use beam search when `skip_tokenizer_init=True`"
,
parameter
=
"skip_tokenizer_init"
,
value
=
True
,
)
eos_token_id
:
int
=
tokenizer
.
eos_token_id
# type: ignore
if
isinstance
(
prompt
,
dict
)
and
"encoder_prompt"
in
prompt
:
raise
NotImplementedError
(
"Encoder-decoder prompt not supported"
)
prompt_text
:
str
|
None
=
prompt
.
get
(
"prompt"
)
# type: ignore
prompt_token_ids
:
list
[
int
]
=
prompt
.
get
(
"prompt_token_ids"
,
[])
# type: ignore
multi_modal_data
:
MultiModalDataDict
|
None
=
prompt
.
get
(
"multi_modal_data"
)
# type: ignore
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
tokenizer
=
self
.
renderer
.
get_tokenizer
()
eos_token_id
=
tokenizer
.
eos_token_id
sort_beams_key
=
create_sort_beams_key_function
(
eos_token_id
,
length_penalty
)
# This is a workaround to fix multimodal beam search; this is a
# bandaid fix for 2 small problems:
# 1. Multi_modal_data on the processed_inputs currently resolves to
# `None`.
# 2. preprocessing above expands the multimodal placeholders. However,
# this happens again in generation, so the double expansion causes
# a mismatch.
# TODO - would be ideal to handle this more gracefully.
if
prompt
[
"type"
]
==
"embeds"
:
raise
NotImplementedError
(
"Embedding prompt not supported for beam search"
)
if
prompt
[
"type"
]
==
"enc_dec"
:
raise
NotImplementedError
(
"Encoder-decoder prompt not supported for beam search"
)
prompt_text
=
prompt
.
get
(
"prompt"
)
prompt_token_ids
=
prompt
[
"prompt_token_ids"
]
tokenized_length
=
len
(
prompt_token_ids
)
sort_beams_key
=
create_sort_beams_key_function
(
eos_token_id
,
length_penalty
)
logprobs_num
=
2
*
beam_width
beam_search
_params
=
SamplingParams
(
sampling
_params
=
SamplingParams
(
logprobs
=
logprobs_num
,
max_tokens
=
1
,
temperature
=
temperature
,
)
all_beams
=
[
BeamSearchSequence
(
orig_prompt
=
prompt
,
tokens
=
prompt_token_ids
,
cum_logprob
=
0
,
logprobs
=
[],
multi_modal_data
=
multi_modal_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
lora_request
=
lora_request
,
)
]
completed
=
[]
for
_
in
range
(
max_tokens
):
prompts_batch
,
lora_req_batch
=
zip
(
*
[
(
TokensPrompt
(
prompt_token_ids
=
beam
.
tokens
,
multi_modal_data
=
beam
.
multi_modal_data
,
mm_processor_kwargs
=
beam
.
mm_processor_kwargs
,
),
beam
.
lora_request
,
)
for
beam
in
all_beams
]
)
tasks
=
[]
request_id_batch
=
f
"
{
request_id
}
-
{
random_uuid
()
}
"
for
i
,
(
individual_prompt
,
lora_req
)
in
enumerate
(
zip
(
prompt
s_batch
,
lora_req_batch
)
):
for
i
,
beam
in
enumerate
(
all_beams
):
prompt
_item
=
beam
.
get_prompt
(
)
lora_request_item
=
beam
.
lora_request
request_id_item
=
f
"
{
request_id_batch
}
-beam-
{
i
}
"
task
=
asyncio
.
create_task
(
collect_from_async_generator
(
self
.
engine_client
.
generate
(
individual_
prompt
,
beam_search
_params
,
prompt
_item
,
sampling
_params
,
request_id_item
,
lora_request
=
lora_req
,
lora_request
=
lora_req
uest_item
,
trace_headers
=
trace_headers
,
)
)
...
...
@@ -406,6 +377,7 @@ class OpenAIServing:
logprobs_entry
=
result
.
outputs
[
0
].
logprobs
[
0
]
completed
.
append
(
BeamSearchSequence
(
orig_prompt
=
prompt
,
tokens
=
current_beam
.
tokens
+
[
eos_token_id
]
if
include_stop_str_in_output
else
current_beam
.
tokens
,
...
...
@@ -433,12 +405,11 @@ class OpenAIServing:
logprobs_entry
=
result
.
outputs
[
0
].
logprobs
[
0
]
new_beams
.
append
(
BeamSearchSequence
(
orig_prompt
=
prompt
,
tokens
=
current_beam
.
tokens
+
[
token_id
],
logprobs
=
current_beam
.
logprobs
+
[
logprobs_entry
],
lora_request
=
current_beam
.
lora_request
,
cum_logprob
=
float
(
all_beams_logprob
[
idx
]),
multi_modal_data
=
current_beam
.
multi_modal_data
,
mm_processor_kwargs
=
current_beam
.
mm_processor_kwargs
,
)
)
...
...
@@ -958,7 +929,7 @@ class OpenAIServing:
request
:
RendererRequest
,
prompt_input
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]]
|
None
,
prompt_embeds
:
bytes
|
list
[
bytes
]
|
None
,
)
->
list
[
TokPrompt
]:
)
->
list
[
ProcessorInputs
]:
prompts
=
list
[
SingletonPrompt
|
bytes
]()
if
prompt_embeds
is
not
None
:
# embeds take higher priority
prompts
.
extend
(
prompt_to_seq
(
prompt_embeds
))
...
...
@@ -971,7 +942,7 @@ class OpenAIServing:
self
,
request
:
RendererRequest
,
prompts
:
Sequence
[
PromptType
|
bytes
],
)
->
list
[
TokPrompt
]:
)
->
list
[
ProcessorInputs
]:
renderer
=
self
.
renderer
model_config
=
self
.
model_config
...
...
@@ -1004,7 +975,7 @@ class OpenAIServing:
default_template_kwargs
:
dict
[
str
,
Any
]
|
None
,
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tool_parser
:
Callable
[[
TokenizerLike
],
ToolParser
]
|
None
=
None
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
TokPrompt
]]:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]:
from
vllm.tokenizers.mistral
import
MistralTokenizer
renderer
=
self
.
renderer
...
...
@@ -1052,13 +1023,13 @@ class OpenAIServing:
return
conversation
,
[
engine_prompt
]
def
_extract_prompt_components
(
self
,
prompt
:
object
):
def _extract_prompt_components(self, prompt: PromptType | ProcessorInputs):
    """Return `extract_prompt_components` applied to this server's model config and `prompt`."""
    return extract_prompt_components(
        self.model_config,
        prompt,
    )
def
_extract_prompt_text
(
self
,
prompt
:
object
):
def
_extract_prompt_text
(
self
,
prompt
:
ProcessorInputs
):
return
self
.
_extract_prompt_components
(
prompt
).
text
def
_extract_prompt_len
(
self
,
prompt
:
object
):
def _extract_prompt_len(self, prompt: ProcessorInputs):
    """Return `extract_prompt_len` applied to this server's model config and `prompt`."""
    return extract_prompt_len(
        self.model_config,
        prompt,
    )
async
def
_render_next_turn
(
...
...
@@ -1088,16 +1059,14 @@ class OpenAIServing:
async
def
_generate_with_builtin_tools
(
self
,
request_id
:
str
,
engine_prompt
:
TokPrompt
,
engine_prompt
:
ProcessorInputs
,
sampling_params
:
SamplingParams
,
tok_params
:
TokenizeParams
,
context
:
ConversationContext
,
lora_request
:
LoRARequest
|
None
=
None
,
priority
:
int
=
0
,
trace_headers
:
Mapping
[
str
,
str
]
|
None
=
None
,
):
max_model_len
=
self
.
model_config
.
max_model_len
prompt_text
=
self
.
_extract_prompt_text
(
engine_prompt
)
orig_priority
=
priority
sub_request
=
0
...
...
@@ -1112,26 +1081,13 @@ class OpenAIServing:
lora_request
=
lora_request
,
)
tokenization_kwargs
=
tok_params
.
get_encode_kwargs
()
engine_request
=
self
.
input_processor
.
process_inputs
(
sub_request_id
,
engine_prompt
,
sampling_params
,
lora_request
=
lora_request
,
tokenization_kwargs
=
tokenization_kwargs
,
trace_headers
=
trace_headers
,
priority
=
priority
,
)
generator
=
self
.
engine_client
.
generate
(
engine_
reques
t
,
engine_
promp
t
,
sampling_params
,
sub_request_id
,
lora_request
=
lora_request
,
trace_headers
=
trace_headers
,
priority
=
priority
,
prompt_text
=
prompt_text
,
tokenization_kwargs
=
tokenization_kwargs
,
)
async
for
res
in
generator
:
...
...
@@ -1154,11 +1110,11 @@ class OpenAIServing:
# Render the next prompt token ids and update sampling_params.
if
isinstance
(
context
,
(
HarmonyContext
,
StreamingHarmonyContext
)):
token_ids
=
context
.
render_for_completion
()
engine_prompt
=
TokensPrompt
(
prompt_
token_i
ds
=
token_ids
)
engine_prompt
=
token_i
nputs
(
token_ids
)
sampling_params
.
max_tokens
=
max_model_len
-
len
(
token_ids
)
elif
isinstance
(
context
,
ParsableContext
):
engine_prompt
s
=
await
self
.
_render_next_turn
(
(
engine_prompt
,)
=
await
self
.
_render_next_turn
(
context
.
request
,
context
.
parser
.
response_messages
,
context
.
tool_dicts
,
...
...
@@ -1166,8 +1122,6 @@ class OpenAIServing:
context
.
chat_template
,
context
.
chat_template_content_format
,
)
engine_prompt
=
engine_prompts
[
0
]
prompt_text
=
self
.
_extract_prompt_text
(
engine_prompt
)
sampling_params
.
max_tokens
=
get_max_tokens
(
max_model_len
,
...
...
@@ -1184,7 +1138,7 @@ class OpenAIServing:
def
_log_inputs
(
self
,
request_id
:
str
,
inputs
:
PromptType
|
TokPrompt
,
inputs
:
PromptType
|
ProcessorInputs
,
params
:
SamplingParams
|
PoolingParams
|
BeamSearchParams
|
None
,
lora_request
:
LoRARequest
|
None
,
)
->
None
:
...
...
vllm/entrypoints/openai/realtime/serving.py
View file @
574fe752
...
...
@@ -15,6 +15,7 @@ from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from
vllm.inputs.data
import
PromptType
from
vllm.logger
import
init_logger
from
vllm.model_executor.models.interfaces
import
SupportsRealtime
from
vllm.renderers.inputs.preprocess
import
parse_model_prompt
logger
=
init_logger
(
__name__
)
...
...
@@ -70,15 +71,20 @@ class OpenAIServingRealtime(OpenAIServing):
Yields:
StreamingInput objects containing audio prompts for the engine
"""
model_config
=
self
.
model_config
renderer
=
self
.
renderer
# mypy is being stupid
# TODO(Patrick) - fix this
stream_input_iter
=
cast
(
AsyncGenerator
[
PromptType
,
None
],
self
.
model_cls
.
buffer_realtime_audio
(
audio_stream
,
input_stream
,
self
.
model_config
audio_stream
,
input_stream
,
model_config
),
)
async
for
prompt
in
stream_input_iter
:
yield
StreamingInput
(
prompt
=
prompt
)
parsed_prompt
=
parse_model_prompt
(
model_config
,
prompt
)
(
engine_prompt
,)
=
await
renderer
.
render_cmpl_async
([
parsed_prompt
])
yield
StreamingInput
(
prompt
=
engine_prompt
)
vllm/entrypoints/openai/responses/context.py
View file @
574fe752
...
...
@@ -9,7 +9,7 @@ from abc import ABC, abstractmethod
from
collections.abc
import
Callable
from
contextlib
import
AsyncExitStack
from
dataclasses
import
replace
from
typing
import
TYPE_CHECKING
,
Union
from
typing
import
TYPE_CHECKING
,
Final
,
Union
from
openai.types.responses.response_function_tool_call_output_item
import
(
ResponseFunctionToolCallOutputItem
,
...
...
@@ -304,7 +304,7 @@ class ParsableContext(ConversationContext):
self
.
tool_dicts
=
construct_tool_dicts
(
request
.
tools
,
request
.
tool_choice
)
self
.
chat_template
=
chat_template
self
.
chat_template_content_format
=
chat_template_content_format
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
self
.
input_messages
:
list
[
ResponseRawMessageAndToken
]
=
[]
self
.
output_messages
:
list
[
ResponseRawMessageAndToken
]
=
[]
...
...
vllm/entrypoints/openai/responses/serving.py
View file @
574fe752
...
...
@@ -116,13 +116,12 @@ from vllm.entrypoints.openai.responses.utils import (
)
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs.data
import
TokensPrompt
from
vllm.inputs.data
import
ProcessorInputs
,
token_inputs
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
as
SampleLogprob
from
vllm.logprobs
import
SampleLogprobs
from
vllm.outputs
import
CompletionOutput
from
vllm.parser
import
ParserManager
from
vllm.renderers.inputs
import
TokPrompt
from
vllm.sampling_params
import
SamplingParams
,
StructuredOutputsParams
from
vllm.tokenizers
import
TokenizerLike
from
vllm.utils
import
random_uuid
...
...
@@ -298,7 +297,7 @@ class OpenAIServingResponses(OpenAIServing):
def
_validate_generator_input
(
self
,
engine_prompt
:
TokPrompt
,
engine_prompt
:
ProcessorInputs
,
)
->
ErrorResponse
|
None
:
"""Add validations to the input to the generator here."""
prompt_len
=
self
.
_extract_prompt_len
(
engine_prompt
)
...
...
@@ -458,7 +457,6 @@ class OpenAIServingResponses(OpenAIServing):
sampling_params
=
request
.
to_sampling_params
(
default_max_tokens
,
self
.
default_sampling_params
)
tok_params
=
request
.
build_tok_params
(
self
.
model_config
)
trace_headers
=
(
None
...
...
@@ -512,7 +510,6 @@ class OpenAIServingResponses(OpenAIServing):
request_id
=
request
.
request_id
,
engine_prompt
=
engine_prompt
,
sampling_params
=
sampling_params
,
tok_params
=
tok_params
,
context
=
context
,
lora_request
=
lora_request
,
priority
=
request
.
priority
,
...
...
@@ -647,7 +644,7 @@ class OpenAIServingResponses(OpenAIServing):
messages
=
self
.
_construct_input_messages_with_harmony
(
request
,
prev_response
)
prompt_token_ids
=
render_for_completion
(
messages
)
engine_prompt
=
TokensPrompt
(
prompt_
token_i
ds
=
prompt_token_ids
)
engine_prompt
=
token_i
nputs
(
prompt_token_ids
)
# Add cache_salt if provided in the request
if
request
.
cache_salt
is
not
None
:
...
...
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
View file @
574fe752
...
...
@@ -36,14 +36,15 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
TranslationSegment
,
TranslationStreamResponse
,
)
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs
.data
import
Pro
mptType
from
vllm.inputs
import
Pro
cessorInputs
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
FlatLogprobs
,
Logprob
from
vllm.model_executor.models
import
SupportsTranscription
,
supports_transcription
from
vllm.outputs
import
RequestOutput
from
vllm.renderers.inputs
import
EncoderDecoderDictPrompt
from
vllm.renderers.inputs.preprocess
import
parse_enc_dec_prompt
from
vllm.renderers.inputs
import
DictPrompt
,
EncoderDecoderDictPrompt
from
vllm.renderers.inputs.preprocess
import
parse_enc_dec_prompt
,
parse_model_prompt
from
vllm.tokenizers
import
get_tokenizer
from
vllm.utils.import_utils
import
PlaceholderModule
...
...
@@ -202,8 +203,6 @@ class OpenAISpeechToText(OpenAIServing):
return
try
:
from
vllm.sampling_params
import
SamplingParams
warmup_start
=
time
.
perf_counter
()
logger
.
info
(
"Warming up multimodal input processor..."
)
...
...
@@ -221,21 +220,11 @@ class OpenAISpeechToText(OpenAIServing):
request_prompt
=
""
,
to_language
=
None
,
)
# Create minimal sampling params
dummy_params
=
SamplingParams
(
max_tokens
=
1
,
temperature
=
0.0
,
skip_clone
=
True
,
# Internal warmup, safe to skip clone
)
parsed_prompt
=
parse_model_prompt
(
self
.
model_config
,
dummy_prompt
)
# Process the dummy input through the input processor
# This will trigger all the multimodal processing initialization
_
=
self
.
input_processor
.
process_inputs
(
request_id
=
"warmup"
,
prompt
=
dummy_prompt
,
params
=
dummy_params
,
)
_
=
self
.
renderer
.
render_cmpl
([
parsed_prompt
])
warmup_elapsed
=
time
.
perf_counter
()
-
warmup_start
logger
.
info
(
"Input processor warmup completed in %.2fs"
,
warmup_elapsed
)
...
...
@@ -257,7 +246,7 @@ class OpenAISpeechToText(OpenAIServing):
self
,
request
:
SpeechToTextRequest
,
audio_data
:
bytes
,
)
->
tuple
[
list
[
Pro
mptType
],
float
]:
)
->
tuple
[
list
[
Pro
cessorInputs
],
float
]:
# Validate request
language
=
self
.
model_cls
.
validate_language
(
request
.
language
)
# Skip to_language validation to avoid extra logging for Whisper.
...
...
@@ -285,7 +274,7 @@ class OpenAISpeechToText(OpenAIServing):
and
duration
>
self
.
asr_config
.
max_audio_clip_s
)
chunks
=
[
y
]
if
not
do_split_audio
else
self
.
_split_audio
(
y
,
int
(
sr
))
p
rompts
=
[]
p
arsed_prompts
:
list
[
DictPrompt
]
=
[]
for
chunk
in
chunks
:
# The model has control over the construction, as long as it
# returns a valid PromptType.
...
...
@@ -298,12 +287,19 @@ class OpenAISpeechToText(OpenAIServing):
request_prompt
=
request
.
prompt
,
to_language
=
to_language
,
)
parsed_prompt
:
DictPrompt
if
request
.
response_format
==
"verbose_json"
:
prompt
=
self
.
_preprocess_verbose_prompt
(
parse_enc_dec_prompt
(
prompt
))
parsed_prompt
=
parse_enc_dec_prompt
(
prompt
)
parsed_prompt
=
self
.
_preprocess_verbose_prompt
(
parsed_prompt
)
else
:
parsed_prompt
=
parse_model_prompt
(
self
.
model_config
,
prompt
)
parsed_prompts
.
append
(
parsed_prompt
)
prompts
.
append
(
prompt
)
engine_prompts
=
await
self
.
renderer
.
render_cmpl_async
(
parsed_
prompt
s
)
return
prompts
,
duration
return
engine_
prompts
,
duration
def
_preprocess_verbose_prompt
(
self
,
prompt
:
EncoderDecoderDictPrompt
):
dec_prompt
=
prompt
[
"decoder_prompt"
]
...
...
@@ -436,7 +432,7 @@ class OpenAISpeechToText(OpenAIServing):
try
:
lora_request
=
self
.
_maybe_get_adapters
(
request
)
prompts
,
duration_s
=
await
self
.
_preprocess_speech_to_text
(
engine_
prompts
,
duration_s
=
await
self
.
_preprocess_speech_to_text
(
request
=
request
,
audio_data
=
audio_data
,
)
...
...
@@ -445,57 +441,54 @@ class OpenAISpeechToText(OpenAIServing):
logger
.
exception
(
"Error in preprocessing prompt inputs"
)
return
self
.
create_error_response
(
e
)
# Schedule the request and get the result generator.
max_model_len
=
self
.
model_config
.
max_model_len
list_result_generator
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
|
None
=
None
try
:
# Unlike most decoder-only models, whisper generation length is not
# constrained by the size of the input audio, which is mapped to a
# fixed-size log-mel-spectrogram. Still, allow for fewer tokens to be
# generated by respecting the extra completion tokens arg.
if
request
.
max_completion_tokens
is
None
:
default_max_tokens
=
self
.
model_config
.
max_model_len
else
:
default_max_tokens
=
min
(
self
.
model_config
.
max_model_len
,
request
.
max_completion_tokens
)
max_tokens
=
get_max_tokens
(
max_model_len
,
request
.
max_completion_tokens
,
0
,
self
.
default_sampling_params
,
)
sampling_params
=
request
.
to_sampling_params
(
default_max_tokens
,
self
.
default_sampling_params
max_tokens
,
self
.
default_sampling_params
,
)
if
request
.
response_format
==
"verbose_json"
:
sampling_params
.
logprobs
=
1
self
.
_log_inputs
(
request_id
,
# It will not display special tokens like <|startoftranscript|>
request
.
prompt
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
trace_headers
=
(
None
if
raw_request
is
None
else
await
self
.
_get_trace_headers
(
raw_request
.
headers
)
)
list_result_generator
=
[]
for
i
,
prompt
in
enumerate
(
prompts
):
for
i
,
engine_
prompt
in
enumerate
(
engine_
prompts
):
request_id_item
=
f
"
{
request_id
}
_
{
i
}
"
engine_request
=
self
.
input_processor
.
process_inputs
(
self
.
_log_inputs
(
request_id_item
,
prompt
,
engine_prompt
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
trace_headers
=
(
None
if
raw_request
is
None
else
await
self
.
_get_trace_headers
(
raw_request
.
headers
)
)
generator
=
self
.
engine_client
.
generate
(
engine_prompt
,
sampling_params
,
request_id_item
,
lora_request
=
lora_request
,
trace_headers
=
trace_headers
,
priority
=
0
,
)
list_result_generator
.
append
(
self
.
engine_client
.
generate
(
engine_request
,
sampling_params
,
request_id_item
,
lora_request
=
lora_request
,
)
)
list_result_generator
.
append
(
generator
)
except
ValueError
as
e
:
return
self
.
create_error_response
(
e
)
...
...
vllm/entrypoints/pooling/embed/serving.py
View file @
574fe752
...
...
@@ -28,11 +28,10 @@ from vllm.entrypoints.pooling.utils import (
encode_pooling_output_base64
,
encode_pooling_output_float
,
)
from
vllm.inputs.data
import
TokensPrompt
from
vllm.inputs.data
import
ProcessorInputs
,
TokensPrompt
,
token_inputs
from
vllm.logger
import
init_logger
from
vllm.outputs
import
PoolingOutput
,
PoolingRequestOutput
from
vllm.pooling_params
import
PoolingParams
from
vllm.renderers.inputs
import
TokPrompt
from
vllm.utils.async_utils
import
merge_async_iterators
from
vllm.utils.collection_utils
import
chunk_list
from
vllm.utils.serial_utils
import
EmbedDType
,
Endianness
...
...
@@ -256,7 +255,7 @@ class OpenAIServingEmbedding(OpenAIServing):
chunk_request_id
=
f
"
{
ctx
.
request_id
}
-prompt-
{
prompt_idx
}
-chunk-
{
chunk_idx
}
"
# Create engine prompt for this chunk
chunk_engine_prompt
=
TokensPrompt
(
prompt_
token_i
ds
=
chunk_tokens
)
chunk_engine_prompt
=
token_i
nputs
(
chunk_tokens
)
# Log the chunk
self
.
_log_inputs
(
...
...
@@ -266,16 +265,12 @@ class OpenAIServingEmbedding(OpenAIServing):
lora_request
=
ctx
.
lora_request
,
)
tok_params
=
ctx
.
request
.
build_tok_params
(
self
.
model_config
)
tokenization_kwargs
=
tok_params
.
get_encode_kwargs
()
# Create generator for this chunk and wrap it to return indices
original_generator
=
self
.
engine_client
.
encode
(
chunk_engine_prompt
,
pooling_params
,
chunk_request_id
,
lora_request
=
ctx
.
lora_request
,
tokenization_kwargs
=
tokenization_kwargs
,
trace_headers
=
trace_headers
,
priority
=
ctx
.
request
.
priority
,
)
...
...
@@ -362,7 +357,7 @@ class OpenAIServingEmbedding(OpenAIServing):
async
def
_create_single_prompt_generator
(
self
,
ctx
:
EmbeddingServeContext
,
engine_prompt
:
TokPrompt
,
engine_prompt
:
ProcessorInputs
,
pooling_params
:
PoolingParams
,
trace_headers
:
Mapping
[
str
,
str
]
|
None
,
prompt_index
:
int
,
...
...
@@ -377,16 +372,12 @@ class OpenAIServingEmbedding(OpenAIServing):
lora_request
=
ctx
.
lora_request
,
)
tok_params
=
ctx
.
request
.
build_tok_params
(
self
.
model_config
)
tokenization_kwargs
=
tok_params
.
get_encode_kwargs
()
# Return the original generator without wrapping
return
self
.
engine_client
.
encode
(
engine_prompt
,
pooling_params
,
request_id_item
,
lora_request
=
ctx
.
lora_request
,
tokenization_kwargs
=
tokenization_kwargs
,
trace_headers
=
trace_headers
,
priority
=
ctx
.
request
.
priority
,
)
...
...
vllm/entrypoints/pooling/pooling/serving.py
View file @
574fe752
...
...
@@ -33,10 +33,9 @@ from vllm.entrypoints.pooling.utils import (
encode_pooling_output_base64
,
encode_pooling_output_float
,
)
from
vllm.inputs
import
Pro
mptType
from
vllm.inputs
import
Pro
cessorInputs
from
vllm.logger
import
init_logger
from
vllm.outputs
import
PoolingRequestOutput
from
vllm.renderers.inputs
import
TokPrompt
from
vllm.renderers.inputs.preprocess
import
prompt_to_seq
from
vllm.utils.async_utils
import
merge_async_iterators
from
vllm.utils.serial_utils
import
EmbedDType
,
EncodingFormat
,
Endianness
...
...
@@ -93,7 +92,7 @@ class OpenAIServingPooling(OpenAIServing):
"dimensions is currently not supported"
)
engine_prompts
:
Sequence
[
Pro
mptType
|
TokPrompt
]
engine_prompts
:
Sequence
[
Pro
cessorInputs
]
if
use_io_processor
:
=
isinstance
(
request
,
IOProcessorRequest
):
if
self
.
io_processor
is
None
:
raise
ValueError
(
...
...
@@ -152,9 +151,6 @@ class OpenAIServingPooling(OpenAIServing):
else
:
pooling_params
=
request
.
to_pooling_params
()
# type: ignore
tok_params
=
request
.
build_tok_params
(
self
.
model_config
)
tokenization_kwargs
=
tok_params
.
get_encode_kwargs
()
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
request_id_item
=
f
"
{
request_id
}
-
{
i
}
"
...
...
@@ -176,7 +172,6 @@ class OpenAIServingPooling(OpenAIServing):
pooling_params
,
request_id_item
,
lora_request
=
lora_request
,
tokenization_kwargs
=
tokenization_kwargs
,
trace_headers
=
trace_headers
,
priority
=
request
.
priority
,
)
...
...
vllm/entrypoints/pooling/score/serving.py
View file @
574fe752
...
...
@@ -35,7 +35,7 @@ from vllm.entrypoints.pooling.score.utils import (
get_score_prompt
,
validate_score_input
,
)
from
vllm.inputs.data
import
TokensPrompt
from
vllm.inputs.data
import
ProcessorInputs
,
TokensPrompt
,
token_inputs
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.outputs
import
PoolingRequestOutput
,
ScoringRequestOutput
...
...
@@ -108,12 +108,15 @@ class ServingScores(OpenAIServing):
*
(
encode_async
(
t
,
**
tokenization_kwargs
)
for
t
in
input_texts
)
)
engine_prompts
:
list
[
TokensPrompt
]
=
[]
engine_prompts
:
list
[
ProcessorInputs
]
=
[]
for
tok_result
,
input_text
in
zip
(
tokenized_prompts
,
input_texts
):
text_token_prompt
=
self
.
_validate_input
(
request
,
tok_result
,
input_text
)
engine_prompts
.
append
(
TokensPrompt
(
prompt_token_ids
=
text_token_prompt
[
"prompt_token_ids"
])
token_inputs
(
text_token_prompt
[
"prompt_token_ids"
],
prompt
=
input_text
,
)
)
# Schedule the request and get the result generator.
...
...
@@ -125,7 +128,7 @@ class ServingScores(OpenAIServing):
self
.
_log_inputs
(
request_id_item
,
input_texts
[
i
]
,
engine_prompt
,
params
=
pooling_params
,
lora_request
=
lora_request
,
)
...
...
@@ -207,12 +210,15 @@ class ServingScores(OpenAIServing):
*
(
encode_async
(
t
,
**
tokenization_kwargs
)
for
t
in
input_texts
)
)
engine_prompts
:
list
[
TokensPrompt
]
=
[]
engine_prompts
:
list
[
ProcessorInputs
]
=
[]
for
tok_result
,
input_text
in
zip
(
tokenized_prompts
,
input_texts
):
text_token_prompt
=
self
.
_validate_input
(
request
,
tok_result
,
input_text
)
engine_prompts
.
append
(
TokensPrompt
(
prompt_token_ids
=
text_token_prompt
[
"prompt_token_ids"
])
token_inputs
(
text_token_prompt
[
"prompt_token_ids"
],
prompt
=
input_text
,
)
)
# Schedule the request and get the result generator.
...
...
@@ -225,7 +231,7 @@ class ServingScores(OpenAIServing):
self
.
_log_inputs
(
request_id_item
,
input_texts
[
i
]
,
engine_prompt
,
params
=
pooling_params
,
lora_request
=
lora_request
,
)
...
...
vllm/entrypoints/serve/disagg/serving.py
View file @
574fe752
...
...
@@ -29,7 +29,6 @@ from vllm.entrypoints.serve.disagg.protocol import (
GenerateResponse
,
GenerateResponseChoice
,
)
from
vllm.inputs.data
import
TokensPrompt
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
RequestOutput
...
...
@@ -116,7 +115,7 @@ class ServingTokens(OpenAIServing):
self
.
_log_inputs
(
request_id
,
TokensPrompt
(
prompt_token_ids
=
request
.
token_ids
)
,
engine_prompt
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
...
...
@@ -127,27 +126,13 @@ class ServingTokens(OpenAIServing):
else
await
self
.
_get_trace_headers
(
raw_request
.
headers
)
)
tok_params
=
request
.
build_tok_params
(
self
.
model_config
)
tokenization_kwargs
=
tok_params
.
get_encode_kwargs
()
engine_request
=
self
.
input_processor
.
process_inputs
(
request_id
,
engine_prompt
,
sampling_params
,
lora_request
=
lora_request
,
tokenization_kwargs
=
tokenization_kwargs
,
trace_headers
=
trace_headers
,
priority
=
request
.
priority
,
)
result_generator
=
self
.
engine_client
.
generate
(
engine_
reques
t
,
engine_
promp
t
,
sampling_params
,
request_id
,
lora_request
=
lora_request
,
trace_headers
=
trace_headers
,
priority
=
request
.
priority
,
tokenization_kwargs
=
tokenization_kwargs
,
)
except
ValueError
as
e
:
...
...
vllm/entrypoints/serve/tokenize/serving.py
View file @
574fe752
...
...
@@ -20,7 +20,7 @@ from vllm.entrypoints.serve.tokenize.protocol import (
TokenizeResponse
,
TokenizerInfoResponse
,
)
from
vllm.inputs
import
TokensPrompt
from
vllm.inputs
import
TokensPrompt
,
token_inputs
from
vllm.logger
import
init_logger
from
vllm.tokenizers
import
TokenizerLike
...
...
@@ -135,7 +135,7 @@ class OpenAIServingTokenization(OpenAIServing):
self
.
_log_inputs
(
request_id
,
TokensPrompt
(
prompt_
token_i
ds
=
request
.
tokens
),
token_i
nputs
(
request
.
tokens
),
params
=
None
,
lora_request
=
lora_request
,
)
...
...
vllm/inputs/data.py
View file @
574fe752
...
...
@@ -187,6 +187,9 @@ class _InputOptions(TypedDict):
Additional options available to all input types.
"""
arrival_time
:
NotRequired
[
float
]
"""The time when the input was received (before rendering)."""
cache_salt
:
NotRequired
[
str
]
"""Optional cache salt to be used for prefix caching."""
...
...
@@ -300,6 +303,9 @@ class EncoderDecoderInputs(TypedDict):
decoder_prompt
:
DecoderInputs
"""The inputs for the decoder portion."""
arrival_time
:
NotRequired
[
float
]
"""The time when the input was received (before rendering)."""
ProcessorInputs
:
TypeAlias
=
DecoderOnlyInputs
|
EncoderDecoderInputs
"""
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment