Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ba2f0acc
Unverified
Commit
ba2f0acc
authored
Mar 26, 2026
by
Cyrus Leung
Committed by
GitHub
Mar 25, 2026
Browse files
[Misc] Reorganize inputs (#35182)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
678b3c99
Changes
141
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
260 additions
and
273 deletions
+260
-273
vllm/entrypoints/anthropic/serving.py
vllm/entrypoints/anthropic/serving.py
+4
-4
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+2
-1
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+22
-22
vllm/entrypoints/openai/chat_completion/serving.py
vllm/entrypoints/openai/chat_completion/serving.py
+11
-11
vllm/entrypoints/openai/completion/serving.py
vllm/entrypoints/openai/completion/serving.py
+15
-17
vllm/entrypoints/openai/engine/serving.py
vllm/entrypoints/openai/engine/serving.py
+13
-17
vllm/entrypoints/openai/realtime/serving.py
vllm/entrypoints/openai/realtime/serving.py
+3
-3
vllm/entrypoints/openai/responses/serving.py
vllm/entrypoints/openai/responses/serving.py
+22
-26
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
+17
-16
vllm/entrypoints/pooling/base/io_processor.py
vllm/entrypoints/pooling/base/io_processor.py
+10
-10
vllm/entrypoints/pooling/base/serving.py
vllm/entrypoints/pooling/base/serving.py
+8
-8
vllm/entrypoints/pooling/embed/io_processor.py
vllm/entrypoints/pooling/embed/io_processor.py
+20
-20
vllm/entrypoints/pooling/pooling/serving.py
vllm/entrypoints/pooling/pooling/serving.py
+9
-9
vllm/entrypoints/pooling/score/serving.py
vllm/entrypoints/pooling/score/serving.py
+36
-45
vllm/entrypoints/pooling/score/utils.py
vllm/entrypoints/pooling/score/utils.py
+7
-3
vllm/entrypoints/pooling/typing.py
vllm/entrypoints/pooling/typing.py
+2
-2
vllm/entrypoints/serve/disagg/protocol.py
vllm/entrypoints/serve/disagg/protocol.py
+9
-8
vllm/entrypoints/serve/disagg/serving.py
vllm/entrypoints/serve/disagg/serving.py
+3
-5
vllm/entrypoints/serve/render/serving.py
vllm/entrypoints/serve/render/serving.py
+38
-37
vllm/entrypoints/serve/tokenize/serving.py
vllm/entrypoints/serve/tokenize/serving.py
+9
-9
No files found.
vllm/entrypoints/anthropic/serving.py
View file @
ba2f0acc
...
...
@@ -797,12 +797,12 @@ class AnthropicServingMessages(OpenAIServingChat):
if
isinstance
(
result
,
ErrorResponse
):
return
result
_
,
engine_
promp
ts
=
result
_
,
engine_
inpu
ts
=
result
input_tokens
=
sum
(
# type: ignore
len
(
promp
t
[
"prompt_token_ids"
])
# type: ignore[typeddict-item, misc]
for
promp
t
in
engine_
promp
ts
if
"prompt_token_ids"
in
promp
t
len
(
engine_inpu
t
[
"prompt_token_ids"
])
# type: ignore[typeddict-item, misc]
for
engine_inpu
t
in
engine_
inpu
ts
if
"prompt_token_ids"
in
engine_inpu
t
)
response
=
AnthropicCountTokensResponse
(
...
...
vllm/entrypoints/chat_utils.py
View file @
ba2f0acc
...
...
@@ -40,9 +40,10 @@ from typing_extensions import Required, TypedDict
from
vllm
import
envs
from
vllm.config
import
ModelConfig
from
vllm.inputs
import
MultiModalDataDict
,
MultiModalUUIDDict
from
vllm.logger
import
init_logger
from
vllm.model_executor.models
import
SupportsMultiModal
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalDataDict
,
MultiModalUUIDDict
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalBatchedField
,
MultiModalFlatField
,
...
...
vllm/entrypoints/llm.py
View file @
ba2f0acc
...
...
@@ -57,9 +57,9 @@ from vllm.entrypoints.pooling.score.utils import (
validate_score_input
,
)
from
vllm.entrypoints.utils
import
log_non_default_args
from
vllm.inputs
.data
import
(
from
vllm.inputs
import
(
DataPrompt
,
Processor
Input
s
,
Engine
Input
,
PromptType
,
SingletonPrompt
,
TextPrompt
,
...
...
@@ -589,7 +589,7 @@ class LLM:
def
_resolve_mm_lora
(
self
,
prompt
:
Processor
Input
s
,
prompt
:
Engine
Input
,
lora_request
:
LoRARequest
|
None
,
)
->
LoRARequest
|
None
:
if
prompt
[
"type"
]
!=
"multimodal"
:
...
...
@@ -716,8 +716,8 @@ class LLM:
eos_token_id
=
tokenizer
.
eos_token_id
sort_beams_key
=
create_sort_beams_key_function
(
eos_token_id
,
length_penalty
)
engine_
promp
ts
=
self
.
_preprocess_cmpl
(
prompts
)
lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
len
(
engine_
promp
ts
))
engine_
inpu
ts
=
self
.
_preprocess_cmpl
(
prompts
)
lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
len
(
engine_
inpu
ts
))
if
use_tqdm
and
concurrency_limit
is
not
None
:
logger
.
warning
(
...
...
@@ -727,7 +727,7 @@ class LLM:
use_tqdm
=
False
if
concurrency_limit
is
None
:
concurrency_limit
=
len
(
engine_
promp
ts
)
concurrency_limit
=
len
(
engine_
inpu
ts
)
# generate 2 * beam_width candidates at each step
# following the huggingface transformers implementation
...
...
@@ -740,7 +740,7 @@ class LLM:
)
instances
:
list
[
BeamSearchInstance
]
=
[]
for
lora_req
,
prompt
in
zip
(
lora_requests
,
engine_
promp
ts
):
for
lora_req
,
prompt
in
zip
(
lora_requests
,
engine_
inpu
ts
):
if
prompt
[
"type"
]
==
"embeds"
:
raise
NotImplementedError
(
"Embedding prompt not supported for beam search"
...
...
@@ -845,7 +845,7 @@ class LLM:
self
,
prompts
:
Sequence
[
PromptType
],
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Sequence
[
Processor
Input
s
]:
)
->
Sequence
[
Engine
Input
]:
"""
Convert prompt inputs from LLM APIs (other than [LLM.chat][]) into
a format that can be passed to `_add_request`.
...
...
@@ -853,7 +853,7 @@ class LLM:
Refer to [LLM.generate][] for a complete description of the arguments.
Returns:
A list of `
Processor
Input
s
` objects ready to be passed into LLMEngine.
A list of `
Engine
Input` objects ready to be passed into LLMEngine.
"""
renderer
=
self
.
renderer
model_config
=
self
.
model_config
...
...
@@ -871,9 +871,9 @@ class LLM:
self
,
prompt
:
PromptType
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Processor
Input
s
:
(
engine_
promp
t
,)
=
self
.
_preprocess_cmpl
([
prompt
],
tokenization_kwargs
)
return
engine_
promp
t
)
->
Engine
Input
:
(
engine_
inpu
t
,)
=
self
.
_preprocess_cmpl
([
prompt
],
tokenization_kwargs
)
return
engine_
inpu
t
def
_preprocess_chat
(
self
,
...
...
@@ -886,7 +886,7 @@ class LLM:
tools
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Sequence
[
Processor
Input
s
]:
)
->
Sequence
[
Engine
Input
]:
"""
Convert a list of conversations into prompts so that they can then
be used as input for other LLM APIs.
...
...
@@ -894,7 +894,7 @@ class LLM:
Refer to [LLM.chat][] for a complete description of the arguments.
Returns:
A list of `
Processor
Input
s
` objects ready to be passed into LLMEngine.
A list of `
Engine
Input` objects ready to be passed into LLMEngine.
"""
renderer
=
self
.
renderer
...
...
@@ -915,14 +915,14 @@ class LLM:
**
(
tokenization_kwargs
or
{})
)
_
,
engine_
promp
ts
=
renderer
.
render_chat
(
_
,
engine_
inpu
ts
=
renderer
.
render_chat
(
conversations
,
chat_params
,
tok_params
,
prompt_extras
=
{
"mm_processor_kwargs"
:
mm_processor_kwargs
},
)
return
engine_
promp
ts
return
engine_
inpu
ts
def
_preprocess_chat_one
(
self
,
...
...
@@ -935,8 +935,8 @@ class LLM:
tools
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Processor
Input
s
:
(
engine_
promp
t
,)
=
self
.
_preprocess_chat
(
)
->
Engine
Input
:
(
engine_
inpu
t
,)
=
self
.
_preprocess_chat
(
[
conversation
],
chat_template
=
chat_template
,
chat_template_content_format
=
chat_template_content_format
,
...
...
@@ -948,7 +948,7 @@ class LLM:
mm_processor_kwargs
=
mm_processor_kwargs
,
)
return
engine_
promp
t
return
engine_
inpu
t
def
chat
(
self
,
...
...
@@ -1909,7 +1909,7 @@ class LLM:
def
_render_and_run_requests
(
self
,
prompts
:
Iterable
[
Processor
Input
s
],
prompts
:
Iterable
[
Engine
Input
],
params
:
Sequence
[
SamplingParams
|
PoolingParams
],
output_type
:
type
[
_O
],
*
,
...
...
@@ -1938,7 +1938,7 @@ class LLM:
def
_render_and_add_requests
(
self
,
prompts
:
Iterable
[
Processor
Input
s
],
prompts
:
Iterable
[
Engine
Input
],
params
:
Sequence
[
SamplingParams
|
PoolingParams
],
*
,
lora_requests
:
Sequence
[
LoRARequest
|
None
]
|
None
=
None
,
...
...
@@ -1967,7 +1967,7 @@ class LLM:
def
_add_request
(
self
,
prompt
:
Processor
Input
s
,
prompt
:
Engine
Input
,
params
:
SamplingParams
|
PoolingParams
,
lora_request
:
LoRARequest
|
None
=
None
,
priority
:
int
=
0
,
...
...
vllm/entrypoints/openai/chat_completion/serving.py
View file @
ba2f0acc
...
...
@@ -63,7 +63,7 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
)
from
vllm.entrypoints.openai.utils
import
maybe_filter_parallel_tool_calls
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.inputs
.data
import
Processor
Input
s
from
vllm.inputs
import
Engine
Input
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
...
...
@@ -177,7 +177,7 @@ class OpenAIServingChat(OpenAIServing):
async
def
render_chat_request
(
self
,
request
:
ChatCompletionRequest
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Processor
Input
s
]]
|
ErrorResponse
:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Engine
Input
]]
|
ErrorResponse
:
"""
Validate the model and preprocess a chat completion request.
...
...
@@ -185,7 +185,7 @@ class OpenAIServingChat(OpenAIServing):
engine-aware checks (LoRA model validation, engine health).
Returns:
A tuple of (conversation, engine_
promp
ts) on success,
A tuple of (conversation, engine_
inpu
ts) on success,
or an ErrorResponse on failure.
"""
error_check_ret
=
await
self
.
_check_model
(
request
)
...
...
@@ -231,7 +231,7 @@ class OpenAIServingChat(OpenAIServing):
if
isinstance
(
result
,
ErrorResponse
):
return
result
conversation
,
engine_
promp
ts
=
result
conversation
,
engine_
inpu
ts
=
result
request_id
=
(
f
"chatcmpl-
{
self
.
_base_request_id
(
raw_request
,
request
.
request_id
)
}
"
...
...
@@ -251,13 +251,13 @@ class OpenAIServingChat(OpenAIServing):
# Schedule the request and get the result generator.
max_model_len
=
self
.
model_config
.
max_model_len
generators
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
for
i
,
engine_
promp
t
in
enumerate
(
engine_
promp
ts
):
prompt_token_ids
=
self
.
_extract_prompt_components
(
engine_
promp
t
).
token_ids
for
i
,
engine_
inpu
t
in
enumerate
(
engine_
inpu
ts
):
prompt_token_ids
=
self
.
_extract_prompt_components
(
engine_
inpu
t
).
token_ids
# If we are creating sub requests for multiple prompts, ensure that they
# have unique request ids.
sub_request_id
=
(
request_id
if
len
(
engine_
promp
ts
)
==
1
else
f
"
{
request_id
}
_
{
i
}
"
request_id
if
len
(
engine_
inpu
ts
)
==
1
else
f
"
{
request_id
}
_
{
i
}
"
)
max_tokens
=
get_max_tokens
(
...
...
@@ -265,7 +265,7 @@ class OpenAIServingChat(OpenAIServing):
request
.
max_completion_tokens
if
request
.
max_completion_tokens
is
not
None
else
request
.
max_tokens
,
self
.
_extract_prompt_len
(
engine_
promp
t
),
self
.
_extract_prompt_len
(
engine_
inpu
t
),
self
.
default_sampling_params
,
self
.
override_max_tokens
,
)
...
...
@@ -283,7 +283,7 @@ class OpenAIServingChat(OpenAIServing):
self
.
_log_inputs
(
sub_request_id
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
...
...
@@ -296,7 +296,7 @@ class OpenAIServingChat(OpenAIServing):
if
isinstance
(
sampling_params
,
BeamSearchParams
):
generator
=
self
.
beam_search
(
prompt
=
engine_
promp
t
,
prompt
=
engine_
inpu
t
,
request_id
=
sub_request_id
,
params
=
sampling_params
,
lora_request
=
lora_request
,
...
...
@@ -313,7 +313,7 @@ class OpenAIServingChat(OpenAIServing):
reasoning_ended
=
None
generator
=
self
.
engine_client
.
generate
(
engine_
promp
t
,
engine_
inpu
t
,
sampling_params
,
sub_request_id
,
lora_request
=
lora_request
,
...
...
vllm/entrypoints/openai/completion/serving.py
View file @
ba2f0acc
...
...
@@ -33,7 +33,7 @@ from vllm.entrypoints.openai.engine.serving import (
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs
.data
import
Processor
Input
s
from
vllm.inputs
import
Engine
Input
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
RequestOutput
...
...
@@ -82,7 +82,7 @@ class OpenAIServingCompletion(OpenAIServing):
async
def
render_completion_request
(
self
,
request
:
CompletionRequest
,
)
->
list
[
Processor
Input
s
]
|
ErrorResponse
:
)
->
list
[
Engine
Input
]
|
ErrorResponse
:
"""
Validate the model and preprocess a completion request.
...
...
@@ -90,8 +90,7 @@ class OpenAIServingCompletion(OpenAIServing):
engine-aware checks (LoRA model validation, engine health).
Returns:
A list of engine_prompts on success,
or an ErrorResponse on failure.
A list of engine_inputs on success, or an ErrorResponse on failure.
"""
error_check_ret
=
await
self
.
_check_model
(
request
)
if
error_check_ret
is
not
None
:
...
...
@@ -128,7 +127,7 @@ class OpenAIServingCompletion(OpenAIServing):
if
isinstance
(
result
,
ErrorResponse
):
return
result
engine_
promp
ts
=
result
engine_
inpu
ts
=
result
request_id
=
f
"cmpl-
{
self
.
_base_request_id
(
raw_request
,
request
.
request_id
)
}
"
created_time
=
int
(
time
.
time
())
...
...
@@ -145,11 +144,11 @@ class OpenAIServingCompletion(OpenAIServing):
# Schedule the request and get the result generator.
max_model_len
=
self
.
model_config
.
max_model_len
generators
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
for
i
,
engine_
promp
t
in
enumerate
(
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
engine_
inpu
ts
):
max_tokens
=
get_max_tokens
(
max_model_len
,
request
.
max_tokens
,
self
.
_extract_prompt_len
(
engine_
promp
t
),
self
.
_extract_prompt_len
(
engine_
inpu
t
),
self
.
default_sampling_params
,
self
.
override_max_tokens
,
)
...
...
@@ -169,7 +168,7 @@ class OpenAIServingCompletion(OpenAIServing):
self
.
_log_inputs
(
request_id_item
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
...
...
@@ -182,7 +181,7 @@ class OpenAIServingCompletion(OpenAIServing):
if
isinstance
(
sampling_params
,
BeamSearchParams
):
generator
=
self
.
beam_search
(
prompt
=
engine_
promp
t
,
prompt
=
engine_
inpu
t
,
request_id
=
request_id
,
params
=
sampling_params
,
lora_request
=
lora_request
,
...
...
@@ -190,7 +189,7 @@ class OpenAIServingCompletion(OpenAIServing):
)
else
:
generator
=
self
.
engine_client
.
generate
(
engine_
promp
t
,
engine_
inpu
t
,
sampling_params
,
request_id_item
,
lora_request
=
lora_request
,
...
...
@@ -204,7 +203,7 @@ class OpenAIServingCompletion(OpenAIServing):
result_generator
=
merge_async_iterators
(
*
generators
)
model_name
=
self
.
models
.
model_name
(
lora_request
)
num_prompts
=
len
(
engine_
promp
ts
)
num_prompts
=
len
(
engine_
inpu
ts
)
# Streaming response
tokenizer
=
self
.
renderer
.
tokenizer
...
...
@@ -212,7 +211,7 @@ class OpenAIServingCompletion(OpenAIServing):
if
request
.
stream
:
return
self
.
completion_stream_generator
(
request
,
engine_
promp
ts
,
engine_
inpu
ts
,
result_generator
,
request_id
,
created_time
,
...
...
@@ -235,8 +234,7 @@ class OpenAIServingCompletion(OpenAIServing):
# We did not pass it into vLLM engine to avoid being redundant
# with the inputs token IDs
if
final_res
.
prompt
is
None
:
engine_prompt
=
engine_prompts
[
i
]
final_res
.
prompt
=
self
.
_extract_prompt_text
(
engine_prompt
)
final_res
.
prompt
=
self
.
_extract_prompt_text
(
engine_inputs
[
i
])
final_res_batch_checked
=
cast
(
list
[
RequestOutput
],
final_res_batch
)
...
...
@@ -268,7 +266,7 @@ class OpenAIServingCompletion(OpenAIServing):
async
def
completion_stream_generator
(
self
,
request
:
CompletionRequest
,
engine_
promp
ts
:
list
[
Processor
Input
s
],
engine_
inpu
ts
:
list
[
Engine
Input
],
result_generator
:
AsyncIterator
[
tuple
[
int
,
RequestOutput
]],
request_id
:
str
,
created_time
:
int
,
...
...
@@ -301,8 +299,8 @@ class OpenAIServingCompletion(OpenAIServing):
prompt_text
=
res
.
prompt
if
prompt_text
is
None
:
engine_
promp
t
=
engine_
promp
ts
[
prompt_idx
]
prompt_text
=
self
.
_extract_prompt_text
(
engine_
promp
t
)
engine_
inpu
t
=
engine_
inpu
ts
[
prompt_idx
]
prompt_text
=
self
.
_extract_prompt_text
(
engine_
inpu
t
)
# Prompt details are excluded from later streamed outputs
if
prompt_token_ids
is
not
None
:
...
...
vllm/entrypoints/openai/engine/serving.py
View file @
ba2f0acc
...
...
@@ -72,11 +72,7 @@ from vllm.entrypoints.serve.tokenize.protocol import (
)
from
vllm.entrypoints.utils
import
create_error_response
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs.data
import
(
ProcessorInputs
,
PromptType
,
TokensPrompt
,
)
from
vllm.inputs
import
EngineInput
,
PromptType
,
TokensPrompt
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
,
PromptLogprobs
from
vllm.lora.request
import
LoRARequest
...
...
@@ -163,7 +159,7 @@ class ServeContext(Generic[RequestT]):
request_id
:
str
created_time
:
int
=
field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
lora_request
:
LoRARequest
|
None
=
None
engine_
promp
ts
:
list
[
Processor
Input
s
]
|
None
=
None
engine_
inpu
ts
:
list
[
Engine
Input
]
|
None
=
None
result_generator
:
AsyncGenerator
[
tuple
[
int
,
PoolingRequestOutput
],
None
]
|
None
=
(
None
...
...
@@ -202,7 +198,7 @@ class OpenAIServing:
async
def
beam_search
(
self
,
prompt
:
Processor
Input
s
,
prompt
:
Engine
Input
,
request_id
:
str
,
params
:
BeamSearchParams
,
lora_request
:
LoRARequest
|
None
=
None
,
...
...
@@ -493,21 +489,21 @@ class OpenAIServing:
if
isinstance
(
pooling_params
,
ErrorResponse
):
return
pooling_params
if
ctx
.
engine_
promp
ts
is
None
:
if
ctx
.
engine_
inpu
ts
is
None
:
return
self
.
create_error_response
(
"Engine prompts not available"
)
for
i
,
engine_
promp
t
in
enumerate
(
ctx
.
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
ctx
.
engine_
inpu
ts
):
request_id_item
=
f
"
{
ctx
.
request_id
}
-
{
i
}
"
self
.
_log_inputs
(
request_id_item
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
pooling_params
,
lora_request
=
ctx
.
lora_request
,
)
generator
=
self
.
engine_client
.
encode
(
engine_
promp
t
,
engine_
inpu
t
,
pooling_params
,
request_id_item
,
lora_request
=
ctx
.
lora_request
,
...
...
@@ -526,10 +522,10 @@ class OpenAIServing:
ctx
:
ServeContext
,
)
->
ErrorResponse
|
None
:
"""Collect batch results from the result generator."""
if
ctx
.
engine_
promp
ts
is
None
:
if
ctx
.
engine_
inpu
ts
is
None
:
return
self
.
create_error_response
(
"Engine prompts not available"
)
num_prompts
=
len
(
ctx
.
engine_
promp
ts
)
num_prompts
=
len
(
ctx
.
engine_
inpu
ts
)
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
final_res_batch
=
[
None
]
*
num_prompts
...
...
@@ -806,19 +802,19 @@ class OpenAIServing:
# Apply server defaults first, then request kwargs override.
return
default_chat_template_kwargs
|
request_chat_template_kwargs
def
_extract_prompt_components
(
self
,
prompt
:
PromptType
|
Processor
Input
s
):
def
_extract_prompt_components
(
self
,
prompt
:
PromptType
|
Engine
Input
):
return
extract_prompt_components
(
self
.
model_config
,
prompt
)
def
_extract_prompt_text
(
self
,
prompt
:
Pro
cessor
Input
s
):
def
_extract_prompt_text
(
self
,
prompt
:
Pro
mptType
|
Engine
Input
):
return
self
.
_extract_prompt_components
(
prompt
).
text
def
_extract_prompt_len
(
self
,
prompt
:
Processor
Input
s
):
def
_extract_prompt_len
(
self
,
prompt
:
Engine
Input
):
return
extract_prompt_len
(
self
.
model_config
,
prompt
)
def
_log_inputs
(
self
,
request_id
:
str
,
inputs
:
PromptType
|
Processor
Input
s
,
inputs
:
PromptType
|
Engine
Input
,
params
:
SamplingParams
|
PoolingParams
|
BeamSearchParams
|
None
,
lora_request
:
LoRARequest
|
None
,
)
->
None
:
...
...
vllm/entrypoints/openai/realtime/serving.py
View file @
ba2f0acc
...
...
@@ -12,7 +12,7 @@ from vllm.engine.protocol import EngineClient, StreamingInput
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.engine.serving
import
OpenAIServing
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.inputs
.data
import
PromptType
from
vllm.inputs
import
PromptType
from
vllm.logger
import
init_logger
from
vllm.model_executor.models.interfaces
import
SupportsRealtime
from
vllm.renderers.inputs.preprocess
import
parse_model_prompt
...
...
@@ -83,6 +83,6 @@ class OpenAIServingRealtime(OpenAIServing):
async
for
prompt
in
stream_input_iter
:
parsed_prompt
=
parse_model_prompt
(
model_config
,
prompt
)
(
engine_
promp
t
,)
=
await
renderer
.
render_cmpl_async
([
parsed_prompt
])
(
engine_
inpu
t
,)
=
await
renderer
.
render_cmpl_async
([
parsed_prompt
])
yield
StreamingInput
(
prompt
=
engine_
promp
t
)
yield
StreamingInput
(
prompt
=
engine_
inpu
t
)
vllm/entrypoints/openai/responses/serving.py
View file @
ba2f0acc
...
...
@@ -110,7 +110,7 @@ from vllm.entrypoints.openai.responses.utils import (
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs
.data
import
Processor
Input
s
,
token_input
s
from
vllm.inputs
import
Engine
Input
,
token
s
_input
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
as
SampleLogprob
from
vllm.logprobs
import
SampleLogprobs
...
...
@@ -269,10 +269,10 @@ class OpenAIServingResponses(OpenAIServing):
def
_validate_generator_input
(
self
,
engine_
prompt
:
Processor
Input
s
,
engine_
input
:
Engine
Input
,
)
->
ErrorResponse
|
None
:
"""Add validations to the input to the generator here."""
prompt_len
=
self
.
_extract_prompt_len
(
engine_
promp
t
)
prompt_len
=
self
.
_extract_prompt_len
(
engine_
inpu
t
)
max_model_len
=
self
.
model_config
.
max_model_len
if
prompt_len
>=
max_model_len
:
...
...
@@ -369,11 +369,11 @@ class OpenAIServingResponses(OpenAIServing):
model_name
=
self
.
models
.
model_name
(
lora_request
)
if
self
.
use_harmony
:
messages
,
engine_
promp
ts
=
self
.
_make_request_with_harmony
(
messages
,
engine_
inpu
ts
=
self
.
_make_request_with_harmony
(
request
,
prev_response
)
else
:
messages
,
engine_
promp
ts
=
await
self
.
_make_request
(
request
,
prev_response
)
messages
,
engine_
inpu
ts
=
await
self
.
_make_request
(
request
,
prev_response
)
request_metadata
=
RequestResponseMetadata
(
request_id
=
request
.
request_id
)
if
raw_request
:
...
...
@@ -413,15 +413,15 @@ class OpenAIServingResponses(OpenAIServing):
available_tools
=
[]
tokenizer
=
self
.
renderer
.
get_tokenizer
()
for
engine_
promp
t
in
engine_
promp
ts
:
maybe_error
=
self
.
_validate_generator_input
(
engine_
promp
t
)
for
engine_
inpu
t
in
engine_
inpu
ts
:
maybe_error
=
self
.
_validate_generator_input
(
engine_
inpu
t
)
if
maybe_error
is
not
None
:
return
maybe_error
default_max_tokens
=
get_max_tokens
(
max_model_len
,
request
.
max_output_tokens
,
self
.
_extract_prompt_len
(
engine_
promp
t
),
self
.
_extract_prompt_len
(
engine_
inpu
t
),
self
.
default_sampling_params
,
self
.
override_max_tokens
,
)
...
...
@@ -480,7 +480,7 @@ class OpenAIServingResponses(OpenAIServing):
)
generator
=
self
.
_generate_with_builtin_tools
(
request_id
=
request
.
request_id
,
engine_
promp
t
=
engine_
promp
t
,
engine_
inpu
t
=
engine_
inpu
t
,
sampling_params
=
sampling_params
,
context
=
context
,
lora_request
=
lora_request
,
...
...
@@ -586,7 +586,7 @@ class OpenAIServingResponses(OpenAIServing):
prev_response_output
=
prev_response
.
output
if
prev_response
else
None
,
)
_
,
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
_
,
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
request
,
messages
,
default_template
=
self
.
chat_template
,
...
...
@@ -595,7 +595,7 @@ class OpenAIServingResponses(OpenAIServing):
tool_dicts
=
tool_dicts
,
tool_parser
=
self
.
parser
.
tool_parser_cls
if
self
.
parser
else
None
,
)
return
messages
,
engine_
promp
ts
return
messages
,
engine_
inpu
ts
async
def
_render_next_turn
(
self
,
...
...
@@ -610,7 +610,7 @@ class OpenAIServingResponses(OpenAIServing):
request_input
=
messages
,
)
_
,
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
_
,
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
request
,
new_messages
,
default_template
=
chat_template
,
...
...
@@ -619,12 +619,12 @@ class OpenAIServingResponses(OpenAIServing):
tool_dicts
=
tool_dicts
,
tool_parser
=
tool_parser
,
)
return
engine_
promp
ts
return
engine_
inpu
ts
async
def
_generate_with_builtin_tools
(
self
,
request_id
:
str
,
engine_
prompt
:
Processor
Input
s
,
engine_
input
:
Engine
Input
,
sampling_params
:
SamplingParams
,
context
:
ConversationContext
,
lora_request
:
LoRARequest
|
None
=
None
,
...
...
@@ -641,13 +641,13 @@ class OpenAIServingResponses(OpenAIServing):
self
.
_log_inputs
(
sub_request_id
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
generator
=
self
.
engine_client
.
generate
(
engine_
promp
t
,
engine_
inpu
t
,
sampling_params
,
sub_request_id
,
lora_request
=
lora_request
,
...
...
@@ -675,11 +675,11 @@ class OpenAIServingResponses(OpenAIServing):
# Render the next prompt token ids and update sampling_params.
if
isinstance
(
context
,
(
HarmonyContext
,
StreamingHarmonyContext
)):
token_ids
=
context
.
render_for_completion
()
engine_
promp
t
=
token_input
s
(
token_ids
)
engine_
inpu
t
=
token
s
_input
(
token_ids
)
sampling_params
.
max_tokens
=
max_model_len
-
len
(
token_ids
)
elif
isinstance
(
context
,
ParsableContext
):
(
engine_
promp
t
,)
=
await
self
.
_render_next_turn
(
(
engine_
inpu
t
,)
=
await
self
.
_render_next_turn
(
context
.
request
,
context
.
parser
.
response_messages
,
context
.
tool_dicts
,
...
...
@@ -691,7 +691,7 @@ class OpenAIServingResponses(OpenAIServing):
sampling_params
.
max_tokens
=
get_max_tokens
(
max_model_len
,
context
.
request
.
max_output_tokens
,
self
.
_extract_prompt_len
(
engine_
promp
t
),
self
.
_extract_prompt_len
(
engine_
inpu
t
),
self
.
default_sampling_params
,
# type: ignore
self
.
override_max_tokens
,
# type: ignore
)
...
...
@@ -713,14 +713,10 @@ class OpenAIServingResponses(OpenAIServing):
arrival_time
=
time
.
time
()
messages
=
self
.
_construct_input_messages_with_harmony
(
request
,
prev_response
)
prompt_token_ids
=
render_for_completion
(
messages
)
engine_
promp
t
=
token_input
s
(
prompt_token_ids
)
engine_
promp
t
[
"arrival_time"
]
=
arrival_time
engine_
inpu
t
=
token
s
_input
(
prompt_token_ids
,
cache_salt
=
request
.
cache_salt
)
engine_
inpu
t
[
"arrival_time"
]
=
arrival_time
# Add cache_salt if provided in the request
if
request
.
cache_salt
is
not
None
:
engine_prompt
[
"cache_salt"
]
=
request
.
cache_salt
return
messages
,
[
engine_prompt
]
return
messages
,
[
engine_input
]
async
def
_initialize_tool_sessions
(
self
,
...
...
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
View file @
ba2f0acc
...
...
@@ -38,7 +38,7 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
)
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs
import
EncoderDecoderInput
s
,
Processor
Input
s
from
vllm.inputs
import
EncoderDecoderInput
,
Engine
Input
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
FlatLogprobs
,
Logprob
from
vllm.model_executor.models
import
SupportsTranscription
...
...
@@ -171,7 +171,7 @@ class OpenAISpeechToText(OpenAIServing):
request
:
SpeechToTextRequest
,
audio_data
:
bytes
,
request_id
:
str
,
)
->
tuple
[
list
[
Processor
Input
s
],
float
]:
)
->
tuple
[
list
[
Engine
Input
],
float
]:
# Validate request
language
=
self
.
model_cls
.
validate_language
(
request
.
language
)
# Skip to_language validation to avoid extra logging for Whisper.
...
...
@@ -250,9 +250,9 @@ class OpenAISpeechToText(OpenAIServing):
parsed_prompts
.
append
(
parsed_prompt
)
engine_
promp
ts
=
await
self
.
renderer
.
render_cmpl_async
(
parsed_prompts
)
engine_
inpu
ts
=
await
self
.
renderer
.
render_cmpl_async
(
parsed_prompts
)
return
engine_
promp
ts
,
duration
return
engine_
inpu
ts
,
duration
def
_preprocess_verbose_prompt
(
self
,
prompt
:
EncoderDecoderDictPrompt
):
dec_prompt
=
prompt
[
"decoder_prompt"
]
...
...
@@ -271,7 +271,7 @@ class OpenAISpeechToText(OpenAIServing):
return
prompt
@
staticmethod
def
_get_decoder_prompt_len
(
engine_
promp
ts
:
list
[
Processor
Input
s
])
->
int
:
def
_get_decoder_prompt_len
(
engine_
inpu
ts
:
list
[
Engine
Input
])
->
int
:
"""Get the length of the decoder prompt. Currently we need to offset
by the decoder prompt length when running beam search because the mm
encoder is not currently cached and runs on decode calls; because of
...
...
@@ -282,12 +282,13 @@ class OpenAISpeechToText(OpenAIServing):
encoder/decoder caching is implemented.
"""
input_len
=
0
assert
len
(
engine_prompts
)
>
0
first_eng_prompt
=
engine_prompts
[
0
]
assert
len
(
engine_inputs
)
>
0
first_input
=
engine_inputs
[
0
]
if
first_input
.
get
(
"type"
)
==
"enc_dec"
:
first_input
=
cast
(
EncoderDecoderInput
,
first_input
)
input_len
=
len
(
first_input
[
"decoder_prompt"
][
"prompt_token_ids"
])
if
first_eng_prompt
.
get
(
"type"
)
==
"enc_dec"
:
first_eng_prompt
=
cast
(
EncoderDecoderInputs
,
first_eng_prompt
)
input_len
=
len
(
first_eng_prompt
[
"decoder_prompt"
][
"prompt_token_ids"
])
return
input_len
def
_get_verbose_segments
(
...
...
@@ -409,7 +410,7 @@ class OpenAISpeechToText(OpenAIServing):
lora_request
=
self
.
_maybe_get_adapters
(
request
)
engine_
promp
ts
,
duration_s
=
await
self
.
_preprocess_speech_to_text
(
engine_
inpu
ts
,
duration_s
=
await
self
.
_preprocess_speech_to_text
(
request
=
request
,
audio_data
=
audio_data
,
request_id
=
request_id
,
...
...
@@ -420,7 +421,7 @@ class OpenAISpeechToText(OpenAIServing):
list_result_generator
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
|
None
=
None
input_len
=
(
OpenAISpeechToText
.
_get_decoder_prompt_len
(
engine_
promp
ts
)
OpenAISpeechToText
.
_get_decoder_prompt_len
(
engine_
inpu
ts
)
if
request
.
use_beam_search
else
0
)
...
...
@@ -450,12 +451,12 @@ class OpenAISpeechToText(OpenAIServing):
sampling_params
.
logprobs
=
1
list_result_generator
=
[]
for
i
,
engine_
promp
t
in
enumerate
(
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
engine_
inpu
ts
):
request_id_item
=
f
"
{
request_id
}
_
{
i
}
"
self
.
_log_inputs
(
request_id_item
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
...
...
@@ -468,7 +469,7 @@ class OpenAISpeechToText(OpenAIServing):
if
isinstance
(
sampling_params
,
BeamSearchParams
):
generator
=
self
.
beam_search
(
prompt
=
engine_
promp
t
,
prompt
=
engine_
inpu
t
,
params
=
sampling_params
,
request_id
=
request_id_item
,
lora_request
=
lora_request
,
...
...
@@ -476,7 +477,7 @@ class OpenAISpeechToText(OpenAIServing):
)
else
:
generator
=
self
.
engine_client
.
generate
(
engine_
promp
t
,
engine_
inpu
t
,
sampling_params
,
request_id_item
,
lora_request
=
lora_request
,
...
...
vllm/entrypoints/pooling/base/io_processor.py
View file @
ba2f0acc
...
...
@@ -18,7 +18,7 @@ from vllm.entrypoints.pooling.typing import (
PoolingCompletionLikeRequest
,
PoolingServeContext
,
)
from
vllm.inputs
.data
import
Processor
Input
s
,
SingletonPrompt
from
vllm.inputs
import
Engine
Input
,
SingletonPrompt
from
vllm.renderers
import
BaseRenderer
,
merge_kwargs
from
vllm.renderers.inputs.preprocess
import
parse_model_prompt
,
prompt_to_seq
from
vllm.tool_parsers
import
ToolParser
...
...
@@ -60,7 +60,7 @@ class PoolingIOProcessor:
chat_template_kwargs
=
request
.
chat_template_kwargs
,
trust_request_chat_template
=
self
.
trust_request_chat_template
,
)
_
,
engine_
promp
ts
=
self
.
_preprocess_chat_online
(
_
,
engine_
inpu
ts
=
self
.
_preprocess_chat_online
(
request
,
request
.
messages
,
default_template
=
self
.
chat_template
,
...
...
@@ -68,7 +68,7 @@ class PoolingIOProcessor:
default_template_kwargs
=
None
,
)
elif
isinstance
(
request
,
PoolingCompletionLikeRequest
):
engine_
promp
ts
=
self
.
_preprocess_completion_online
(
engine_
inpu
ts
=
self
.
_preprocess_completion_online
(
request
,
prompt_input
=
request
.
input
,
prompt_embeds
=
None
,
...
...
@@ -76,7 +76,7 @@ class PoolingIOProcessor:
else
:
raise
ValueError
(
f
"Invalid
{
self
.
name
}
request type"
)
ctx
.
engine_
promp
ts
=
engine_
promp
ts
ctx
.
engine_
inpu
ts
=
engine_
inpu
ts
async
def
pre_process_online_async
(
self
,
ctx
:
PoolingServeContext
):
self
.
pre_process_online
(
ctx
)
...
...
@@ -100,7 +100,7 @@ class PoolingIOProcessor:
self
,
prompts
:
PromptType
|
Sequence
[
PromptType
],
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Sequence
[
Processor
Input
s
]:
)
->
Sequence
[
Engine
Input
]:
return
self
.
_preprocess_completion_offline
(
prompts
=
prompts
,
tokenization_kwargs
=
tokenization_kwargs
)
...
...
@@ -128,7 +128,7 @@ class PoolingIOProcessor:
request
:
RendererRequest
,
prompt_input
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]]
|
None
,
prompt_embeds
:
bytes
|
list
[
bytes
]
|
None
,
)
->
list
[
Processor
Input
s
]:
)
->
list
[
Engine
Input
]:
renderer
=
self
.
renderer
model_config
=
self
.
model_config
...
...
@@ -167,7 +167,7 @@ class PoolingIOProcessor:
default_template_kwargs
:
dict
[
str
,
Any
]
|
None
,
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tool_parser
:
type
[
ToolParser
]
|
None
=
None
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Processor
Input
s
]]:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Engine
Input
]]:
renderer
=
self
.
renderer
default_template_kwargs
=
merge_kwargs
(
...
...
@@ -188,7 +188,7 @@ class PoolingIOProcessor:
default_media_io_kwargs
=
(
mm_config
.
media_io_kwargs
if
mm_config
else
None
),
)
(
conversation
,),
(
engine_
promp
t
,)
=
renderer
.
render_chat
(
(
conversation
,),
(
engine_
inpu
t
,)
=
renderer
.
render_chat
(
[
messages
],
chat_params
,
tok_params
,
...
...
@@ -199,13 +199,13 @@ class PoolingIOProcessor:
},
)
return
conversation
,
[
engine_
promp
t
]
return
conversation
,
[
engine_
inpu
t
]
def
_preprocess_completion_offline
(
self
,
prompts
:
PromptType
|
Sequence
[
PromptType
],
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Sequence
[
Processor
Input
s
]:
)
->
Sequence
[
Engine
Input
]:
renderer
=
self
.
renderer
model_config
=
self
.
model_config
...
...
vllm/entrypoints/pooling/base/serving.py
View file @
ba2f0acc
...
...
@@ -20,7 +20,7 @@ from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.pooling.typing
import
AnyPoolingRequest
,
PoolingServeContext
from
vllm.exceptions
import
VLLMNotFoundError
from
vllm.inputs
.data
import
Processor
Input
s
from
vllm.inputs
import
Engine
Input
from
vllm.lora.request
import
LoRARequest
from
vllm.renderers.base
import
BaseRenderer
from
vllm.renderers.inputs.preprocess
import
extract_prompt_components
...
...
@@ -106,7 +106,7 @@ class PoolingServing:
self
,
ctx
:
PoolingServeContext
,
):
if
ctx
.
engine_
promp
ts
is
None
:
if
ctx
.
engine_
inpu
ts
is
None
:
raise
ValueError
(
"Engine prompts not available"
)
generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
...
...
@@ -120,7 +120,7 @@ class PoolingServing:
pooling_params
=
self
.
io_processor
.
create_pooling_params
(
ctx
.
request
)
pooling_params
.
verify
(
self
.
model_config
)
for
i
,
engine_
promp
t
in
enumerate
(
ctx
.
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
ctx
.
engine_
inpu
ts
):
prompt_request_id
=
(
f
"
{
ctx
.
request_id
}
-
{
i
}
"
if
ctx
.
prompt_request_ids
is
None
...
...
@@ -129,13 +129,13 @@ class PoolingServing:
self
.
_log_inputs
(
prompt_request_id
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
pooling_params
,
lora_request
=
ctx
.
lora_request
,
)
generator
=
self
.
engine_client
.
encode
(
engine_
promp
t
,
engine_
inpu
t
,
pooling_params
,
prompt_request_id
,
lora_request
=
ctx
.
lora_request
,
...
...
@@ -151,13 +151,13 @@ class PoolingServing:
self
,
ctx
:
PoolingServeContext
,
):
if
ctx
.
engine_
promp
ts
is
None
:
if
ctx
.
engine_
inpu
ts
is
None
:
raise
ValueError
(
"Engine prompts not available"
)
if
ctx
.
result_generator
is
None
:
raise
ValueError
(
"Result generator not available"
)
num_inputs
=
len
(
ctx
.
engine_
promp
ts
)
num_inputs
=
len
(
ctx
.
engine_
inpu
ts
)
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
final_res_batch
=
[
None
]
*
num_inputs
...
...
@@ -317,7 +317,7 @@ class PoolingServing:
def
_log_inputs
(
self
,
request_id
:
str
,
inputs
:
Processor
Input
s
,
inputs
:
Engine
Input
,
params
:
PoolingParams
,
lora_request
:
LoRARequest
|
None
,
)
->
None
:
...
...
vllm/entrypoints/pooling/embed/io_processor.py
View file @
ba2f0acc
...
...
@@ -24,7 +24,7 @@ from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingCompletionRequest
,
)
from
vllm.entrypoints.pooling.typing
import
PoolingServeContext
from
vllm.inputs
.data
import
Processor
Input
s
,
token_input
s
from
vllm.inputs
import
Engine
Input
,
token
s
_input
from
vllm.logger
import
init_logger
from
vllm.outputs
import
PoolingOutput
,
PoolingRequestOutput
from
vllm.renderers
import
merge_kwargs
...
...
@@ -83,20 +83,20 @@ class EmbedIOProcessor(PoolingIOProcessor):
#################################################################
def
_pre_process_chunked
(
self
,
ctx
:
PoolingServeContext
)
->
None
:
if
ctx
.
engine_
promp
ts
is
None
:
if
ctx
.
engine_
inpu
ts
is
None
:
raise
ValueError
(
"Engine prompts not available"
)
ctx
.
intermediates
=
ctx
.
engine_
promp
ts
ctx
.
intermediates
=
ctx
.
engine_
inpu
ts
request_id
=
ctx
.
request_id
max_model_len
=
self
.
model_config
.
max_model_len
chunked_engine_
promp
ts
:
list
[
Processor
Input
s
]
=
[]
chunked_engine_
inpu
ts
:
list
[
Engine
Input
]
=
[]
prompt_request_ids
:
list
[
str
]
=
[]
for
prompt_idx
,
engine_
promp
t
in
enumerate
(
ctx
.
engine_
promp
ts
):
token_ids
=
engine_
promp
t
.
get
(
"prompt_token_ids"
,
None
)
for
prompt_idx
,
engine_
inpu
t
in
enumerate
(
ctx
.
engine_
inpu
ts
):
token_ids
=
engine_
inpu
t
.
get
(
"prompt_token_ids"
,
None
)
if
token_ids
is
None
:
raise
NotImplementedError
(
"Long Text Embedding with Chunked Processing does "
"not support EmbedsPrompt and EncoderDecoderInput
s
."
"not support EmbedsPrompt and EncoderDecoderInput."
)
prompt_token_ids
=
cast
(
list
[
int
],
token_ids
)
...
...
@@ -104,14 +104,14 @@ class EmbedIOProcessor(PoolingIOProcessor):
for
chunk_idx
,
chunk_tokens
in
enumerate
(
chunk_list
(
prompt_token_ids
,
max_model_len
)
):
chunked_engine_
promp
ts
.
append
(
token_input
s
(
prompt_token_ids
=
chunk_tokens
)
chunked_engine_
inpu
ts
.
append
(
token
s
_input
(
prompt_token_ids
=
chunk_tokens
)
)
prompt_request_ids
.
append
(
f
"
{
request_id
}
-prompt-
{
prompt_idx
}
-chunk-
{
chunk_idx
}
"
)
ctx
.
engine_
promp
ts
=
chunked_engine_
promp
ts
ctx
.
engine_
inpu
ts
=
chunked_engine_
inpu
ts
ctx
.
prompt_request_ids
=
prompt_request_ids
return
None
...
...
@@ -184,8 +184,8 @@ class EmbedIOProcessor(PoolingIOProcessor):
if
ctx
.
intermediates
is
None
:
raise
ValueError
(
"Original prompts inputs not available"
)
original_engine_
promp
ts
=
cast
(
list
[
Processor
Input
s
],
ctx
.
intermediates
)
num_prompts
=
len
(
original_engine_
promp
ts
)
original_engine_
inpu
ts
=
cast
(
list
[
Engine
Input
],
ctx
.
intermediates
)
num_prompts
=
len
(
original_engine_
inpu
ts
)
# Finalize aggregated results
final_res_batch
:
list
[
PoolingRequestOutput
]
=
[]
...
...
@@ -211,12 +211,12 @@ class EmbedIOProcessor(PoolingIOProcessor):
pooling_output_data
=
PoolingOutput
(
data
=
final_embedding
)
# Get original prompt token IDs for this prompt
original_prompt
=
original_engine_
promp
ts
[
prompt_idx
]
original_prompt
=
original_engine_
inpu
ts
[
prompt_idx
]
token_ids
=
original_prompt
.
get
(
"prompt_token_ids"
,
None
)
if
token_ids
is
None
:
raise
NotImplementedError
(
"Long Text Embedding with Chunked Processing does "
"not support EmbedsPrompt and EncoderDecoderInput
s
."
"not support EmbedsPrompt and EncoderDecoderInput."
)
original_token_ids
=
cast
(
list
[
int
],
token_ids
)
...
...
@@ -372,7 +372,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
]
for
uri
in
request
.
images
]
ctx
.
engine_
promp
ts
=
self
.
_batch_render_chat
(
ctx
.
engine_
inpu
ts
=
self
.
_batch_render_chat
(
request
,
all_messages
,
truncate_prompt_tokens
,
truncation_side
)
...
...
@@ -382,7 +382,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
self
.
_mixed_input_to_messages
(
inp
,
task_prefix
=
task_prefix
)
for
inp
in
request
.
inputs
]
ctx
.
engine_
promp
ts
=
self
.
_batch_render_chat
(
ctx
.
engine_
inpu
ts
=
self
.
_batch_render_chat
(
request
,
all_messages
,
truncate_prompt_tokens
,
truncation_side
)
...
...
@@ -396,7 +396,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
truncate_prompt_tokens
=
truncate_prompt_tokens
,
truncation_side
=
truncation_side
,
)
ctx
.
engine_
promp
ts
=
self
.
_preprocess_completion_online
(
ctx
.
engine_
inpu
ts
=
self
.
_preprocess_completion_online
(
proxy
,
prompt_input
=
proxy
.
input
,
prompt_embeds
=
None
)
...
...
@@ -406,7 +406,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
all_messages
:
Sequence
[
list
[
ChatCompletionMessageParam
]],
truncate_prompt_tokens
:
int
|
None
,
truncation_side
:
Literal
[
"left"
,
"right"
]
|
None
,
)
->
list
[
Processor
Input
s
]:
)
->
list
[
Engine
Input
]:
"""Batch-render multiple conversations through the chat template."""
if
not
all_messages
:
return
[]
...
...
@@ -438,8 +438,8 @@ class EmbedIOProcessor(PoolingIOProcessor):
default_media_io_kwargs
=
(
mm_config
.
media_io_kwargs
if
mm_config
else
None
),
)
_
,
engine_
promp
ts
=
renderer
.
render_chat
(
all_messages
,
chat_params
,
tok_params
)
return
engine_
promp
ts
_
,
engine_
inpu
ts
=
renderer
.
render_chat
(
all_messages
,
chat_params
,
tok_params
)
return
engine_
inpu
ts
def
_validate_input_type
(
self
,
input_type
:
str
|
None
)
->
None
:
"""Raise if *input_type* is not supported by this model."""
...
...
vllm/entrypoints/pooling/pooling/serving.py
View file @
ba2f0acc
...
...
@@ -33,7 +33,7 @@ from vllm.entrypoints.pooling.utils import (
encode_pooling_output_float
,
)
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.inputs
import
Processor
Input
s
from
vllm.inputs
import
Engine
Input
from
vllm.logger
import
init_logger
from
vllm.outputs
import
PoolingRequestOutput
from
vllm.renderers.inputs.preprocess
import
prompt_to_seq
...
...
@@ -110,7 +110,7 @@ class OpenAIServingPooling(OpenAIServing):
request
.
task
,
)
engine_
promp
ts
:
Sequence
[
Processor
Input
s
]
engine_
inpu
ts
:
Sequence
[
Engine
Input
]
if
use_io_processor
:
=
isinstance
(
request
,
IOProcessorRequest
):
if
self
.
io_processor
is
None
:
raise
ValueError
(
...
...
@@ -125,7 +125,7 @@ class OpenAIServingPooling(OpenAIServing):
raw_prompts
=
await
self
.
io_processor
.
pre_process_async
(
prompt
=
validated_prompt
,
request_id
=
request_id
)
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_cmpl
(
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_cmpl
(
request
,
prompt_to_seq
(
raw_prompts
),
)
...
...
@@ -138,7 +138,7 @@ class OpenAIServingPooling(OpenAIServing):
if
error_check_ret
is
not
None
:
return
error_check_ret
_
,
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
_
,
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
request
,
request
.
messages
,
default_template
=
self
.
chat_template
,
...
...
@@ -146,7 +146,7 @@ class OpenAIServingPooling(OpenAIServing):
default_template_kwargs
=
None
,
)
elif
isinstance
(
request
,
PoolingCompletionRequest
):
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
request
,
prompt_input
=
request
.
input
,
prompt_embeds
=
None
,
...
...
@@ -165,12 +165,12 @@ class OpenAIServingPooling(OpenAIServing):
else
:
pooling_params
=
request
.
to_pooling_params
()
# type: ignore
for
i
,
engine_
promp
t
in
enumerate
(
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
engine_
inpu
ts
):
request_id_item
=
f
"
{
request_id
}
-
{
i
}
"
self
.
_log_inputs
(
request_id_item
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
pooling_params
,
lora_request
=
lora_request
,
)
...
...
@@ -182,7 +182,7 @@ class OpenAIServingPooling(OpenAIServing):
)
generator
=
self
.
engine_client
.
encode
(
engine_
promp
t
,
engine_
inpu
t
,
pooling_params
,
request_id_item
,
lora_request
=
lora_request
,
...
...
@@ -221,7 +221,7 @@ class OpenAIServingPooling(OpenAIServing):
return
IOProcessorResponse
(
request_id
=
request_id
,
data
=
output
)
assert
isinstance
(
request
,
(
PoolingCompletionRequest
,
PoolingChatRequest
))
num_prompts
=
len
(
engine_
promp
ts
)
num_prompts
=
len
(
engine_
inpu
ts
)
# Non-streaming response
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
...
...
vllm/entrypoints/pooling/score/serving.py
View file @
ba2f0acc
...
...
@@ -35,7 +35,7 @@ from vllm.entrypoints.pooling.score.utils import (
parse_score_data_single
,
validate_score_input
,
)
from
vllm.inputs
.data
import
Processor
Input
s
,
TokensPrompt
,
token_input
s
from
vllm.inputs
import
Engine
Input
,
TokensPrompt
,
token
s
_input
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.outputs
import
PoolingRequestOutput
,
ScoringRequestOutput
...
...
@@ -110,12 +110,12 @@ class ServingScores(OpenAIServing):
*
(
encode_async
(
t
,
**
tokenization_kwargs
)
for
t
in
input_texts
)
)
engine_
promp
ts
:
list
[
Processor
Input
s
]
=
[]
engine_
inpu
ts
:
list
[
Engine
Input
]
=
[]
for
tok_result
,
input_text
in
zip
(
tokenized_prompts
,
input_texts
):
text_token_prompt
=
self
.
_validate_input
(
request
,
tok_result
,
input_text
)
engine_
promp
ts
.
append
(
token_input
s
(
engine_
inpu
ts
.
append
(
token
s
_input
(
text_token_prompt
[
"prompt_token_ids"
],
prompt
=
input_text
,
)
...
...
@@ -125,19 +125,19 @@ class ServingScores(OpenAIServing):
generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
pooling_params
=
request
.
to_pooling_params
(
"embed"
)
for
i
,
engine_
promp
t
in
enumerate
(
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
engine_
inpu
ts
):
request_id_item
=
f
"
{
request_id
}
-
{
i
}
"
self
.
_log_inputs
(
request_id_item
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
pooling_params
,
lora_request
=
lora_request
,
)
generators
.
append
(
self
.
engine_client
.
encode
(
engine_
promp
t
,
engine_
inpu
t
,
pooling_params
,
request_id_item
,
lora_request
=
lora_request
,
...
...
@@ -151,7 +151,7 @@ class ServingScores(OpenAIServing):
# Non-streaming response
final_res_batch
:
list
[
PoolingRequestOutput
]
=
[]
embeddings
:
list
[
PoolingRequestOutput
|
None
]
=
[
None
]
*
len
(
engine_
promp
ts
)
embeddings
:
list
[
PoolingRequestOutput
|
None
]
=
[
None
]
*
len
(
engine_
inpu
ts
)
async
for
i
,
res
in
result_generator
:
embeddings
[
i
]
=
res
...
...
@@ -183,7 +183,7 @@ class ServingScores(OpenAIServing):
request
:
RerankRequest
|
ScoreRequest
,
tokenizer
:
TokenizerLike
,
tokenization_kwargs
:
dict
[
str
,
Any
],
)
->
tuple
[
str
,
TokensPrompt
]
:
)
->
TokensPrompt
:
"""Parse a single ScoreData into a text + optional multimodal
TokensPrompt for late-interaction encoding.
...
...
@@ -197,21 +197,22 @@ class ServingScores(OpenAIServing):
else
:
text
,
mm_data
,
mm_uuids
=
parse_score_data_single
(
data
,
role
,
model_config
)
prompt_i
nput
s
=
tokenizer
(
text
,
**
tokenization_kwargs
)
self
.
_validate_input
(
request
,
prompt_i
nputs
[
"input_ids"
]
,
text
)
prompt_i
d
s
=
tokenizer
.
encode
(
text
,
**
tokenization_kwargs
)
self
.
_validate_input
(
request
,
prompt_i
ds
,
text
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
prompt_inputs
[
"input_ids"
],
tok_prompt
=
TokensPrompt
(
prompt_token_ids
=
prompt_ids
,
prompt
=
text
,
)
if
mm_data
is
not
None
:
engine
_prompt
[
"multi_modal_data"
]
=
mm_data
tok
_prompt
[
"multi_modal_data"
]
=
mm_data
if
mm_uuids
is
not
None
:
engine
_prompt
[
"multi_modal_uuids"
]
=
mm_uuids
tok
_prompt
[
"multi_modal_uuids"
]
=
mm_uuids
if
request
.
mm_processor_kwargs
is
not
None
:
engine
_prompt
[
"mm_processor_kwargs"
]
=
request
.
mm_processor_kwargs
tok
_prompt
[
"mm_processor_kwargs"
]
=
request
.
mm_processor_kwargs
return
t
ext
,
engine
_prompt
return
t
ok
_prompt
async
def
_late_interaction_score
(
self
,
...
...
@@ -240,7 +241,7 @@ class ServingScores(OpenAIServing):
executor
=
self
.
_tokenizer_executor
,
)
preprocessed
=
await
asyncio
.
gather
(
tok_prompts
=
await
asyncio
.
gather
(
*
(
preprocess_async
(
data
=
d
,
...
...
@@ -253,12 +254,8 @@ class ServingScores(OpenAIServing):
)
)
query_prompts
:
list
[
TokensPrompt
]
=
[
prompt
for
_
,
prompt
in
preprocessed
[:
len
(
data_1
)]
]
doc_prompts
:
list
[
TokensPrompt
]
=
[
prompt
for
_
,
prompt
in
preprocessed
[
len
(
data_1
)
:]
]
query_prompts
=
tok_prompts
[:
len
(
data_1
)]
doc_prompts
=
tok_prompts
[
len
(
data_1
)
:]
default_pooling_params
=
request
.
to_pooling_params
(
"token_embed"
)
...
...
@@ -268,7 +265,7 @@ class ServingScores(OpenAIServing):
query_prompts
)
query_generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
for
i
,
engine
_prompt
in
enumerate
(
query_prompts
):
for
i
,
tok
_prompt
in
enumerate
(
query_prompts
):
request_id_item
=
f
"
{
request_id
}
-query-
{
i
}
"
pooling_params
=
default_pooling_params
.
clone
()
pooling_params
.
late_interaction_params
=
(
...
...
@@ -280,14 +277,14 @@ class ServingScores(OpenAIServing):
self
.
_log_inputs
(
request_id_item
,
engine
_prompt
,
tok
_prompt
,
params
=
pooling_params
,
lora_request
=
lora_request
,
)
query_generators
.
append
(
self
.
engine_client
.
encode
(
engine
_prompt
,
tok
_prompt
,
pooling_params
,
request_id_item
,
lora_request
=
lora_request
,
...
...
@@ -306,7 +303,7 @@ class ServingScores(OpenAIServing):
# stage 2: encode docs and return scalar scores from workers.
doc_generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
for
i
,
engine
_prompt
in
enumerate
(
doc_prompts
):
for
i
,
tok
_prompt
in
enumerate
(
doc_prompts
):
request_id_item
=
f
"
{
request_id
}
-doc-
{
i
}
"
query_idx
=
0
if
len
(
query_prompts
)
==
1
else
i
pooling_params
=
default_pooling_params
.
clone
()
...
...
@@ -316,14 +313,14 @@ class ServingScores(OpenAIServing):
self
.
_log_inputs
(
request_id_item
,
engine
_prompt
,
tok
_prompt
,
params
=
pooling_params
,
lora_request
=
lora_request
,
)
doc_generators
.
append
(
self
.
engine_client
.
encode
(
engine
_prompt
,
tok
_prompt
,
pooling_params
,
request_id_item
,
lora_request
=
lora_request
,
...
...
@@ -404,28 +401,22 @@ class ServingScores(OpenAIServing):
)
)
request_prompts
:
list
[
str
]
=
[]
engine_prompts
:
list
[
TokensPrompt
]
=
[]
for
full_prompt
,
engine_prompt
in
preprocessed_prompts
:
request_prompts
.
append
(
full_prompt
)
engine_prompts
.
append
(
engine_prompt
)
# Schedule the request and get the result generator.
generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
default_pooling_params
=
request
.
to_pooling_params
(
"classify"
)
for
i
,
engine
_prompt
in
enumerate
(
engine
_prompts
):
for
i
,
(
full
_prompt
,
tok_prompt
)
in
enumerate
(
preprocessed
_prompts
):
request_id_item
=
f
"
{
request_id
}
-
{
i
}
"
self
.
_log_inputs
(
request_id_item
,
request
_prompt
s
[
i
]
,
full
_prompt
,
params
=
default_pooling_params
,
lora_request
=
lora_request
,
)
if
token_type_ids
:
=
engine
_prompt
.
pop
(
"token_type_ids"
,
None
):
if
token_type_ids
:
=
tok
_prompt
.
pop
(
"token_type_ids"
,
None
):
pooling_params
=
default_pooling_params
.
clone
()
compressed
=
compress_token_type_ids
(
token_type_ids
)
pooling_params
.
extra_kwargs
=
{
"compressed_token_type_ids"
:
compressed
}
...
...
@@ -433,7 +424,7 @@ class ServingScores(OpenAIServing):
pooling_params
=
default_pooling_params
generator
=
self
.
engine_client
.
encode
(
engine
_prompt
,
tok
_prompt
,
pooling_params
,
request_id_item
,
lora_request
=
lora_request
,
...
...
@@ -447,7 +438,7 @@ class ServingScores(OpenAIServing):
# Non-streaming response
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
=
[
None
]
*
len
(
engine
_prompts
preprocessed
_prompts
)
async
for
i
,
res
in
result_generator
:
...
...
@@ -464,7 +455,7 @@ class ServingScores(OpenAIServing):
data_2
:
ScoreData
,
)
->
tuple
[
str
,
TokensPrompt
]:
model_config
=
self
.
model_config
full_prompt
,
engine_
promp
t
=
get_score_prompt
(
full_prompt
,
engine_
inpu
t
=
get_score_prompt
(
model_config
=
model_config
,
data_1
=
data_1
,
data_2
=
data_2
,
...
...
@@ -472,11 +463,11 @@ class ServingScores(OpenAIServing):
tokenization_kwargs
=
tokenization_kwargs
,
score_template
=
self
.
score_template
,
)
self
.
_validate_input
(
request
,
engine_
promp
t
[
"prompt_token_ids"
],
full_prompt
)
self
.
_validate_input
(
request
,
engine_
inpu
t
[
"prompt_token_ids"
],
full_prompt
)
if
request
.
mm_processor_kwargs
is
not
None
:
engine_
promp
t
[
"mm_processor_kwargs"
]
=
request
.
mm_processor_kwargs
engine_
inpu
t
[
"mm_processor_kwargs"
]
=
request
.
mm_processor_kwargs
return
full_prompt
,
engine_
promp
t
return
full_prompt
,
engine_
inpu
t
async
def
_run_scoring
(
self
,
...
...
vllm/entrypoints/pooling/score/utils.py
View file @
ba2f0acc
...
...
@@ -20,10 +20,14 @@ from vllm.entrypoints.chat_utils import (
MultiModalItemTracker
,
_parse_chat_message_content_parts
,
)
from
vllm.inputs
import
TokensPrompt
from
vllm.inputs.data
import
PromptType
,
TextPrompt
from
vllm.inputs
import
(
MultiModalDataDict
,
MultiModalUUIDDict
,
PromptType
,
TextPrompt
,
TokensPrompt
,
)
from
vllm.model_executor.models.interfaces
import
supports_score_template
from
vllm.multimodal.inputs
import
MultiModalDataDict
,
MultiModalUUIDDict
from
vllm.outputs
import
PoolingRequestOutput
from
vllm.renderers.hf
import
safe_apply_chat_template
from
vllm.tokenizers
import
TokenizerLike
...
...
vllm/entrypoints/pooling/typing.py
View file @
ba2f0acc
...
...
@@ -32,7 +32,7 @@ from vllm.entrypoints.pooling.score.protocol import (
ScoreRequest
,
ScoreResponse
,
)
from
vllm.inputs
import
Processor
Input
s
from
vllm.inputs
import
Engine
Input
from
vllm.lora.request
import
LoRARequest
PoolingCompletionLikeRequest
:
TypeAlias
=
(
...
...
@@ -74,7 +74,7 @@ class PoolingServeContext(Generic[PoolingRequestT]):
created_time
:
int
=
field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
lora_request
:
LoRARequest
|
None
=
None
engine_
promp
ts
:
list
[
Processor
Input
s
]
|
None
=
None
engine_
inpu
ts
:
list
[
Engine
Input
]
|
None
=
None
prompt_request_ids
:
list
[
str
]
|
None
=
None
intermediates
:
Any
|
None
=
None
...
...
vllm/entrypoints/serve/disagg/protocol.py
View file @
ba2f0acc
...
...
@@ -33,19 +33,20 @@ class MultiModalFeatures(BaseModel):
"""Lightweight multimodal metadata produced by the render step.
Carries hashes (for cache lookup / identification) and placeholder
positions so the downstream
`
`/generate`
`
service knows *where* in
positions so the downstream `/generate` service knows *where* in
the token sequence each multimodal item lives.
.. note:: Phase 1 — metadata only.
Phase 2 should add ``mm_kwargs`` (processed tensor data) using a
binary transport so the ``/generate`` side can skip re-processing.
The ``/generate`` endpoint must also be updated to inject these
features into ``ProcessorInputs`` before passing to
``InputProcessor.process_inputs``.
Note:
Phase 1 — metadata only.
Phase 2 should add `mm_kwargs` (processed tensor data) using a
binary transport so the ``/generate` side can skip re-processing.
The `/generate` endpoint must also be updated to inject these
features into `EngineInput` before passing to
`InputProcessor.process_inputs`.
"""
mm_hashes
:
dict
[
str
,
list
[
str
]]
"""Per-modality item hashes, e.g.
`
`{"image": ["abc", "def"]}`
`
."""
"""Per-modality item hashes, e.g. `{"image": ["abc", "def"]}`."""
mm_placeholders
:
dict
[
str
,
list
[
PlaceholderRangeInfo
]]
"""Per-modality placeholder ranges in the token sequence."""
...
...
vllm/entrypoints/serve/disagg/serving.py
View file @
ba2f0acc
...
...
@@ -99,13 +99,11 @@ class ServingTokens(OpenAIServing):
if
raw_request
:
raw_request
.
state
.
request_metadata
=
request_metadata
engine_
prompts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
(
engine_
input
,)
=
await
self
.
openai_serving_render
.
preprocess_completion
(
request
,
prompt_input
=
request
.
token_ids
,
prompt_embeds
=
None
,
)
assert
len
(
engine_prompts
)
==
1
engine_prompt
=
engine_prompts
[
0
]
# Schedule the request and get the result generator.
result_generator
:
AsyncGenerator
[
RequestOutput
,
None
]
|
None
=
None
...
...
@@ -115,7 +113,7 @@ class ServingTokens(OpenAIServing):
self
.
_log_inputs
(
request_id
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
...
...
@@ -127,7 +125,7 @@ class ServingTokens(OpenAIServing):
)
result_generator
=
self
.
engine_client
.
generate
(
engine_
promp
t
,
engine_
inpu
t
,
sampling_params
,
request_id
,
lora_request
=
lora_request
,
...
...
vllm/entrypoints/serve/render/serving.py
View file @
ba2f0acc
...
...
@@ -34,9 +34,15 @@ from vllm.entrypoints.utils import (
create_error_response
,
get_max_tokens
,
)
from
vllm.inputs.data
import
ProcessorInputs
,
PromptType
,
SingletonPrompt
,
TokensPrompt
from
vllm.inputs
import
(
EngineInput
,
MultiModalHashes
,
MultiModalPlaceholders
,
PromptType
,
SingletonPrompt
,
tokens_input
,
)
from
vllm.logger
import
init_logger
from
vllm.multimodal.inputs
import
MultiModalHashes
,
MultiModalPlaceholderDict
from
vllm.parser
import
ParserManager
from
vllm.renderers
import
BaseRenderer
,
merge_kwargs
from
vllm.renderers.inputs.preprocess
import
(
...
...
@@ -127,22 +133,22 @@ class OpenAIServingRender:
if
isinstance
(
result
,
ErrorResponse
):
return
result
_
,
engine_
promp
ts
=
result
_
,
engine_
inpu
ts
=
result
if
len
(
engine_
promp
ts
)
!=
1
:
if
len
(
engine_
inpu
ts
)
!=
1
:
return
self
.
create_error_response
(
f
"Expected exactly 1 engine prompt, got
{
len
(
engine_
promp
ts
)
}
"
f
"Expected exactly 1 engine prompt, got
{
len
(
engine_
inpu
ts
)
}
"
)
engine_
promp
t
=
engine_
promp
ts
[
0
]
engine_
inpu
t
=
engine_
inpu
ts
[
0
]
prompt_components
=
extract_prompt_components
(
self
.
model_config
,
engine_
promp
t
)
prompt_components
=
extract_prompt_components
(
self
.
model_config
,
engine_
inpu
t
)
token_ids
=
prompt_components
.
token_ids
if
not
token_ids
:
return
self
.
create_error_response
(
"No token_ids rendered"
)
token_ids
=
list
(
token_ids
)
input_length
=
extract_prompt_len
(
self
.
model_config
,
engine_
promp
t
)
input_length
=
extract_prompt_len
(
self
.
model_config
,
engine_
inpu
t
)
max_tokens
=
get_max_tokens
(
self
.
model_config
.
max_model_len
,
request
.
max_completion_tokens
...
...
@@ -159,7 +165,7 @@ class OpenAIServingRender:
return
GenerateRequest
(
request_id
=
request_id
,
token_ids
=
token_ids
,
features
=
self
.
_extract_mm_features
(
engine_
promp
t
),
features
=
self
.
_extract_mm_features
(
engine_
inpu
t
),
sampling_params
=
params
,
model
=
request
.
model
,
stream
=
bool
(
request
.
stream
),
...
...
@@ -171,7 +177,7 @@ class OpenAIServingRender:
async
def
render_chat
(
self
,
request
:
ChatCompletionRequest
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Processor
Input
s
]]
|
ErrorResponse
:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Engine
Input
]]
|
ErrorResponse
:
"""Core preprocessing logic for chat requests (no model/engine check).
Called directly by render_chat_request and delegated to by
...
...
@@ -184,7 +190,6 @@ class OpenAIServingRender:
if
is_mistral_tokenizer
(
tokenizer
):
# because of issues with pydantic we need to potentially
# re-serialize the tool_calls field of the request
# for more info: see comment in `maybe_serialize_tool_calls`
_mt
.
maybe_serialize_tool_calls
(
request
)
# type: ignore[arg-type]
_mt
.
truncate_tool_call_ids
(
request
)
# type: ignore[arg-type]
_mt
.
validate_request_params
(
request
)
...
...
@@ -232,7 +237,7 @@ class OpenAIServingRender:
if
error_check_ret
is
not
None
:
return
error_check_ret
conversation
,
engine_
promp
ts
=
await
self
.
preprocess_chat
(
conversation
,
engine_
inpu
ts
=
await
self
.
preprocess_chat
(
request
,
request
.
messages
,
default_template
=
self
.
chat_template
,
...
...
@@ -244,11 +249,11 @@ class OpenAIServingRender:
else
:
# For GPT-OSS.
should_include_tools
=
tool_dicts
is
not
None
conversation
,
engine_
promp
ts
=
self
.
_make_request_with_harmony
(
conversation
,
engine_
inpu
ts
=
self
.
_make_request_with_harmony
(
request
,
should_include_tools
)
return
conversation
,
engine_
promp
ts
return
conversation
,
engine_
inpu
ts
async
def
render_completion_request
(
self
,
...
...
@@ -266,16 +271,16 @@ class OpenAIServingRender:
if
isinstance
(
result
,
ErrorResponse
):
return
result
generate_requests
:
list
[
GenerateRequest
]
=
[]
for
engine_
promp
t
in
result
:
for
engine_
inpu
t
in
result
:
prompt_components
=
extract_prompt_components
(
self
.
model_config
,
engine_
promp
t
self
.
model_config
,
engine_
inpu
t
)
token_ids
=
prompt_components
.
token_ids
if
not
token_ids
:
return
self
.
create_error_response
(
"No token_ids rendered"
)
token_ids
=
list
(
token_ids
)
input_length
=
extract_prompt_len
(
self
.
model_config
,
engine_
promp
t
)
input_length
=
extract_prompt_len
(
self
.
model_config
,
engine_
inpu
t
)
max_tokens
=
get_max_tokens
(
self
.
model_config
.
max_model_len
,
request
.
max_tokens
,
...
...
@@ -293,7 +298,7 @@ class OpenAIServingRender:
GenerateRequest
(
request_id
=
request_id
,
token_ids
=
token_ids
,
features
=
self
.
_extract_mm_features
(
engine_
promp
t
),
features
=
self
.
_extract_mm_features
(
engine_
inpu
t
),
sampling_params
=
params
,
model
=
request
.
model
,
stream
=
bool
(
request
.
stream
),
...
...
@@ -308,7 +313,7 @@ class OpenAIServingRender:
async
def
render_completion
(
self
,
request
:
CompletionRequest
,
)
->
list
[
Processor
Input
s
]
|
ErrorResponse
:
)
->
list
[
Engine
Input
]
|
ErrorResponse
:
"""Core preprocessing logic for completion requests (no model/engine check).
Called directly by render_completion_request and delegated to by
...
...
@@ -326,28 +331,28 @@ class OpenAIServingRender:
"prompt_logprobs is not compatible with prompt embeds."
)
engine_
promp
ts
=
await
self
.
preprocess_completion
(
engine_
inpu
ts
=
await
self
.
preprocess_completion
(
request
,
prompt_input
=
request
.
prompt
,
prompt_embeds
=
request
.
prompt_embeds
,
)
return
engine_
promp
ts
return
engine_
inpu
ts
@
staticmethod
def
_extract_mm_features
(
engine_
prompt
:
Processor
Input
s
,
engine_
input
:
Engine
Input
,
)
->
MultiModalFeatures
|
None
:
"""Extract multimodal metadata from a rendered engine prompt.
Returns ``None`` for text-only prompts.
"""
if
engine_
promp
t
.
get
(
"type"
)
!=
"multimodal"
:
if
engine_
inpu
t
.
get
(
"type"
)
!=
"multimodal"
:
return
None
# At this point engine_
promp
t is a MultiModalInputs TypedDict.
mm_hashes
:
MultiModalHashes
=
engine_
promp
t
[
"mm_hashes"
]
# type: ignore[typeddict-item]
raw_placeholders
:
MultiModalPlaceholder
Dict
=
engine_
promp
t
[
"mm_placeholders"
]
# type: ignore[typeddict-item]
# At this point engine_
inpu
t is a MultiModalInputs TypedDict.
mm_hashes
:
MultiModalHashes
=
engine_
inpu
t
[
"mm_hashes"
]
# type: ignore[typeddict-item]
raw_placeholders
:
MultiModalPlaceholder
s
=
engine_
inpu
t
[
"mm_placeholders"
]
# type: ignore[typeddict-item]
mm_placeholders
=
{
modality
:
[
...
...
@@ -401,13 +406,9 @@ class OpenAIServingRender:
# Render prompt token ids.
prompt_token_ids
=
render_for_completion
(
messages
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
prompt_token_ids
)
# Add cache_salt if provided in the request
if
request
.
cache_salt
is
not
None
:
engine_prompt
[
"cache_salt"
]
=
request
.
cache_salt
engine_input
=
tokens_input
(
prompt_token_ids
,
cache_salt
=
request
.
cache_salt
)
return
messages
,
[
engine_
promp
t
]
return
messages
,
[
engine_
inpu
t
]
def
create_error_response
(
self
,
...
...
@@ -450,7 +451,7 @@ class OpenAIServingRender:
request
:
Any
,
prompt_input
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]]
|
None
,
prompt_embeds
:
bytes
|
list
[
bytes
]
|
None
,
)
->
list
[
Processor
Input
s
]:
)
->
list
[
Engine
Input
]:
"""Copied from OpenAIServing._preprocess_completion."""
prompts
=
list
[
SingletonPrompt
|
bytes
]()
if
prompt_embeds
is
not
None
:
# embeds take higher priority
...
...
@@ -463,7 +464,7 @@ class OpenAIServingRender:
self
,
request
:
Any
,
prompts
:
Sequence
[
PromptType
|
bytes
],
)
->
list
[
Processor
Input
s
]:
)
->
list
[
Engine
Input
]:
"""Copied from OpenAIServing._preprocess_cmpl."""
renderer
=
self
.
renderer
model_config
=
self
.
model_config
...
...
@@ -497,7 +498,7 @@ class OpenAIServingRender:
default_template_kwargs
:
dict
[
str
,
Any
]
|
None
,
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tool_parser
:
type
[
ToolParser
]
|
None
=
None
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Processor
Input
s
]]:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Engine
Input
]]:
"""Copied from OpenAIServing._preprocess_chat."""
renderer
=
self
.
renderer
mm_config
=
self
.
model_config
.
multimodal_config
...
...
@@ -519,7 +520,7 @@ class OpenAIServingRender:
default_mm_processor_kwargs
=
getattr
(
request
,
"mm_processor_kwargs"
,
None
),
)
(
conversation
,),
(
engine_
promp
t
,)
=
await
renderer
.
render_chat_async
(
(
conversation
,),
(
engine_
inpu
t
,)
=
await
renderer
.
render_chat_async
(
[
messages
],
chat_params
,
tok_params
,
...
...
@@ -546,4 +547,4 @@ class OpenAIServingRender:
tokenizer
=
renderer
.
get_tokenizer
()
request
=
tool_parser
(
tokenizer
).
adjust_request
(
request
=
request
)
# type: ignore[arg-type]
return
conversation
,
[
engine_
promp
t
]
return
conversation
,
[
engine_
inpu
t
]
vllm/entrypoints/serve/tokenize/serving.py
View file @
ba2f0acc
...
...
@@ -20,7 +20,7 @@ from vllm.entrypoints.serve.tokenize.protocol import (
TokenizeResponse
,
TokenizerInfoResponse
,
)
from
vllm.inputs
import
TokensPrompt
,
token_input
s
from
vllm.inputs
import
TokensPrompt
,
token
s
_input
from
vllm.logger
import
init_logger
from
vllm.tokenizers
import
TokenizerLike
...
...
@@ -79,7 +79,7 @@ class OpenAIServingTokenization(OpenAIServing):
if
error_check_ret
is
not
None
:
return
error_check_ret
_
,
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
_
,
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
request
,
request
.
messages
,
default_template
=
self
.
chat_template
,
...
...
@@ -88,22 +88,22 @@ class OpenAIServingTokenization(OpenAIServing):
tool_dicts
=
tool_dicts
,
)
else
:
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
request
,
prompt_input
=
request
.
prompt
,
prompt_embeds
=
None
,
)
input_ids
:
list
[
int
]
=
[]
for
engine_
promp
t
in
engine_
promp
ts
:
for
engine_
inpu
t
in
engine_
inpu
ts
:
self
.
_log_inputs
(
request_id
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
None
,
lora_request
=
lora_request
,
)
prompt_components
=
self
.
_extract_prompt_components
(
engine_
promp
t
)
prompt_components
=
self
.
_extract_prompt_components
(
engine_
inpu
t
)
if
prompt_components
.
token_ids
is
not
None
:
input_ids
.
extend
(
prompt_components
.
token_ids
)
...
...
@@ -134,16 +134,16 @@ class OpenAIServingTokenization(OpenAIServing):
self
.
_log_inputs
(
request_id
,
token_input
s
(
request
.
tokens
),
token
s
_input
(
request
.
tokens
),
params
=
None
,
lora_request
=
lora_request
,
)
engine
_prompt
=
await
self
.
renderer
.
tokenize_prompt_async
(
tok
_prompt
=
await
self
.
renderer
.
tokenize_prompt_async
(
TokensPrompt
(
prompt_token_ids
=
request
.
tokens
),
request
.
build_tok_params
(
self
.
model_config
),
)
prompt_text
=
engine
_prompt
[
"prompt"
]
# type: ignore[typeddict-item]
prompt_text
=
tok
_prompt
[
"prompt"
]
# type: ignore[typeddict-item]
return
DetokenizeResponse
(
prompt
=
prompt_text
)
...
...
Prev
1
2
3
4
5
6
…
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment