Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ba2f0acc
Unverified
Commit
ba2f0acc
authored
Mar 26, 2026
by
Cyrus Leung
Committed by
GitHub
Mar 25, 2026
Browse files
[Misc] Reorganize inputs (#35182)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
678b3c99
Changes
141
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
260 additions
and
273 deletions
+260
-273
vllm/entrypoints/anthropic/serving.py
vllm/entrypoints/anthropic/serving.py
+4
-4
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+2
-1
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+22
-22
vllm/entrypoints/openai/chat_completion/serving.py
vllm/entrypoints/openai/chat_completion/serving.py
+11
-11
vllm/entrypoints/openai/completion/serving.py
vllm/entrypoints/openai/completion/serving.py
+15
-17
vllm/entrypoints/openai/engine/serving.py
vllm/entrypoints/openai/engine/serving.py
+13
-17
vllm/entrypoints/openai/realtime/serving.py
vllm/entrypoints/openai/realtime/serving.py
+3
-3
vllm/entrypoints/openai/responses/serving.py
vllm/entrypoints/openai/responses/serving.py
+22
-26
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
+17
-16
vllm/entrypoints/pooling/base/io_processor.py
vllm/entrypoints/pooling/base/io_processor.py
+10
-10
vllm/entrypoints/pooling/base/serving.py
vllm/entrypoints/pooling/base/serving.py
+8
-8
vllm/entrypoints/pooling/embed/io_processor.py
vllm/entrypoints/pooling/embed/io_processor.py
+20
-20
vllm/entrypoints/pooling/pooling/serving.py
vllm/entrypoints/pooling/pooling/serving.py
+9
-9
vllm/entrypoints/pooling/score/serving.py
vllm/entrypoints/pooling/score/serving.py
+36
-45
vllm/entrypoints/pooling/score/utils.py
vllm/entrypoints/pooling/score/utils.py
+7
-3
vllm/entrypoints/pooling/typing.py
vllm/entrypoints/pooling/typing.py
+2
-2
vllm/entrypoints/serve/disagg/protocol.py
vllm/entrypoints/serve/disagg/protocol.py
+9
-8
vllm/entrypoints/serve/disagg/serving.py
vllm/entrypoints/serve/disagg/serving.py
+3
-5
vllm/entrypoints/serve/render/serving.py
vllm/entrypoints/serve/render/serving.py
+38
-37
vllm/entrypoints/serve/tokenize/serving.py
vllm/entrypoints/serve/tokenize/serving.py
+9
-9
No files found.
vllm/entrypoints/anthropic/serving.py
View file @
ba2f0acc
...
@@ -797,12 +797,12 @@ class AnthropicServingMessages(OpenAIServingChat):
...
@@ -797,12 +797,12 @@ class AnthropicServingMessages(OpenAIServingChat):
if
isinstance
(
result
,
ErrorResponse
):
if
isinstance
(
result
,
ErrorResponse
):
return
result
return
result
_
,
engine_
promp
ts
=
result
_
,
engine_
inpu
ts
=
result
input_tokens
=
sum
(
# type: ignore
input_tokens
=
sum
(
# type: ignore
len
(
promp
t
[
"prompt_token_ids"
])
# type: ignore[typeddict-item, misc]
len
(
engine_inpu
t
[
"prompt_token_ids"
])
# type: ignore[typeddict-item, misc]
for
promp
t
in
engine_
promp
ts
for
engine_inpu
t
in
engine_
inpu
ts
if
"prompt_token_ids"
in
promp
t
if
"prompt_token_ids"
in
engine_inpu
t
)
)
response
=
AnthropicCountTokensResponse
(
response
=
AnthropicCountTokensResponse
(
...
...
vllm/entrypoints/chat_utils.py
View file @
ba2f0acc
...
@@ -40,9 +40,10 @@ from typing_extensions import Required, TypedDict
...
@@ -40,9 +40,10 @@ from typing_extensions import Required, TypedDict
from
vllm
import
envs
from
vllm
import
envs
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.inputs
import
MultiModalDataDict
,
MultiModalUUIDDict
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.models
import
SupportsMultiModal
from
vllm.model_executor.models
import
SupportsMultiModal
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalDataDict
,
MultiModalUUIDDict
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
from
vllm.multimodal.inputs
import
(
MultiModalBatchedField
,
MultiModalBatchedField
,
MultiModalFlatField
,
MultiModalFlatField
,
...
...
vllm/entrypoints/llm.py
View file @
ba2f0acc
...
@@ -57,9 +57,9 @@ from vllm.entrypoints.pooling.score.utils import (
...
@@ -57,9 +57,9 @@ from vllm.entrypoints.pooling.score.utils import (
validate_score_input
,
validate_score_input
,
)
)
from
vllm.entrypoints.utils
import
log_non_default_args
from
vllm.entrypoints.utils
import
log_non_default_args
from
vllm.inputs
.data
import
(
from
vllm.inputs
import
(
DataPrompt
,
DataPrompt
,
Processor
Input
s
,
Engine
Input
,
PromptType
,
PromptType
,
SingletonPrompt
,
SingletonPrompt
,
TextPrompt
,
TextPrompt
,
...
@@ -589,7 +589,7 @@ class LLM:
...
@@ -589,7 +589,7 @@ class LLM:
def
_resolve_mm_lora
(
def
_resolve_mm_lora
(
self
,
self
,
prompt
:
Processor
Input
s
,
prompt
:
Engine
Input
,
lora_request
:
LoRARequest
|
None
,
lora_request
:
LoRARequest
|
None
,
)
->
LoRARequest
|
None
:
)
->
LoRARequest
|
None
:
if
prompt
[
"type"
]
!=
"multimodal"
:
if
prompt
[
"type"
]
!=
"multimodal"
:
...
@@ -716,8 +716,8 @@ class LLM:
...
@@ -716,8 +716,8 @@ class LLM:
eos_token_id
=
tokenizer
.
eos_token_id
eos_token_id
=
tokenizer
.
eos_token_id
sort_beams_key
=
create_sort_beams_key_function
(
eos_token_id
,
length_penalty
)
sort_beams_key
=
create_sort_beams_key_function
(
eos_token_id
,
length_penalty
)
engine_
promp
ts
=
self
.
_preprocess_cmpl
(
prompts
)
engine_
inpu
ts
=
self
.
_preprocess_cmpl
(
prompts
)
lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
len
(
engine_
promp
ts
))
lora_requests
=
self
.
_lora_request_to_seq
(
lora_request
,
len
(
engine_
inpu
ts
))
if
use_tqdm
and
concurrency_limit
is
not
None
:
if
use_tqdm
and
concurrency_limit
is
not
None
:
logger
.
warning
(
logger
.
warning
(
...
@@ -727,7 +727,7 @@ class LLM:
...
@@ -727,7 +727,7 @@ class LLM:
use_tqdm
=
False
use_tqdm
=
False
if
concurrency_limit
is
None
:
if
concurrency_limit
is
None
:
concurrency_limit
=
len
(
engine_
promp
ts
)
concurrency_limit
=
len
(
engine_
inpu
ts
)
# generate 2 * beam_width candidates at each step
# generate 2 * beam_width candidates at each step
# following the huggingface transformers implementation
# following the huggingface transformers implementation
...
@@ -740,7 +740,7 @@ class LLM:
...
@@ -740,7 +740,7 @@ class LLM:
)
)
instances
:
list
[
BeamSearchInstance
]
=
[]
instances
:
list
[
BeamSearchInstance
]
=
[]
for
lora_req
,
prompt
in
zip
(
lora_requests
,
engine_
promp
ts
):
for
lora_req
,
prompt
in
zip
(
lora_requests
,
engine_
inpu
ts
):
if
prompt
[
"type"
]
==
"embeds"
:
if
prompt
[
"type"
]
==
"embeds"
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"Embedding prompt not supported for beam search"
"Embedding prompt not supported for beam search"
...
@@ -845,7 +845,7 @@ class LLM:
...
@@ -845,7 +845,7 @@ class LLM:
self
,
self
,
prompts
:
Sequence
[
PromptType
],
prompts
:
Sequence
[
PromptType
],
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Sequence
[
Processor
Input
s
]:
)
->
Sequence
[
Engine
Input
]:
"""
"""
Convert prompt inputs from LLM APIs (other than [LLM.chat][]) into
Convert prompt inputs from LLM APIs (other than [LLM.chat][]) into
a format that can be passed to `_add_request`.
a format that can be passed to `_add_request`.
...
@@ -853,7 +853,7 @@ class LLM:
...
@@ -853,7 +853,7 @@ class LLM:
Refer to [LLM.generate][] for a complete description of the arguments.
Refer to [LLM.generate][] for a complete description of the arguments.
Returns:
Returns:
A list of `
Processor
Input
s
` objects ready to be passed into LLMEngine.
A list of `
Engine
Input` objects ready to be passed into LLMEngine.
"""
"""
renderer
=
self
.
renderer
renderer
=
self
.
renderer
model_config
=
self
.
model_config
model_config
=
self
.
model_config
...
@@ -871,9 +871,9 @@ class LLM:
...
@@ -871,9 +871,9 @@ class LLM:
self
,
self
,
prompt
:
PromptType
,
prompt
:
PromptType
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Processor
Input
s
:
)
->
Engine
Input
:
(
engine_
promp
t
,)
=
self
.
_preprocess_cmpl
([
prompt
],
tokenization_kwargs
)
(
engine_
inpu
t
,)
=
self
.
_preprocess_cmpl
([
prompt
],
tokenization_kwargs
)
return
engine_
promp
t
return
engine_
inpu
t
def
_preprocess_chat
(
def
_preprocess_chat
(
self
,
self
,
...
@@ -886,7 +886,7 @@ class LLM:
...
@@ -886,7 +886,7 @@ class LLM:
tools
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tools
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Sequence
[
Processor
Input
s
]:
)
->
Sequence
[
Engine
Input
]:
"""
"""
Convert a list of conversations into prompts so that they can then
Convert a list of conversations into prompts so that they can then
be used as input for other LLM APIs.
be used as input for other LLM APIs.
...
@@ -894,7 +894,7 @@ class LLM:
...
@@ -894,7 +894,7 @@ class LLM:
Refer to [LLM.chat][] for a complete description of the arguments.
Refer to [LLM.chat][] for a complete description of the arguments.
Returns:
Returns:
A list of `
Processor
Input
s
` objects ready to be passed into LLMEngine.
A list of `
Engine
Input` objects ready to be passed into LLMEngine.
"""
"""
renderer
=
self
.
renderer
renderer
=
self
.
renderer
...
@@ -915,14 +915,14 @@ class LLM:
...
@@ -915,14 +915,14 @@ class LLM:
**
(
tokenization_kwargs
or
{})
**
(
tokenization_kwargs
or
{})
)
)
_
,
engine_
promp
ts
=
renderer
.
render_chat
(
_
,
engine_
inpu
ts
=
renderer
.
render_chat
(
conversations
,
conversations
,
chat_params
,
chat_params
,
tok_params
,
tok_params
,
prompt_extras
=
{
"mm_processor_kwargs"
:
mm_processor_kwargs
},
prompt_extras
=
{
"mm_processor_kwargs"
:
mm_processor_kwargs
},
)
)
return
engine_
promp
ts
return
engine_
inpu
ts
def
_preprocess_chat_one
(
def
_preprocess_chat_one
(
self
,
self
,
...
@@ -935,8 +935,8 @@ class LLM:
...
@@ -935,8 +935,8 @@ class LLM:
tools
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tools
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Processor
Input
s
:
)
->
Engine
Input
:
(
engine_
promp
t
,)
=
self
.
_preprocess_chat
(
(
engine_
inpu
t
,)
=
self
.
_preprocess_chat
(
[
conversation
],
[
conversation
],
chat_template
=
chat_template
,
chat_template
=
chat_template
,
chat_template_content_format
=
chat_template_content_format
,
chat_template_content_format
=
chat_template_content_format
,
...
@@ -948,7 +948,7 @@ class LLM:
...
@@ -948,7 +948,7 @@ class LLM:
mm_processor_kwargs
=
mm_processor_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
)
return
engine_
promp
t
return
engine_
inpu
t
def
chat
(
def
chat
(
self
,
self
,
...
@@ -1909,7 +1909,7 @@ class LLM:
...
@@ -1909,7 +1909,7 @@ class LLM:
def
_render_and_run_requests
(
def
_render_and_run_requests
(
self
,
self
,
prompts
:
Iterable
[
Processor
Input
s
],
prompts
:
Iterable
[
Engine
Input
],
params
:
Sequence
[
SamplingParams
|
PoolingParams
],
params
:
Sequence
[
SamplingParams
|
PoolingParams
],
output_type
:
type
[
_O
],
output_type
:
type
[
_O
],
*
,
*
,
...
@@ -1938,7 +1938,7 @@ class LLM:
...
@@ -1938,7 +1938,7 @@ class LLM:
def
_render_and_add_requests
(
def
_render_and_add_requests
(
self
,
self
,
prompts
:
Iterable
[
Processor
Input
s
],
prompts
:
Iterable
[
Engine
Input
],
params
:
Sequence
[
SamplingParams
|
PoolingParams
],
params
:
Sequence
[
SamplingParams
|
PoolingParams
],
*
,
*
,
lora_requests
:
Sequence
[
LoRARequest
|
None
]
|
None
=
None
,
lora_requests
:
Sequence
[
LoRARequest
|
None
]
|
None
=
None
,
...
@@ -1967,7 +1967,7 @@ class LLM:
...
@@ -1967,7 +1967,7 @@ class LLM:
def
_add_request
(
def
_add_request
(
self
,
self
,
prompt
:
Processor
Input
s
,
prompt
:
Engine
Input
,
params
:
SamplingParams
|
PoolingParams
,
params
:
SamplingParams
|
PoolingParams
,
lora_request
:
LoRARequest
|
None
=
None
,
lora_request
:
LoRARequest
|
None
=
None
,
priority
:
int
=
0
,
priority
:
int
=
0
,
...
...
vllm/entrypoints/openai/chat_completion/serving.py
View file @
ba2f0acc
...
@@ -63,7 +63,7 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
...
@@ -63,7 +63,7 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
)
)
from
vllm.entrypoints.openai.utils
import
maybe_filter_parallel_tool_calls
from
vllm.entrypoints.openai.utils
import
maybe_filter_parallel_tool_calls
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.inputs
.data
import
Processor
Input
s
from
vllm.inputs
import
Engine
Input
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
...
@@ -177,7 +177,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -177,7 +177,7 @@ class OpenAIServingChat(OpenAIServing):
async
def
render_chat_request
(
async
def
render_chat_request
(
self
,
self
,
request
:
ChatCompletionRequest
,
request
:
ChatCompletionRequest
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Processor
Input
s
]]
|
ErrorResponse
:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Engine
Input
]]
|
ErrorResponse
:
"""
"""
Validate the model and preprocess a chat completion request.
Validate the model and preprocess a chat completion request.
...
@@ -185,7 +185,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -185,7 +185,7 @@ class OpenAIServingChat(OpenAIServing):
engine-aware checks (LoRA model validation, engine health).
engine-aware checks (LoRA model validation, engine health).
Returns:
Returns:
A tuple of (conversation, engine_
promp
ts) on success,
A tuple of (conversation, engine_
inpu
ts) on success,
or an ErrorResponse on failure.
or an ErrorResponse on failure.
"""
"""
error_check_ret
=
await
self
.
_check_model
(
request
)
error_check_ret
=
await
self
.
_check_model
(
request
)
...
@@ -231,7 +231,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -231,7 +231,7 @@ class OpenAIServingChat(OpenAIServing):
if
isinstance
(
result
,
ErrorResponse
):
if
isinstance
(
result
,
ErrorResponse
):
return
result
return
result
conversation
,
engine_
promp
ts
=
result
conversation
,
engine_
inpu
ts
=
result
request_id
=
(
request_id
=
(
f
"chatcmpl-
{
self
.
_base_request_id
(
raw_request
,
request
.
request_id
)
}
"
f
"chatcmpl-
{
self
.
_base_request_id
(
raw_request
,
request
.
request_id
)
}
"
...
@@ -251,13 +251,13 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -251,13 +251,13 @@ class OpenAIServingChat(OpenAIServing):
# Schedule the request and get the result generator.
# Schedule the request and get the result generator.
max_model_len
=
self
.
model_config
.
max_model_len
max_model_len
=
self
.
model_config
.
max_model_len
generators
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
generators
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
for
i
,
engine_
promp
t
in
enumerate
(
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
engine_
inpu
ts
):
prompt_token_ids
=
self
.
_extract_prompt_components
(
engine_
promp
t
).
token_ids
prompt_token_ids
=
self
.
_extract_prompt_components
(
engine_
inpu
t
).
token_ids
# If we are creating sub requests for multiple prompts, ensure that they
# If we are creating sub requests for multiple prompts, ensure that they
# have unique request ids.
# have unique request ids.
sub_request_id
=
(
sub_request_id
=
(
request_id
if
len
(
engine_
promp
ts
)
==
1
else
f
"
{
request_id
}
_
{
i
}
"
request_id
if
len
(
engine_
inpu
ts
)
==
1
else
f
"
{
request_id
}
_
{
i
}
"
)
)
max_tokens
=
get_max_tokens
(
max_tokens
=
get_max_tokens
(
...
@@ -265,7 +265,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -265,7 +265,7 @@ class OpenAIServingChat(OpenAIServing):
request
.
max_completion_tokens
request
.
max_completion_tokens
if
request
.
max_completion_tokens
is
not
None
if
request
.
max_completion_tokens
is
not
None
else
request
.
max_tokens
,
else
request
.
max_tokens
,
self
.
_extract_prompt_len
(
engine_
promp
t
),
self
.
_extract_prompt_len
(
engine_
inpu
t
),
self
.
default_sampling_params
,
self
.
default_sampling_params
,
self
.
override_max_tokens
,
self
.
override_max_tokens
,
)
)
...
@@ -283,7 +283,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -283,7 +283,7 @@ class OpenAIServingChat(OpenAIServing):
self
.
_log_inputs
(
self
.
_log_inputs
(
sub_request_id
,
sub_request_id
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
sampling_params
,
params
=
sampling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
...
@@ -296,7 +296,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -296,7 +296,7 @@ class OpenAIServingChat(OpenAIServing):
if
isinstance
(
sampling_params
,
BeamSearchParams
):
if
isinstance
(
sampling_params
,
BeamSearchParams
):
generator
=
self
.
beam_search
(
generator
=
self
.
beam_search
(
prompt
=
engine_
promp
t
,
prompt
=
engine_
inpu
t
,
request_id
=
sub_request_id
,
request_id
=
sub_request_id
,
params
=
sampling_params
,
params
=
sampling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
@@ -313,7 +313,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -313,7 +313,7 @@ class OpenAIServingChat(OpenAIServing):
reasoning_ended
=
None
reasoning_ended
=
None
generator
=
self
.
engine_client
.
generate
(
generator
=
self
.
engine_client
.
generate
(
engine_
promp
t
,
engine_
inpu
t
,
sampling_params
,
sampling_params
,
sub_request_id
,
sub_request_id
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
...
vllm/entrypoints/openai/completion/serving.py
View file @
ba2f0acc
...
@@ -33,7 +33,7 @@ from vllm.entrypoints.openai.engine.serving import (
...
@@ -33,7 +33,7 @@ from vllm.entrypoints.openai.engine.serving import (
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.exceptions
import
VLLMValidationError
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs
.data
import
Processor
Input
s
from
vllm.inputs
import
Engine
Input
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
...
@@ -82,7 +82,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -82,7 +82,7 @@ class OpenAIServingCompletion(OpenAIServing):
async
def
render_completion_request
(
async
def
render_completion_request
(
self
,
self
,
request
:
CompletionRequest
,
request
:
CompletionRequest
,
)
->
list
[
Processor
Input
s
]
|
ErrorResponse
:
)
->
list
[
Engine
Input
]
|
ErrorResponse
:
"""
"""
Validate the model and preprocess a completion request.
Validate the model and preprocess a completion request.
...
@@ -90,8 +90,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -90,8 +90,7 @@ class OpenAIServingCompletion(OpenAIServing):
engine-aware checks (LoRA model validation, engine health).
engine-aware checks (LoRA model validation, engine health).
Returns:
Returns:
A list of engine_prompts on success,
A list of engine_inputs on success, or an ErrorResponse on failure.
or an ErrorResponse on failure.
"""
"""
error_check_ret
=
await
self
.
_check_model
(
request
)
error_check_ret
=
await
self
.
_check_model
(
request
)
if
error_check_ret
is
not
None
:
if
error_check_ret
is
not
None
:
...
@@ -128,7 +127,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -128,7 +127,7 @@ class OpenAIServingCompletion(OpenAIServing):
if
isinstance
(
result
,
ErrorResponse
):
if
isinstance
(
result
,
ErrorResponse
):
return
result
return
result
engine_
promp
ts
=
result
engine_
inpu
ts
=
result
request_id
=
f
"cmpl-
{
self
.
_base_request_id
(
raw_request
,
request
.
request_id
)
}
"
request_id
=
f
"cmpl-
{
self
.
_base_request_id
(
raw_request
,
request
.
request_id
)
}
"
created_time
=
int
(
time
.
time
())
created_time
=
int
(
time
.
time
())
...
@@ -145,11 +144,11 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -145,11 +144,11 @@ class OpenAIServingCompletion(OpenAIServing):
# Schedule the request and get the result generator.
# Schedule the request and get the result generator.
max_model_len
=
self
.
model_config
.
max_model_len
max_model_len
=
self
.
model_config
.
max_model_len
generators
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
generators
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
for
i
,
engine_
promp
t
in
enumerate
(
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
engine_
inpu
ts
):
max_tokens
=
get_max_tokens
(
max_tokens
=
get_max_tokens
(
max_model_len
,
max_model_len
,
request
.
max_tokens
,
request
.
max_tokens
,
self
.
_extract_prompt_len
(
engine_
promp
t
),
self
.
_extract_prompt_len
(
engine_
inpu
t
),
self
.
default_sampling_params
,
self
.
default_sampling_params
,
self
.
override_max_tokens
,
self
.
override_max_tokens
,
)
)
...
@@ -169,7 +168,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -169,7 +168,7 @@ class OpenAIServingCompletion(OpenAIServing):
self
.
_log_inputs
(
self
.
_log_inputs
(
request_id_item
,
request_id_item
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
sampling_params
,
params
=
sampling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
...
@@ -182,7 +181,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -182,7 +181,7 @@ class OpenAIServingCompletion(OpenAIServing):
if
isinstance
(
sampling_params
,
BeamSearchParams
):
if
isinstance
(
sampling_params
,
BeamSearchParams
):
generator
=
self
.
beam_search
(
generator
=
self
.
beam_search
(
prompt
=
engine_
promp
t
,
prompt
=
engine_
inpu
t
,
request_id
=
request_id
,
request_id
=
request_id
,
params
=
sampling_params
,
params
=
sampling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
@@ -190,7 +189,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -190,7 +189,7 @@ class OpenAIServingCompletion(OpenAIServing):
)
)
else
:
else
:
generator
=
self
.
engine_client
.
generate
(
generator
=
self
.
engine_client
.
generate
(
engine_
promp
t
,
engine_
inpu
t
,
sampling_params
,
sampling_params
,
request_id_item
,
request_id_item
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
@@ -204,7 +203,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -204,7 +203,7 @@ class OpenAIServingCompletion(OpenAIServing):
result_generator
=
merge_async_iterators
(
*
generators
)
result_generator
=
merge_async_iterators
(
*
generators
)
model_name
=
self
.
models
.
model_name
(
lora_request
)
model_name
=
self
.
models
.
model_name
(
lora_request
)
num_prompts
=
len
(
engine_
promp
ts
)
num_prompts
=
len
(
engine_
inpu
ts
)
# Streaming response
# Streaming response
tokenizer
=
self
.
renderer
.
tokenizer
tokenizer
=
self
.
renderer
.
tokenizer
...
@@ -212,7 +211,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -212,7 +211,7 @@ class OpenAIServingCompletion(OpenAIServing):
if
request
.
stream
:
if
request
.
stream
:
return
self
.
completion_stream_generator
(
return
self
.
completion_stream_generator
(
request
,
request
,
engine_
promp
ts
,
engine_
inpu
ts
,
result_generator
,
result_generator
,
request_id
,
request_id
,
created_time
,
created_time
,
...
@@ -235,8 +234,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -235,8 +234,7 @@ class OpenAIServingCompletion(OpenAIServing):
# We did not pass it into vLLM engine to avoid being redundant
# We did not pass it into vLLM engine to avoid being redundant
# with the inputs token IDs
# with the inputs token IDs
if
final_res
.
prompt
is
None
:
if
final_res
.
prompt
is
None
:
engine_prompt
=
engine_prompts
[
i
]
final_res
.
prompt
=
self
.
_extract_prompt_text
(
engine_inputs
[
i
])
final_res
.
prompt
=
self
.
_extract_prompt_text
(
engine_prompt
)
final_res_batch_checked
=
cast
(
list
[
RequestOutput
],
final_res_batch
)
final_res_batch_checked
=
cast
(
list
[
RequestOutput
],
final_res_batch
)
...
@@ -268,7 +266,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -268,7 +266,7 @@ class OpenAIServingCompletion(OpenAIServing):
async
def
completion_stream_generator
(
async
def
completion_stream_generator
(
self
,
self
,
request
:
CompletionRequest
,
request
:
CompletionRequest
,
engine_
promp
ts
:
list
[
Processor
Input
s
],
engine_
inpu
ts
:
list
[
Engine
Input
],
result_generator
:
AsyncIterator
[
tuple
[
int
,
RequestOutput
]],
result_generator
:
AsyncIterator
[
tuple
[
int
,
RequestOutput
]],
request_id
:
str
,
request_id
:
str
,
created_time
:
int
,
created_time
:
int
,
...
@@ -301,8 +299,8 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -301,8 +299,8 @@ class OpenAIServingCompletion(OpenAIServing):
prompt_text
=
res
.
prompt
prompt_text
=
res
.
prompt
if
prompt_text
is
None
:
if
prompt_text
is
None
:
engine_
promp
t
=
engine_
promp
ts
[
prompt_idx
]
engine_
inpu
t
=
engine_
inpu
ts
[
prompt_idx
]
prompt_text
=
self
.
_extract_prompt_text
(
engine_
promp
t
)
prompt_text
=
self
.
_extract_prompt_text
(
engine_
inpu
t
)
# Prompt details are excluded from later streamed outputs
# Prompt details are excluded from later streamed outputs
if
prompt_token_ids
is
not
None
:
if
prompt_token_ids
is
not
None
:
...
...
vllm/entrypoints/openai/engine/serving.py
View file @
ba2f0acc
...
@@ -72,11 +72,7 @@ from vllm.entrypoints.serve.tokenize.protocol import (
...
@@ -72,11 +72,7 @@ from vllm.entrypoints.serve.tokenize.protocol import (
)
)
from
vllm.entrypoints.utils
import
create_error_response
from
vllm.entrypoints.utils
import
create_error_response
from
vllm.exceptions
import
VLLMValidationError
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs.data
import
(
from
vllm.inputs
import
EngineInput
,
PromptType
,
TokensPrompt
ProcessorInputs
,
PromptType
,
TokensPrompt
,
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
,
PromptLogprobs
from
vllm.logprobs
import
Logprob
,
PromptLogprobs
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
...
@@ -163,7 +159,7 @@ class ServeContext(Generic[RequestT]):
...
@@ -163,7 +159,7 @@ class ServeContext(Generic[RequestT]):
request_id
:
str
request_id
:
str
created_time
:
int
=
field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
created_time
:
int
=
field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
lora_request
:
LoRARequest
|
None
=
None
lora_request
:
LoRARequest
|
None
=
None
engine_
promp
ts
:
list
[
Processor
Input
s
]
|
None
=
None
engine_
inpu
ts
:
list
[
Engine
Input
]
|
None
=
None
result_generator
:
AsyncGenerator
[
tuple
[
int
,
PoolingRequestOutput
],
None
]
|
None
=
(
result_generator
:
AsyncGenerator
[
tuple
[
int
,
PoolingRequestOutput
],
None
]
|
None
=
(
None
None
...
@@ -202,7 +198,7 @@ class OpenAIServing:
...
@@ -202,7 +198,7 @@ class OpenAIServing:
async
def
beam_search
(
async
def
beam_search
(
self
,
self
,
prompt
:
Processor
Input
s
,
prompt
:
Engine
Input
,
request_id
:
str
,
request_id
:
str
,
params
:
BeamSearchParams
,
params
:
BeamSearchParams
,
lora_request
:
LoRARequest
|
None
=
None
,
lora_request
:
LoRARequest
|
None
=
None
,
...
@@ -493,21 +489,21 @@ class OpenAIServing:
...
@@ -493,21 +489,21 @@ class OpenAIServing:
if
isinstance
(
pooling_params
,
ErrorResponse
):
if
isinstance
(
pooling_params
,
ErrorResponse
):
return
pooling_params
return
pooling_params
if
ctx
.
engine_
promp
ts
is
None
:
if
ctx
.
engine_
inpu
ts
is
None
:
return
self
.
create_error_response
(
"Engine prompts not available"
)
return
self
.
create_error_response
(
"Engine prompts not available"
)
for
i
,
engine_
promp
t
in
enumerate
(
ctx
.
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
ctx
.
engine_
inpu
ts
):
request_id_item
=
f
"
{
ctx
.
request_id
}
-
{
i
}
"
request_id_item
=
f
"
{
ctx
.
request_id
}
-
{
i
}
"
self
.
_log_inputs
(
self
.
_log_inputs
(
request_id_item
,
request_id_item
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
pooling_params
,
params
=
pooling_params
,
lora_request
=
ctx
.
lora_request
,
lora_request
=
ctx
.
lora_request
,
)
)
generator
=
self
.
engine_client
.
encode
(
generator
=
self
.
engine_client
.
encode
(
engine_
promp
t
,
engine_
inpu
t
,
pooling_params
,
pooling_params
,
request_id_item
,
request_id_item
,
lora_request
=
ctx
.
lora_request
,
lora_request
=
ctx
.
lora_request
,
...
@@ -526,10 +522,10 @@ class OpenAIServing:
...
@@ -526,10 +522,10 @@ class OpenAIServing:
ctx
:
ServeContext
,
ctx
:
ServeContext
,
)
->
ErrorResponse
|
None
:
)
->
ErrorResponse
|
None
:
"""Collect batch results from the result generator."""
"""Collect batch results from the result generator."""
if
ctx
.
engine_
promp
ts
is
None
:
if
ctx
.
engine_
inpu
ts
is
None
:
return
self
.
create_error_response
(
"Engine prompts not available"
)
return
self
.
create_error_response
(
"Engine prompts not available"
)
num_prompts
=
len
(
ctx
.
engine_
promp
ts
)
num_prompts
=
len
(
ctx
.
engine_
inpu
ts
)
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
final_res_batch
=
[
None
]
*
num_prompts
final_res_batch
=
[
None
]
*
num_prompts
...
@@ -806,19 +802,19 @@ class OpenAIServing:
...
@@ -806,19 +802,19 @@ class OpenAIServing:
# Apply server defaults first, then request kwargs override.
# Apply server defaults first, then request kwargs override.
return
default_chat_template_kwargs
|
request_chat_template_kwargs
return
default_chat_template_kwargs
|
request_chat_template_kwargs
def
_extract_prompt_components
(
self
,
prompt
:
PromptType
|
Processor
Input
s
):
def
_extract_prompt_components
(
self
,
prompt
:
PromptType
|
Engine
Input
):
return
extract_prompt_components
(
self
.
model_config
,
prompt
)
return
extract_prompt_components
(
self
.
model_config
,
prompt
)
def
_extract_prompt_text
(
self
,
prompt
:
Pro
cessor
Input
s
):
def
_extract_prompt_text
(
self
,
prompt
:
Pro
mptType
|
Engine
Input
):
return
self
.
_extract_prompt_components
(
prompt
).
text
return
self
.
_extract_prompt_components
(
prompt
).
text
def
_extract_prompt_len
(
self
,
prompt
:
Processor
Input
s
):
def
_extract_prompt_len
(
self
,
prompt
:
Engine
Input
):
return
extract_prompt_len
(
self
.
model_config
,
prompt
)
return
extract_prompt_len
(
self
.
model_config
,
prompt
)
def
_log_inputs
(
def
_log_inputs
(
self
,
self
,
request_id
:
str
,
request_id
:
str
,
inputs
:
PromptType
|
Processor
Input
s
,
inputs
:
PromptType
|
Engine
Input
,
params
:
SamplingParams
|
PoolingParams
|
BeamSearchParams
|
None
,
params
:
SamplingParams
|
PoolingParams
|
BeamSearchParams
|
None
,
lora_request
:
LoRARequest
|
None
,
lora_request
:
LoRARequest
|
None
,
)
->
None
:
)
->
None
:
...
...
vllm/entrypoints/openai/realtime/serving.py
View file @
ba2f0acc
...
@@ -12,7 +12,7 @@ from vllm.engine.protocol import EngineClient, StreamingInput
...
@@ -12,7 +12,7 @@ from vllm.engine.protocol import EngineClient, StreamingInput
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.engine.serving
import
OpenAIServing
from
vllm.entrypoints.openai.engine.serving
import
OpenAIServing
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.inputs
.data
import
PromptType
from
vllm.inputs
import
PromptType
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.models.interfaces
import
SupportsRealtime
from
vllm.model_executor.models.interfaces
import
SupportsRealtime
from
vllm.renderers.inputs.preprocess
import
parse_model_prompt
from
vllm.renderers.inputs.preprocess
import
parse_model_prompt
...
@@ -83,6 +83,6 @@ class OpenAIServingRealtime(OpenAIServing):
...
@@ -83,6 +83,6 @@ class OpenAIServingRealtime(OpenAIServing):
async
for
prompt
in
stream_input_iter
:
async
for
prompt
in
stream_input_iter
:
parsed_prompt
=
parse_model_prompt
(
model_config
,
prompt
)
parsed_prompt
=
parse_model_prompt
(
model_config
,
prompt
)
(
engine_
promp
t
,)
=
await
renderer
.
render_cmpl_async
([
parsed_prompt
])
(
engine_
inpu
t
,)
=
await
renderer
.
render_cmpl_async
([
parsed_prompt
])
yield
StreamingInput
(
prompt
=
engine_
promp
t
)
yield
StreamingInput
(
prompt
=
engine_
inpu
t
)
vllm/entrypoints/openai/responses/serving.py
View file @
ba2f0acc
...
@@ -110,7 +110,7 @@ from vllm.entrypoints.openai.responses.utils import (
...
@@ -110,7 +110,7 @@ from vllm.entrypoints.openai.responses.utils import (
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.exceptions
import
VLLMValidationError
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs
.data
import
Processor
Input
s
,
token_input
s
from
vllm.inputs
import
Engine
Input
,
token
s
_input
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
as
SampleLogprob
from
vllm.logprobs
import
Logprob
as
SampleLogprob
from
vllm.logprobs
import
SampleLogprobs
from
vllm.logprobs
import
SampleLogprobs
...
@@ -269,10 +269,10 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -269,10 +269,10 @@ class OpenAIServingResponses(OpenAIServing):
def
_validate_generator_input
(
def
_validate_generator_input
(
self
,
self
,
engine_
prompt
:
Processor
Input
s
,
engine_
input
:
Engine
Input
,
)
->
ErrorResponse
|
None
:
)
->
ErrorResponse
|
None
:
"""Add validations to the input to the generator here."""
"""Add validations to the input to the generator here."""
prompt_len
=
self
.
_extract_prompt_len
(
engine_
promp
t
)
prompt_len
=
self
.
_extract_prompt_len
(
engine_
inpu
t
)
max_model_len
=
self
.
model_config
.
max_model_len
max_model_len
=
self
.
model_config
.
max_model_len
if
prompt_len
>=
max_model_len
:
if
prompt_len
>=
max_model_len
:
...
@@ -369,11 +369,11 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -369,11 +369,11 @@ class OpenAIServingResponses(OpenAIServing):
model_name
=
self
.
models
.
model_name
(
lora_request
)
model_name
=
self
.
models
.
model_name
(
lora_request
)
if
self
.
use_harmony
:
if
self
.
use_harmony
:
messages
,
engine_
promp
ts
=
self
.
_make_request_with_harmony
(
messages
,
engine_
inpu
ts
=
self
.
_make_request_with_harmony
(
request
,
prev_response
request
,
prev_response
)
)
else
:
else
:
messages
,
engine_
promp
ts
=
await
self
.
_make_request
(
request
,
prev_response
)
messages
,
engine_
inpu
ts
=
await
self
.
_make_request
(
request
,
prev_response
)
request_metadata
=
RequestResponseMetadata
(
request_id
=
request
.
request_id
)
request_metadata
=
RequestResponseMetadata
(
request_id
=
request
.
request_id
)
if
raw_request
:
if
raw_request
:
...
@@ -413,15 +413,15 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -413,15 +413,15 @@ class OpenAIServingResponses(OpenAIServing):
available_tools
=
[]
available_tools
=
[]
tokenizer
=
self
.
renderer
.
get_tokenizer
()
tokenizer
=
self
.
renderer
.
get_tokenizer
()
for
engine_
promp
t
in
engine_
promp
ts
:
for
engine_
inpu
t
in
engine_
inpu
ts
:
maybe_error
=
self
.
_validate_generator_input
(
engine_
promp
t
)
maybe_error
=
self
.
_validate_generator_input
(
engine_
inpu
t
)
if
maybe_error
is
not
None
:
if
maybe_error
is
not
None
:
return
maybe_error
return
maybe_error
default_max_tokens
=
get_max_tokens
(
default_max_tokens
=
get_max_tokens
(
max_model_len
,
max_model_len
,
request
.
max_output_tokens
,
request
.
max_output_tokens
,
self
.
_extract_prompt_len
(
engine_
promp
t
),
self
.
_extract_prompt_len
(
engine_
inpu
t
),
self
.
default_sampling_params
,
self
.
default_sampling_params
,
self
.
override_max_tokens
,
self
.
override_max_tokens
,
)
)
...
@@ -480,7 +480,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -480,7 +480,7 @@ class OpenAIServingResponses(OpenAIServing):
)
)
generator
=
self
.
_generate_with_builtin_tools
(
generator
=
self
.
_generate_with_builtin_tools
(
request_id
=
request
.
request_id
,
request_id
=
request
.
request_id
,
engine_
promp
t
=
engine_
promp
t
,
engine_
inpu
t
=
engine_
inpu
t
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
context
=
context
,
context
=
context
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
@@ -586,7 +586,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -586,7 +586,7 @@ class OpenAIServingResponses(OpenAIServing):
prev_response_output
=
prev_response
.
output
if
prev_response
else
None
,
prev_response_output
=
prev_response
.
output
if
prev_response
else
None
,
)
)
_
,
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
_
,
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
request
,
request
,
messages
,
messages
,
default_template
=
self
.
chat_template
,
default_template
=
self
.
chat_template
,
...
@@ -595,7 +595,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -595,7 +595,7 @@ class OpenAIServingResponses(OpenAIServing):
tool_dicts
=
tool_dicts
,
tool_dicts
=
tool_dicts
,
tool_parser
=
self
.
parser
.
tool_parser_cls
if
self
.
parser
else
None
,
tool_parser
=
self
.
parser
.
tool_parser_cls
if
self
.
parser
else
None
,
)
)
return
messages
,
engine_
promp
ts
return
messages
,
engine_
inpu
ts
async
def
_render_next_turn
(
async
def
_render_next_turn
(
self
,
self
,
...
@@ -610,7 +610,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -610,7 +610,7 @@ class OpenAIServingResponses(OpenAIServing):
request_input
=
messages
,
request_input
=
messages
,
)
)
_
,
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
_
,
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
request
,
request
,
new_messages
,
new_messages
,
default_template
=
chat_template
,
default_template
=
chat_template
,
...
@@ -619,12 +619,12 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -619,12 +619,12 @@ class OpenAIServingResponses(OpenAIServing):
tool_dicts
=
tool_dicts
,
tool_dicts
=
tool_dicts
,
tool_parser
=
tool_parser
,
tool_parser
=
tool_parser
,
)
)
return
engine_
promp
ts
return
engine_
inpu
ts
async
def
_generate_with_builtin_tools
(
async
def
_generate_with_builtin_tools
(
self
,
self
,
request_id
:
str
,
request_id
:
str
,
engine_
prompt
:
Processor
Input
s
,
engine_
input
:
Engine
Input
,
sampling_params
:
SamplingParams
,
sampling_params
:
SamplingParams
,
context
:
ConversationContext
,
context
:
ConversationContext
,
lora_request
:
LoRARequest
|
None
=
None
,
lora_request
:
LoRARequest
|
None
=
None
,
...
@@ -641,13 +641,13 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -641,13 +641,13 @@ class OpenAIServingResponses(OpenAIServing):
self
.
_log_inputs
(
self
.
_log_inputs
(
sub_request_id
,
sub_request_id
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
sampling_params
,
params
=
sampling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
generator
=
self
.
engine_client
.
generate
(
generator
=
self
.
engine_client
.
generate
(
engine_
promp
t
,
engine_
inpu
t
,
sampling_params
,
sampling_params
,
sub_request_id
,
sub_request_id
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
@@ -675,11 +675,11 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -675,11 +675,11 @@ class OpenAIServingResponses(OpenAIServing):
# Render the next prompt token ids and update sampling_params.
# Render the next prompt token ids and update sampling_params.
if
isinstance
(
context
,
(
HarmonyContext
,
StreamingHarmonyContext
)):
if
isinstance
(
context
,
(
HarmonyContext
,
StreamingHarmonyContext
)):
token_ids
=
context
.
render_for_completion
()
token_ids
=
context
.
render_for_completion
()
engine_
promp
t
=
token_input
s
(
token_ids
)
engine_
inpu
t
=
token
s
_input
(
token_ids
)
sampling_params
.
max_tokens
=
max_model_len
-
len
(
token_ids
)
sampling_params
.
max_tokens
=
max_model_len
-
len
(
token_ids
)
elif
isinstance
(
context
,
ParsableContext
):
elif
isinstance
(
context
,
ParsableContext
):
(
engine_
promp
t
,)
=
await
self
.
_render_next_turn
(
(
engine_
inpu
t
,)
=
await
self
.
_render_next_turn
(
context
.
request
,
context
.
request
,
context
.
parser
.
response_messages
,
context
.
parser
.
response_messages
,
context
.
tool_dicts
,
context
.
tool_dicts
,
...
@@ -691,7 +691,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -691,7 +691,7 @@ class OpenAIServingResponses(OpenAIServing):
sampling_params
.
max_tokens
=
get_max_tokens
(
sampling_params
.
max_tokens
=
get_max_tokens
(
max_model_len
,
max_model_len
,
context
.
request
.
max_output_tokens
,
context
.
request
.
max_output_tokens
,
self
.
_extract_prompt_len
(
engine_
promp
t
),
self
.
_extract_prompt_len
(
engine_
inpu
t
),
self
.
default_sampling_params
,
# type: ignore
self
.
default_sampling_params
,
# type: ignore
self
.
override_max_tokens
,
# type: ignore
self
.
override_max_tokens
,
# type: ignore
)
)
...
@@ -713,14 +713,10 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -713,14 +713,10 @@ class OpenAIServingResponses(OpenAIServing):
arrival_time
=
time
.
time
()
arrival_time
=
time
.
time
()
messages
=
self
.
_construct_input_messages_with_harmony
(
request
,
prev_response
)
messages
=
self
.
_construct_input_messages_with_harmony
(
request
,
prev_response
)
prompt_token_ids
=
render_for_completion
(
messages
)
prompt_token_ids
=
render_for_completion
(
messages
)
engine_
promp
t
=
token_input
s
(
prompt_token_ids
)
engine_
inpu
t
=
token
s
_input
(
prompt_token_ids
,
cache_salt
=
request
.
cache_salt
)
engine_
promp
t
[
"arrival_time"
]
=
arrival_time
engine_
inpu
t
[
"arrival_time"
]
=
arrival_time
# Add cache_salt if provided in the request
return
messages
,
[
engine_input
]
if
request
.
cache_salt
is
not
None
:
engine_prompt
[
"cache_salt"
]
=
request
.
cache_salt
return
messages
,
[
engine_prompt
]
async
def
_initialize_tool_sessions
(
async
def
_initialize_tool_sessions
(
self
,
self
,
...
...
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
View file @
ba2f0acc
...
@@ -38,7 +38,7 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
...
@@ -38,7 +38,7 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
)
)
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.exceptions
import
VLLMValidationError
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs
import
EncoderDecoderInput
s
,
Processor
Input
s
from
vllm.inputs
import
EncoderDecoderInput
,
Engine
Input
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
FlatLogprobs
,
Logprob
from
vllm.logprobs
import
FlatLogprobs
,
Logprob
from
vllm.model_executor.models
import
SupportsTranscription
from
vllm.model_executor.models
import
SupportsTranscription
...
@@ -171,7 +171,7 @@ class OpenAISpeechToText(OpenAIServing):
...
@@ -171,7 +171,7 @@ class OpenAISpeechToText(OpenAIServing):
request
:
SpeechToTextRequest
,
request
:
SpeechToTextRequest
,
audio_data
:
bytes
,
audio_data
:
bytes
,
request_id
:
str
,
request_id
:
str
,
)
->
tuple
[
list
[
Processor
Input
s
],
float
]:
)
->
tuple
[
list
[
Engine
Input
],
float
]:
# Validate request
# Validate request
language
=
self
.
model_cls
.
validate_language
(
request
.
language
)
language
=
self
.
model_cls
.
validate_language
(
request
.
language
)
# Skip to_language validation to avoid extra logging for Whisper.
# Skip to_language validation to avoid extra logging for Whisper.
...
@@ -250,9 +250,9 @@ class OpenAISpeechToText(OpenAIServing):
...
@@ -250,9 +250,9 @@ class OpenAISpeechToText(OpenAIServing):
parsed_prompts
.
append
(
parsed_prompt
)
parsed_prompts
.
append
(
parsed_prompt
)
engine_
promp
ts
=
await
self
.
renderer
.
render_cmpl_async
(
parsed_prompts
)
engine_
inpu
ts
=
await
self
.
renderer
.
render_cmpl_async
(
parsed_prompts
)
return
engine_
promp
ts
,
duration
return
engine_
inpu
ts
,
duration
def
_preprocess_verbose_prompt
(
self
,
prompt
:
EncoderDecoderDictPrompt
):
def
_preprocess_verbose_prompt
(
self
,
prompt
:
EncoderDecoderDictPrompt
):
dec_prompt
=
prompt
[
"decoder_prompt"
]
dec_prompt
=
prompt
[
"decoder_prompt"
]
...
@@ -271,7 +271,7 @@ class OpenAISpeechToText(OpenAIServing):
...
@@ -271,7 +271,7 @@ class OpenAISpeechToText(OpenAIServing):
return
prompt
return
prompt
@
staticmethod
@
staticmethod
def
_get_decoder_prompt_len
(
engine_
promp
ts
:
list
[
Processor
Input
s
])
->
int
:
def
_get_decoder_prompt_len
(
engine_
inpu
ts
:
list
[
Engine
Input
])
->
int
:
"""Get the length of the decoder prompt. Currently we need to offset
"""Get the length of the decoder prompt. Currently we need to offset
by the decoder prompt length when running beam search because the mm
by the decoder prompt length when running beam search because the mm
encoder is not currently cached and runs on decode calls; because of
encoder is not currently cached and runs on decode calls; because of
...
@@ -282,12 +282,13 @@ class OpenAISpeechToText(OpenAIServing):
...
@@ -282,12 +282,13 @@ class OpenAISpeechToText(OpenAIServing):
encoder/decoder caching is implemented.
encoder/decoder caching is implemented.
"""
"""
input_len
=
0
input_len
=
0
assert
len
(
engine_prompts
)
>
0
assert
len
(
engine_inputs
)
>
0
first_eng_prompt
=
engine_prompts
[
0
]
first_input
=
engine_inputs
[
0
]
if
first_input
.
get
(
"type"
)
==
"enc_dec"
:
first_input
=
cast
(
EncoderDecoderInput
,
first_input
)
input_len
=
len
(
first_input
[
"decoder_prompt"
][
"prompt_token_ids"
])
if
first_eng_prompt
.
get
(
"type"
)
==
"enc_dec"
:
first_eng_prompt
=
cast
(
EncoderDecoderInputs
,
first_eng_prompt
)
input_len
=
len
(
first_eng_prompt
[
"decoder_prompt"
][
"prompt_token_ids"
])
return
input_len
return
input_len
def
_get_verbose_segments
(
def
_get_verbose_segments
(
...
@@ -409,7 +410,7 @@ class OpenAISpeechToText(OpenAIServing):
...
@@ -409,7 +410,7 @@ class OpenAISpeechToText(OpenAIServing):
lora_request
=
self
.
_maybe_get_adapters
(
request
)
lora_request
=
self
.
_maybe_get_adapters
(
request
)
engine_
promp
ts
,
duration_s
=
await
self
.
_preprocess_speech_to_text
(
engine_
inpu
ts
,
duration_s
=
await
self
.
_preprocess_speech_to_text
(
request
=
request
,
request
=
request
,
audio_data
=
audio_data
,
audio_data
=
audio_data
,
request_id
=
request_id
,
request_id
=
request_id
,
...
@@ -420,7 +421,7 @@ class OpenAISpeechToText(OpenAIServing):
...
@@ -420,7 +421,7 @@ class OpenAISpeechToText(OpenAIServing):
list_result_generator
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
|
None
=
None
list_result_generator
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
|
None
=
None
input_len
=
(
input_len
=
(
OpenAISpeechToText
.
_get_decoder_prompt_len
(
engine_
promp
ts
)
OpenAISpeechToText
.
_get_decoder_prompt_len
(
engine_
inpu
ts
)
if
request
.
use_beam_search
if
request
.
use_beam_search
else
0
else
0
)
)
...
@@ -450,12 +451,12 @@ class OpenAISpeechToText(OpenAIServing):
...
@@ -450,12 +451,12 @@ class OpenAISpeechToText(OpenAIServing):
sampling_params
.
logprobs
=
1
sampling_params
.
logprobs
=
1
list_result_generator
=
[]
list_result_generator
=
[]
for
i
,
engine_
promp
t
in
enumerate
(
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
engine_
inpu
ts
):
request_id_item
=
f
"
{
request_id
}
_
{
i
}
"
request_id_item
=
f
"
{
request_id
}
_
{
i
}
"
self
.
_log_inputs
(
self
.
_log_inputs
(
request_id_item
,
request_id_item
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
sampling_params
,
params
=
sampling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
...
@@ -468,7 +469,7 @@ class OpenAISpeechToText(OpenAIServing):
...
@@ -468,7 +469,7 @@ class OpenAISpeechToText(OpenAIServing):
if
isinstance
(
sampling_params
,
BeamSearchParams
):
if
isinstance
(
sampling_params
,
BeamSearchParams
):
generator
=
self
.
beam_search
(
generator
=
self
.
beam_search
(
prompt
=
engine_
promp
t
,
prompt
=
engine_
inpu
t
,
params
=
sampling_params
,
params
=
sampling_params
,
request_id
=
request_id_item
,
request_id
=
request_id_item
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
@@ -476,7 +477,7 @@ class OpenAISpeechToText(OpenAIServing):
...
@@ -476,7 +477,7 @@ class OpenAISpeechToText(OpenAIServing):
)
)
else
:
else
:
generator
=
self
.
engine_client
.
generate
(
generator
=
self
.
engine_client
.
generate
(
engine_
promp
t
,
engine_
inpu
t
,
sampling_params
,
sampling_params
,
request_id_item
,
request_id_item
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
...
vllm/entrypoints/pooling/base/io_processor.py
View file @
ba2f0acc
...
@@ -18,7 +18,7 @@ from vllm.entrypoints.pooling.typing import (
...
@@ -18,7 +18,7 @@ from vllm.entrypoints.pooling.typing import (
PoolingCompletionLikeRequest
,
PoolingCompletionLikeRequest
,
PoolingServeContext
,
PoolingServeContext
,
)
)
from
vllm.inputs
.data
import
Processor
Input
s
,
SingletonPrompt
from
vllm.inputs
import
Engine
Input
,
SingletonPrompt
from
vllm.renderers
import
BaseRenderer
,
merge_kwargs
from
vllm.renderers
import
BaseRenderer
,
merge_kwargs
from
vllm.renderers.inputs.preprocess
import
parse_model_prompt
,
prompt_to_seq
from
vllm.renderers.inputs.preprocess
import
parse_model_prompt
,
prompt_to_seq
from
vllm.tool_parsers
import
ToolParser
from
vllm.tool_parsers
import
ToolParser
...
@@ -60,7 +60,7 @@ class PoolingIOProcessor:
...
@@ -60,7 +60,7 @@ class PoolingIOProcessor:
chat_template_kwargs
=
request
.
chat_template_kwargs
,
chat_template_kwargs
=
request
.
chat_template_kwargs
,
trust_request_chat_template
=
self
.
trust_request_chat_template
,
trust_request_chat_template
=
self
.
trust_request_chat_template
,
)
)
_
,
engine_
promp
ts
=
self
.
_preprocess_chat_online
(
_
,
engine_
inpu
ts
=
self
.
_preprocess_chat_online
(
request
,
request
,
request
.
messages
,
request
.
messages
,
default_template
=
self
.
chat_template
,
default_template
=
self
.
chat_template
,
...
@@ -68,7 +68,7 @@ class PoolingIOProcessor:
...
@@ -68,7 +68,7 @@ class PoolingIOProcessor:
default_template_kwargs
=
None
,
default_template_kwargs
=
None
,
)
)
elif
isinstance
(
request
,
PoolingCompletionLikeRequest
):
elif
isinstance
(
request
,
PoolingCompletionLikeRequest
):
engine_
promp
ts
=
self
.
_preprocess_completion_online
(
engine_
inpu
ts
=
self
.
_preprocess_completion_online
(
request
,
request
,
prompt_input
=
request
.
input
,
prompt_input
=
request
.
input
,
prompt_embeds
=
None
,
prompt_embeds
=
None
,
...
@@ -76,7 +76,7 @@ class PoolingIOProcessor:
...
@@ -76,7 +76,7 @@ class PoolingIOProcessor:
else
:
else
:
raise
ValueError
(
f
"Invalid
{
self
.
name
}
request type"
)
raise
ValueError
(
f
"Invalid
{
self
.
name
}
request type"
)
ctx
.
engine_
promp
ts
=
engine_
promp
ts
ctx
.
engine_
inpu
ts
=
engine_
inpu
ts
async
def
pre_process_online_async
(
self
,
ctx
:
PoolingServeContext
):
async
def
pre_process_online_async
(
self
,
ctx
:
PoolingServeContext
):
self
.
pre_process_online
(
ctx
)
self
.
pre_process_online
(
ctx
)
...
@@ -100,7 +100,7 @@ class PoolingIOProcessor:
...
@@ -100,7 +100,7 @@ class PoolingIOProcessor:
self
,
self
,
prompts
:
PromptType
|
Sequence
[
PromptType
],
prompts
:
PromptType
|
Sequence
[
PromptType
],
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Sequence
[
Processor
Input
s
]:
)
->
Sequence
[
Engine
Input
]:
return
self
.
_preprocess_completion_offline
(
return
self
.
_preprocess_completion_offline
(
prompts
=
prompts
,
tokenization_kwargs
=
tokenization_kwargs
prompts
=
prompts
,
tokenization_kwargs
=
tokenization_kwargs
)
)
...
@@ -128,7 +128,7 @@ class PoolingIOProcessor:
...
@@ -128,7 +128,7 @@ class PoolingIOProcessor:
request
:
RendererRequest
,
request
:
RendererRequest
,
prompt_input
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]]
|
None
,
prompt_input
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]]
|
None
,
prompt_embeds
:
bytes
|
list
[
bytes
]
|
None
,
prompt_embeds
:
bytes
|
list
[
bytes
]
|
None
,
)
->
list
[
Processor
Input
s
]:
)
->
list
[
Engine
Input
]:
renderer
=
self
.
renderer
renderer
=
self
.
renderer
model_config
=
self
.
model_config
model_config
=
self
.
model_config
...
@@ -167,7 +167,7 @@ class PoolingIOProcessor:
...
@@ -167,7 +167,7 @@ class PoolingIOProcessor:
default_template_kwargs
:
dict
[
str
,
Any
]
|
None
,
default_template_kwargs
:
dict
[
str
,
Any
]
|
None
,
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tool_parser
:
type
[
ToolParser
]
|
None
=
None
,
tool_parser
:
type
[
ToolParser
]
|
None
=
None
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Processor
Input
s
]]:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Engine
Input
]]:
renderer
=
self
.
renderer
renderer
=
self
.
renderer
default_template_kwargs
=
merge_kwargs
(
default_template_kwargs
=
merge_kwargs
(
...
@@ -188,7 +188,7 @@ class PoolingIOProcessor:
...
@@ -188,7 +188,7 @@ class PoolingIOProcessor:
default_media_io_kwargs
=
(
mm_config
.
media_io_kwargs
if
mm_config
else
None
),
default_media_io_kwargs
=
(
mm_config
.
media_io_kwargs
if
mm_config
else
None
),
)
)
(
conversation
,),
(
engine_
promp
t
,)
=
renderer
.
render_chat
(
(
conversation
,),
(
engine_
inpu
t
,)
=
renderer
.
render_chat
(
[
messages
],
[
messages
],
chat_params
,
chat_params
,
tok_params
,
tok_params
,
...
@@ -199,13 +199,13 @@ class PoolingIOProcessor:
...
@@ -199,13 +199,13 @@ class PoolingIOProcessor:
},
},
)
)
return
conversation
,
[
engine_
promp
t
]
return
conversation
,
[
engine_
inpu
t
]
def
_preprocess_completion_offline
(
def
_preprocess_completion_offline
(
self
,
self
,
prompts
:
PromptType
|
Sequence
[
PromptType
],
prompts
:
PromptType
|
Sequence
[
PromptType
],
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
Sequence
[
Processor
Input
s
]:
)
->
Sequence
[
Engine
Input
]:
renderer
=
self
.
renderer
renderer
=
self
.
renderer
model_config
=
self
.
model_config
model_config
=
self
.
model_config
...
...
vllm/entrypoints/pooling/base/serving.py
View file @
ba2f0acc
...
@@ -20,7 +20,7 @@ from vllm.entrypoints.openai.engine.protocol import ErrorResponse
...
@@ -20,7 +20,7 @@ from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.pooling.typing
import
AnyPoolingRequest
,
PoolingServeContext
from
vllm.entrypoints.pooling.typing
import
AnyPoolingRequest
,
PoolingServeContext
from
vllm.exceptions
import
VLLMNotFoundError
from
vllm.exceptions
import
VLLMNotFoundError
from
vllm.inputs
.data
import
Processor
Input
s
from
vllm.inputs
import
Engine
Input
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.renderers.base
import
BaseRenderer
from
vllm.renderers.base
import
BaseRenderer
from
vllm.renderers.inputs.preprocess
import
extract_prompt_components
from
vllm.renderers.inputs.preprocess
import
extract_prompt_components
...
@@ -106,7 +106,7 @@ class PoolingServing:
...
@@ -106,7 +106,7 @@ class PoolingServing:
self
,
self
,
ctx
:
PoolingServeContext
,
ctx
:
PoolingServeContext
,
):
):
if
ctx
.
engine_
promp
ts
is
None
:
if
ctx
.
engine_
inpu
ts
is
None
:
raise
ValueError
(
"Engine prompts not available"
)
raise
ValueError
(
"Engine prompts not available"
)
generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
...
@@ -120,7 +120,7 @@ class PoolingServing:
...
@@ -120,7 +120,7 @@ class PoolingServing:
pooling_params
=
self
.
io_processor
.
create_pooling_params
(
ctx
.
request
)
pooling_params
=
self
.
io_processor
.
create_pooling_params
(
ctx
.
request
)
pooling_params
.
verify
(
self
.
model_config
)
pooling_params
.
verify
(
self
.
model_config
)
for
i
,
engine_
promp
t
in
enumerate
(
ctx
.
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
ctx
.
engine_
inpu
ts
):
prompt_request_id
=
(
prompt_request_id
=
(
f
"
{
ctx
.
request_id
}
-
{
i
}
"
f
"
{
ctx
.
request_id
}
-
{
i
}
"
if
ctx
.
prompt_request_ids
is
None
if
ctx
.
prompt_request_ids
is
None
...
@@ -129,13 +129,13 @@ class PoolingServing:
...
@@ -129,13 +129,13 @@ class PoolingServing:
self
.
_log_inputs
(
self
.
_log_inputs
(
prompt_request_id
,
prompt_request_id
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
pooling_params
,
params
=
pooling_params
,
lora_request
=
ctx
.
lora_request
,
lora_request
=
ctx
.
lora_request
,
)
)
generator
=
self
.
engine_client
.
encode
(
generator
=
self
.
engine_client
.
encode
(
engine_
promp
t
,
engine_
inpu
t
,
pooling_params
,
pooling_params
,
prompt_request_id
,
prompt_request_id
,
lora_request
=
ctx
.
lora_request
,
lora_request
=
ctx
.
lora_request
,
...
@@ -151,13 +151,13 @@ class PoolingServing:
...
@@ -151,13 +151,13 @@ class PoolingServing:
self
,
self
,
ctx
:
PoolingServeContext
,
ctx
:
PoolingServeContext
,
):
):
if
ctx
.
engine_
promp
ts
is
None
:
if
ctx
.
engine_
inpu
ts
is
None
:
raise
ValueError
(
"Engine prompts not available"
)
raise
ValueError
(
"Engine prompts not available"
)
if
ctx
.
result_generator
is
None
:
if
ctx
.
result_generator
is
None
:
raise
ValueError
(
"Result generator not available"
)
raise
ValueError
(
"Result generator not available"
)
num_inputs
=
len
(
ctx
.
engine_
promp
ts
)
num_inputs
=
len
(
ctx
.
engine_
inpu
ts
)
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
final_res_batch
=
[
None
]
*
num_inputs
final_res_batch
=
[
None
]
*
num_inputs
...
@@ -317,7 +317,7 @@ class PoolingServing:
...
@@ -317,7 +317,7 @@ class PoolingServing:
def
_log_inputs
(
def
_log_inputs
(
self
,
self
,
request_id
:
str
,
request_id
:
str
,
inputs
:
Processor
Input
s
,
inputs
:
Engine
Input
,
params
:
PoolingParams
,
params
:
PoolingParams
,
lora_request
:
LoRARequest
|
None
,
lora_request
:
LoRARequest
|
None
,
)
->
None
:
)
->
None
:
...
...
vllm/entrypoints/pooling/embed/io_processor.py
View file @
ba2f0acc
...
@@ -24,7 +24,7 @@ from vllm.entrypoints.pooling.embed.protocol import (
...
@@ -24,7 +24,7 @@ from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingCompletionRequest
,
EmbeddingCompletionRequest
,
)
)
from
vllm.entrypoints.pooling.typing
import
PoolingServeContext
from
vllm.entrypoints.pooling.typing
import
PoolingServeContext
from
vllm.inputs
.data
import
Processor
Input
s
,
token_input
s
from
vllm.inputs
import
Engine
Input
,
token
s
_input
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.outputs
import
PoolingOutput
,
PoolingRequestOutput
from
vllm.outputs
import
PoolingOutput
,
PoolingRequestOutput
from
vllm.renderers
import
merge_kwargs
from
vllm.renderers
import
merge_kwargs
...
@@ -83,20 +83,20 @@ class EmbedIOProcessor(PoolingIOProcessor):
...
@@ -83,20 +83,20 @@ class EmbedIOProcessor(PoolingIOProcessor):
#################################################################
#################################################################
def
_pre_process_chunked
(
self
,
ctx
:
PoolingServeContext
)
->
None
:
def
_pre_process_chunked
(
self
,
ctx
:
PoolingServeContext
)
->
None
:
if
ctx
.
engine_
promp
ts
is
None
:
if
ctx
.
engine_
inpu
ts
is
None
:
raise
ValueError
(
"Engine prompts not available"
)
raise
ValueError
(
"Engine prompts not available"
)
ctx
.
intermediates
=
ctx
.
engine_
promp
ts
ctx
.
intermediates
=
ctx
.
engine_
inpu
ts
request_id
=
ctx
.
request_id
request_id
=
ctx
.
request_id
max_model_len
=
self
.
model_config
.
max_model_len
max_model_len
=
self
.
model_config
.
max_model_len
chunked_engine_
promp
ts
:
list
[
Processor
Input
s
]
=
[]
chunked_engine_
inpu
ts
:
list
[
Engine
Input
]
=
[]
prompt_request_ids
:
list
[
str
]
=
[]
prompt_request_ids
:
list
[
str
]
=
[]
for
prompt_idx
,
engine_
promp
t
in
enumerate
(
ctx
.
engine_
promp
ts
):
for
prompt_idx
,
engine_
inpu
t
in
enumerate
(
ctx
.
engine_
inpu
ts
):
token_ids
=
engine_
promp
t
.
get
(
"prompt_token_ids"
,
None
)
token_ids
=
engine_
inpu
t
.
get
(
"prompt_token_ids"
,
None
)
if
token_ids
is
None
:
if
token_ids
is
None
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"Long Text Embedding with Chunked Processing does "
"Long Text Embedding with Chunked Processing does "
"not support EmbedsPrompt and EncoderDecoderInput
s
."
"not support EmbedsPrompt and EncoderDecoderInput."
)
)
prompt_token_ids
=
cast
(
list
[
int
],
token_ids
)
prompt_token_ids
=
cast
(
list
[
int
],
token_ids
)
...
@@ -104,14 +104,14 @@ class EmbedIOProcessor(PoolingIOProcessor):
...
@@ -104,14 +104,14 @@ class EmbedIOProcessor(PoolingIOProcessor):
for
chunk_idx
,
chunk_tokens
in
enumerate
(
for
chunk_idx
,
chunk_tokens
in
enumerate
(
chunk_list
(
prompt_token_ids
,
max_model_len
)
chunk_list
(
prompt_token_ids
,
max_model_len
)
):
):
chunked_engine_
promp
ts
.
append
(
chunked_engine_
inpu
ts
.
append
(
token_input
s
(
prompt_token_ids
=
chunk_tokens
)
token
s
_input
(
prompt_token_ids
=
chunk_tokens
)
)
)
prompt_request_ids
.
append
(
prompt_request_ids
.
append
(
f
"
{
request_id
}
-prompt-
{
prompt_idx
}
-chunk-
{
chunk_idx
}
"
f
"
{
request_id
}
-prompt-
{
prompt_idx
}
-chunk-
{
chunk_idx
}
"
)
)
ctx
.
engine_
promp
ts
=
chunked_engine_
promp
ts
ctx
.
engine_
inpu
ts
=
chunked_engine_
inpu
ts
ctx
.
prompt_request_ids
=
prompt_request_ids
ctx
.
prompt_request_ids
=
prompt_request_ids
return
None
return
None
...
@@ -184,8 +184,8 @@ class EmbedIOProcessor(PoolingIOProcessor):
...
@@ -184,8 +184,8 @@ class EmbedIOProcessor(PoolingIOProcessor):
if
ctx
.
intermediates
is
None
:
if
ctx
.
intermediates
is
None
:
raise
ValueError
(
"Original prompts inputs not available"
)
raise
ValueError
(
"Original prompts inputs not available"
)
original_engine_
promp
ts
=
cast
(
list
[
Processor
Input
s
],
ctx
.
intermediates
)
original_engine_
inpu
ts
=
cast
(
list
[
Engine
Input
],
ctx
.
intermediates
)
num_prompts
=
len
(
original_engine_
promp
ts
)
num_prompts
=
len
(
original_engine_
inpu
ts
)
# Finalize aggregated results
# Finalize aggregated results
final_res_batch
:
list
[
PoolingRequestOutput
]
=
[]
final_res_batch
:
list
[
PoolingRequestOutput
]
=
[]
...
@@ -211,12 +211,12 @@ class EmbedIOProcessor(PoolingIOProcessor):
...
@@ -211,12 +211,12 @@ class EmbedIOProcessor(PoolingIOProcessor):
pooling_output_data
=
PoolingOutput
(
data
=
final_embedding
)
pooling_output_data
=
PoolingOutput
(
data
=
final_embedding
)
# Get original prompt token IDs for this prompt
# Get original prompt token IDs for this prompt
original_prompt
=
original_engine_
promp
ts
[
prompt_idx
]
original_prompt
=
original_engine_
inpu
ts
[
prompt_idx
]
token_ids
=
original_prompt
.
get
(
"prompt_token_ids"
,
None
)
token_ids
=
original_prompt
.
get
(
"prompt_token_ids"
,
None
)
if
token_ids
is
None
:
if
token_ids
is
None
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"Long Text Embedding with Chunked Processing does "
"Long Text Embedding with Chunked Processing does "
"not support EmbedsPrompt and EncoderDecoderInput
s
."
"not support EmbedsPrompt and EncoderDecoderInput."
)
)
original_token_ids
=
cast
(
list
[
int
],
token_ids
)
original_token_ids
=
cast
(
list
[
int
],
token_ids
)
...
@@ -372,7 +372,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
...
@@ -372,7 +372,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
]
]
for
uri
in
request
.
images
for
uri
in
request
.
images
]
]
ctx
.
engine_
promp
ts
=
self
.
_batch_render_chat
(
ctx
.
engine_
inpu
ts
=
self
.
_batch_render_chat
(
request
,
all_messages
,
truncate_prompt_tokens
,
truncation_side
request
,
all_messages
,
truncate_prompt_tokens
,
truncation_side
)
)
...
@@ -382,7 +382,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
...
@@ -382,7 +382,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
self
.
_mixed_input_to_messages
(
inp
,
task_prefix
=
task_prefix
)
self
.
_mixed_input_to_messages
(
inp
,
task_prefix
=
task_prefix
)
for
inp
in
request
.
inputs
for
inp
in
request
.
inputs
]
]
ctx
.
engine_
promp
ts
=
self
.
_batch_render_chat
(
ctx
.
engine_
inpu
ts
=
self
.
_batch_render_chat
(
request
,
all_messages
,
truncate_prompt_tokens
,
truncation_side
request
,
all_messages
,
truncate_prompt_tokens
,
truncation_side
)
)
...
@@ -396,7 +396,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
...
@@ -396,7 +396,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
truncate_prompt_tokens
=
truncate_prompt_tokens
,
truncate_prompt_tokens
=
truncate_prompt_tokens
,
truncation_side
=
truncation_side
,
truncation_side
=
truncation_side
,
)
)
ctx
.
engine_
promp
ts
=
self
.
_preprocess_completion_online
(
ctx
.
engine_
inpu
ts
=
self
.
_preprocess_completion_online
(
proxy
,
prompt_input
=
proxy
.
input
,
prompt_embeds
=
None
proxy
,
prompt_input
=
proxy
.
input
,
prompt_embeds
=
None
)
)
...
@@ -406,7 +406,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
...
@@ -406,7 +406,7 @@ class EmbedIOProcessor(PoolingIOProcessor):
all_messages
:
Sequence
[
list
[
ChatCompletionMessageParam
]],
all_messages
:
Sequence
[
list
[
ChatCompletionMessageParam
]],
truncate_prompt_tokens
:
int
|
None
,
truncate_prompt_tokens
:
int
|
None
,
truncation_side
:
Literal
[
"left"
,
"right"
]
|
None
,
truncation_side
:
Literal
[
"left"
,
"right"
]
|
None
,
)
->
list
[
Processor
Input
s
]:
)
->
list
[
Engine
Input
]:
"""Batch-render multiple conversations through the chat template."""
"""Batch-render multiple conversations through the chat template."""
if
not
all_messages
:
if
not
all_messages
:
return
[]
return
[]
...
@@ -438,8 +438,8 @@ class EmbedIOProcessor(PoolingIOProcessor):
...
@@ -438,8 +438,8 @@ class EmbedIOProcessor(PoolingIOProcessor):
default_media_io_kwargs
=
(
mm_config
.
media_io_kwargs
if
mm_config
else
None
),
default_media_io_kwargs
=
(
mm_config
.
media_io_kwargs
if
mm_config
else
None
),
)
)
_
,
engine_
promp
ts
=
renderer
.
render_chat
(
all_messages
,
chat_params
,
tok_params
)
_
,
engine_
inpu
ts
=
renderer
.
render_chat
(
all_messages
,
chat_params
,
tok_params
)
return
engine_
promp
ts
return
engine_
inpu
ts
def
_validate_input_type
(
self
,
input_type
:
str
|
None
)
->
None
:
def
_validate_input_type
(
self
,
input_type
:
str
|
None
)
->
None
:
"""Raise if *input_type* is not supported by this model."""
"""Raise if *input_type* is not supported by this model."""
...
...
vllm/entrypoints/pooling/pooling/serving.py
View file @
ba2f0acc
...
@@ -33,7 +33,7 @@ from vllm.entrypoints.pooling.utils import (
...
@@ -33,7 +33,7 @@ from vllm.entrypoints.pooling.utils import (
encode_pooling_output_float
,
encode_pooling_output_float
,
)
)
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.inputs
import
Processor
Input
s
from
vllm.inputs
import
Engine
Input
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.outputs
import
PoolingRequestOutput
from
vllm.outputs
import
PoolingRequestOutput
from
vllm.renderers.inputs.preprocess
import
prompt_to_seq
from
vllm.renderers.inputs.preprocess
import
prompt_to_seq
...
@@ -110,7 +110,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -110,7 +110,7 @@ class OpenAIServingPooling(OpenAIServing):
request
.
task
,
request
.
task
,
)
)
engine_
promp
ts
:
Sequence
[
Processor
Input
s
]
engine_
inpu
ts
:
Sequence
[
Engine
Input
]
if
use_io_processor
:
=
isinstance
(
request
,
IOProcessorRequest
):
if
use_io_processor
:
=
isinstance
(
request
,
IOProcessorRequest
):
if
self
.
io_processor
is
None
:
if
self
.
io_processor
is
None
:
raise
ValueError
(
raise
ValueError
(
...
@@ -125,7 +125,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -125,7 +125,7 @@ class OpenAIServingPooling(OpenAIServing):
raw_prompts
=
await
self
.
io_processor
.
pre_process_async
(
raw_prompts
=
await
self
.
io_processor
.
pre_process_async
(
prompt
=
validated_prompt
,
request_id
=
request_id
prompt
=
validated_prompt
,
request_id
=
request_id
)
)
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_cmpl
(
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_cmpl
(
request
,
request
,
prompt_to_seq
(
raw_prompts
),
prompt_to_seq
(
raw_prompts
),
)
)
...
@@ -138,7 +138,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -138,7 +138,7 @@ class OpenAIServingPooling(OpenAIServing):
if
error_check_ret
is
not
None
:
if
error_check_ret
is
not
None
:
return
error_check_ret
return
error_check_ret
_
,
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
_
,
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
request
,
request
,
request
.
messages
,
request
.
messages
,
default_template
=
self
.
chat_template
,
default_template
=
self
.
chat_template
,
...
@@ -146,7 +146,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -146,7 +146,7 @@ class OpenAIServingPooling(OpenAIServing):
default_template_kwargs
=
None
,
default_template_kwargs
=
None
,
)
)
elif
isinstance
(
request
,
PoolingCompletionRequest
):
elif
isinstance
(
request
,
PoolingCompletionRequest
):
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
request
,
request
,
prompt_input
=
request
.
input
,
prompt_input
=
request
.
input
,
prompt_embeds
=
None
,
prompt_embeds
=
None
,
...
@@ -165,12 +165,12 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -165,12 +165,12 @@ class OpenAIServingPooling(OpenAIServing):
else
:
else
:
pooling_params
=
request
.
to_pooling_params
()
# type: ignore
pooling_params
=
request
.
to_pooling_params
()
# type: ignore
for
i
,
engine_
promp
t
in
enumerate
(
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
engine_
inpu
ts
):
request_id_item
=
f
"
{
request_id
}
-
{
i
}
"
request_id_item
=
f
"
{
request_id
}
-
{
i
}
"
self
.
_log_inputs
(
self
.
_log_inputs
(
request_id_item
,
request_id_item
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
pooling_params
,
params
=
pooling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
...
@@ -182,7 +182,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -182,7 +182,7 @@ class OpenAIServingPooling(OpenAIServing):
)
)
generator
=
self
.
engine_client
.
encode
(
generator
=
self
.
engine_client
.
encode
(
engine_
promp
t
,
engine_
inpu
t
,
pooling_params
,
pooling_params
,
request_id_item
,
request_id_item
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
@@ -221,7 +221,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -221,7 +221,7 @@ class OpenAIServingPooling(OpenAIServing):
return
IOProcessorResponse
(
request_id
=
request_id
,
data
=
output
)
return
IOProcessorResponse
(
request_id
=
request_id
,
data
=
output
)
assert
isinstance
(
request
,
(
PoolingCompletionRequest
,
PoolingChatRequest
))
assert
isinstance
(
request
,
(
PoolingCompletionRequest
,
PoolingChatRequest
))
num_prompts
=
len
(
engine_
promp
ts
)
num_prompts
=
len
(
engine_
inpu
ts
)
# Non-streaming response
# Non-streaming response
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
...
...
vllm/entrypoints/pooling/score/serving.py
View file @
ba2f0acc
...
@@ -35,7 +35,7 @@ from vllm.entrypoints.pooling.score.utils import (
...
@@ -35,7 +35,7 @@ from vllm.entrypoints.pooling.score.utils import (
parse_score_data_single
,
parse_score_data_single
,
validate_score_input
,
validate_score_input
,
)
)
from
vllm.inputs
.data
import
Processor
Input
s
,
TokensPrompt
,
token_input
s
from
vllm.inputs
import
Engine
Input
,
TokensPrompt
,
token
s
_input
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.outputs
import
PoolingRequestOutput
,
ScoringRequestOutput
from
vllm.outputs
import
PoolingRequestOutput
,
ScoringRequestOutput
...
@@ -110,12 +110,12 @@ class ServingScores(OpenAIServing):
...
@@ -110,12 +110,12 @@ class ServingScores(OpenAIServing):
*
(
encode_async
(
t
,
**
tokenization_kwargs
)
for
t
in
input_texts
)
*
(
encode_async
(
t
,
**
tokenization_kwargs
)
for
t
in
input_texts
)
)
)
engine_
promp
ts
:
list
[
Processor
Input
s
]
=
[]
engine_
inpu
ts
:
list
[
Engine
Input
]
=
[]
for
tok_result
,
input_text
in
zip
(
tokenized_prompts
,
input_texts
):
for
tok_result
,
input_text
in
zip
(
tokenized_prompts
,
input_texts
):
text_token_prompt
=
self
.
_validate_input
(
request
,
tok_result
,
input_text
)
text_token_prompt
=
self
.
_validate_input
(
request
,
tok_result
,
input_text
)
engine_
promp
ts
.
append
(
engine_
inpu
ts
.
append
(
token_input
s
(
token
s
_input
(
text_token_prompt
[
"prompt_token_ids"
],
text_token_prompt
[
"prompt_token_ids"
],
prompt
=
input_text
,
prompt
=
input_text
,
)
)
...
@@ -125,19 +125,19 @@ class ServingScores(OpenAIServing):
...
@@ -125,19 +125,19 @@ class ServingScores(OpenAIServing):
generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
pooling_params
=
request
.
to_pooling_params
(
"embed"
)
pooling_params
=
request
.
to_pooling_params
(
"embed"
)
for
i
,
engine_
promp
t
in
enumerate
(
engine_
promp
ts
):
for
i
,
engine_
inpu
t
in
enumerate
(
engine_
inpu
ts
):
request_id_item
=
f
"
{
request_id
}
-
{
i
}
"
request_id_item
=
f
"
{
request_id
}
-
{
i
}
"
self
.
_log_inputs
(
self
.
_log_inputs
(
request_id_item
,
request_id_item
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
pooling_params
,
params
=
pooling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
generators
.
append
(
generators
.
append
(
self
.
engine_client
.
encode
(
self
.
engine_client
.
encode
(
engine_
promp
t
,
engine_
inpu
t
,
pooling_params
,
pooling_params
,
request_id_item
,
request_id_item
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
@@ -151,7 +151,7 @@ class ServingScores(OpenAIServing):
...
@@ -151,7 +151,7 @@ class ServingScores(OpenAIServing):
# Non-streaming response
# Non-streaming response
final_res_batch
:
list
[
PoolingRequestOutput
]
=
[]
final_res_batch
:
list
[
PoolingRequestOutput
]
=
[]
embeddings
:
list
[
PoolingRequestOutput
|
None
]
=
[
None
]
*
len
(
engine_
promp
ts
)
embeddings
:
list
[
PoolingRequestOutput
|
None
]
=
[
None
]
*
len
(
engine_
inpu
ts
)
async
for
i
,
res
in
result_generator
:
async
for
i
,
res
in
result_generator
:
embeddings
[
i
]
=
res
embeddings
[
i
]
=
res
...
@@ -183,7 +183,7 @@ class ServingScores(OpenAIServing):
...
@@ -183,7 +183,7 @@ class ServingScores(OpenAIServing):
request
:
RerankRequest
|
ScoreRequest
,
request
:
RerankRequest
|
ScoreRequest
,
tokenizer
:
TokenizerLike
,
tokenizer
:
TokenizerLike
,
tokenization_kwargs
:
dict
[
str
,
Any
],
tokenization_kwargs
:
dict
[
str
,
Any
],
)
->
tuple
[
str
,
TokensPrompt
]
:
)
->
TokensPrompt
:
"""Parse a single ScoreData into a text + optional multimodal
"""Parse a single ScoreData into a text + optional multimodal
TokensPrompt for late-interaction encoding.
TokensPrompt for late-interaction encoding.
...
@@ -197,21 +197,22 @@ class ServingScores(OpenAIServing):
...
@@ -197,21 +197,22 @@ class ServingScores(OpenAIServing):
else
:
else
:
text
,
mm_data
,
mm_uuids
=
parse_score_data_single
(
data
,
role
,
model_config
)
text
,
mm_data
,
mm_uuids
=
parse_score_data_single
(
data
,
role
,
model_config
)
prompt_i
nput
s
=
tokenizer
(
text
,
**
tokenization_kwargs
)
prompt_i
d
s
=
tokenizer
.
encode
(
text
,
**
tokenization_kwargs
)
self
.
_validate_input
(
request
,
prompt_i
nputs
[
"input_ids"
]
,
text
)
self
.
_validate_input
(
request
,
prompt_i
ds
,
text
)
engine_prompt
=
TokensPrompt
(
tok_prompt
=
TokensPrompt
(
prompt_token_ids
=
prompt_inputs
[
"input_ids"
],
prompt_token_ids
=
prompt_ids
,
prompt
=
text
,
)
)
if
mm_data
is
not
None
:
if
mm_data
is
not
None
:
engine
_prompt
[
"multi_modal_data"
]
=
mm_data
tok
_prompt
[
"multi_modal_data"
]
=
mm_data
if
mm_uuids
is
not
None
:
if
mm_uuids
is
not
None
:
engine
_prompt
[
"multi_modal_uuids"
]
=
mm_uuids
tok
_prompt
[
"multi_modal_uuids"
]
=
mm_uuids
if
request
.
mm_processor_kwargs
is
not
None
:
if
request
.
mm_processor_kwargs
is
not
None
:
engine
_prompt
[
"mm_processor_kwargs"
]
=
request
.
mm_processor_kwargs
tok
_prompt
[
"mm_processor_kwargs"
]
=
request
.
mm_processor_kwargs
return
t
ext
,
engine
_prompt
return
t
ok
_prompt
async
def
_late_interaction_score
(
async
def
_late_interaction_score
(
self
,
self
,
...
@@ -240,7 +241,7 @@ class ServingScores(OpenAIServing):
...
@@ -240,7 +241,7 @@ class ServingScores(OpenAIServing):
executor
=
self
.
_tokenizer_executor
,
executor
=
self
.
_tokenizer_executor
,
)
)
preprocessed
=
await
asyncio
.
gather
(
tok_prompts
=
await
asyncio
.
gather
(
*
(
*
(
preprocess_async
(
preprocess_async
(
data
=
d
,
data
=
d
,
...
@@ -253,12 +254,8 @@ class ServingScores(OpenAIServing):
...
@@ -253,12 +254,8 @@ class ServingScores(OpenAIServing):
)
)
)
)
query_prompts
:
list
[
TokensPrompt
]
=
[
query_prompts
=
tok_prompts
[:
len
(
data_1
)]
prompt
for
_
,
prompt
in
preprocessed
[:
len
(
data_1
)]
doc_prompts
=
tok_prompts
[
len
(
data_1
)
:]
]
doc_prompts
:
list
[
TokensPrompt
]
=
[
prompt
for
_
,
prompt
in
preprocessed
[
len
(
data_1
)
:]
]
default_pooling_params
=
request
.
to_pooling_params
(
"token_embed"
)
default_pooling_params
=
request
.
to_pooling_params
(
"token_embed"
)
...
@@ -268,7 +265,7 @@ class ServingScores(OpenAIServing):
...
@@ -268,7 +265,7 @@ class ServingScores(OpenAIServing):
query_prompts
query_prompts
)
)
query_generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
query_generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
for
i
,
engine
_prompt
in
enumerate
(
query_prompts
):
for
i
,
tok
_prompt
in
enumerate
(
query_prompts
):
request_id_item
=
f
"
{
request_id
}
-query-
{
i
}
"
request_id_item
=
f
"
{
request_id
}
-query-
{
i
}
"
pooling_params
=
default_pooling_params
.
clone
()
pooling_params
=
default_pooling_params
.
clone
()
pooling_params
.
late_interaction_params
=
(
pooling_params
.
late_interaction_params
=
(
...
@@ -280,14 +277,14 @@ class ServingScores(OpenAIServing):
...
@@ -280,14 +277,14 @@ class ServingScores(OpenAIServing):
self
.
_log_inputs
(
self
.
_log_inputs
(
request_id_item
,
request_id_item
,
engine
_prompt
,
tok
_prompt
,
params
=
pooling_params
,
params
=
pooling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
query_generators
.
append
(
query_generators
.
append
(
self
.
engine_client
.
encode
(
self
.
engine_client
.
encode
(
engine
_prompt
,
tok
_prompt
,
pooling_params
,
pooling_params
,
request_id_item
,
request_id_item
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
@@ -306,7 +303,7 @@ class ServingScores(OpenAIServing):
...
@@ -306,7 +303,7 @@ class ServingScores(OpenAIServing):
# stage 2: encode docs and return scalar scores from workers.
# stage 2: encode docs and return scalar scores from workers.
doc_generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
doc_generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
for
i
,
engine
_prompt
in
enumerate
(
doc_prompts
):
for
i
,
tok
_prompt
in
enumerate
(
doc_prompts
):
request_id_item
=
f
"
{
request_id
}
-doc-
{
i
}
"
request_id_item
=
f
"
{
request_id
}
-doc-
{
i
}
"
query_idx
=
0
if
len
(
query_prompts
)
==
1
else
i
query_idx
=
0
if
len
(
query_prompts
)
==
1
else
i
pooling_params
=
default_pooling_params
.
clone
()
pooling_params
=
default_pooling_params
.
clone
()
...
@@ -316,14 +313,14 @@ class ServingScores(OpenAIServing):
...
@@ -316,14 +313,14 @@ class ServingScores(OpenAIServing):
self
.
_log_inputs
(
self
.
_log_inputs
(
request_id_item
,
request_id_item
,
engine
_prompt
,
tok
_prompt
,
params
=
pooling_params
,
params
=
pooling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
doc_generators
.
append
(
doc_generators
.
append
(
self
.
engine_client
.
encode
(
self
.
engine_client
.
encode
(
engine
_prompt
,
tok
_prompt
,
pooling_params
,
pooling_params
,
request_id_item
,
request_id_item
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
@@ -404,28 +401,22 @@ class ServingScores(OpenAIServing):
...
@@ -404,28 +401,22 @@ class ServingScores(OpenAIServing):
)
)
)
)
request_prompts
:
list
[
str
]
=
[]
engine_prompts
:
list
[
TokensPrompt
]
=
[]
for
full_prompt
,
engine_prompt
in
preprocessed_prompts
:
request_prompts
.
append
(
full_prompt
)
engine_prompts
.
append
(
engine_prompt
)
# Schedule the request and get the result generator.
# Schedule the request and get the result generator.
generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
default_pooling_params
=
request
.
to_pooling_params
(
"classify"
)
default_pooling_params
=
request
.
to_pooling_params
(
"classify"
)
for
i
,
engine
_prompt
in
enumerate
(
engine
_prompts
):
for
i
,
(
full
_prompt
,
tok_prompt
)
in
enumerate
(
preprocessed
_prompts
):
request_id_item
=
f
"
{
request_id
}
-
{
i
}
"
request_id_item
=
f
"
{
request_id
}
-
{
i
}
"
self
.
_log_inputs
(
self
.
_log_inputs
(
request_id_item
,
request_id_item
,
request
_prompt
s
[
i
]
,
full
_prompt
,
params
=
default_pooling_params
,
params
=
default_pooling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
if
token_type_ids
:
=
engine
_prompt
.
pop
(
"token_type_ids"
,
None
):
if
token_type_ids
:
=
tok
_prompt
.
pop
(
"token_type_ids"
,
None
):
pooling_params
=
default_pooling_params
.
clone
()
pooling_params
=
default_pooling_params
.
clone
()
compressed
=
compress_token_type_ids
(
token_type_ids
)
compressed
=
compress_token_type_ids
(
token_type_ids
)
pooling_params
.
extra_kwargs
=
{
"compressed_token_type_ids"
:
compressed
}
pooling_params
.
extra_kwargs
=
{
"compressed_token_type_ids"
:
compressed
}
...
@@ -433,7 +424,7 @@ class ServingScores(OpenAIServing):
...
@@ -433,7 +424,7 @@ class ServingScores(OpenAIServing):
pooling_params
=
default_pooling_params
pooling_params
=
default_pooling_params
generator
=
self
.
engine_client
.
encode
(
generator
=
self
.
engine_client
.
encode
(
engine
_prompt
,
tok
_prompt
,
pooling_params
,
pooling_params
,
request_id_item
,
request_id_item
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
@@ -447,7 +438,7 @@ class ServingScores(OpenAIServing):
...
@@ -447,7 +438,7 @@ class ServingScores(OpenAIServing):
# Non-streaming response
# Non-streaming response
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
=
[
None
]
*
len
(
final_res_batch
:
list
[
PoolingRequestOutput
|
None
]
=
[
None
]
*
len
(
engine
_prompts
preprocessed
_prompts
)
)
async
for
i
,
res
in
result_generator
:
async
for
i
,
res
in
result_generator
:
...
@@ -464,7 +455,7 @@ class ServingScores(OpenAIServing):
...
@@ -464,7 +455,7 @@ class ServingScores(OpenAIServing):
data_2
:
ScoreData
,
data_2
:
ScoreData
,
)
->
tuple
[
str
,
TokensPrompt
]:
)
->
tuple
[
str
,
TokensPrompt
]:
model_config
=
self
.
model_config
model_config
=
self
.
model_config
full_prompt
,
engine_
promp
t
=
get_score_prompt
(
full_prompt
,
engine_
inpu
t
=
get_score_prompt
(
model_config
=
model_config
,
model_config
=
model_config
,
data_1
=
data_1
,
data_1
=
data_1
,
data_2
=
data_2
,
data_2
=
data_2
,
...
@@ -472,11 +463,11 @@ class ServingScores(OpenAIServing):
...
@@ -472,11 +463,11 @@ class ServingScores(OpenAIServing):
tokenization_kwargs
=
tokenization_kwargs
,
tokenization_kwargs
=
tokenization_kwargs
,
score_template
=
self
.
score_template
,
score_template
=
self
.
score_template
,
)
)
self
.
_validate_input
(
request
,
engine_
promp
t
[
"prompt_token_ids"
],
full_prompt
)
self
.
_validate_input
(
request
,
engine_
inpu
t
[
"prompt_token_ids"
],
full_prompt
)
if
request
.
mm_processor_kwargs
is
not
None
:
if
request
.
mm_processor_kwargs
is
not
None
:
engine_
promp
t
[
"mm_processor_kwargs"
]
=
request
.
mm_processor_kwargs
engine_
inpu
t
[
"mm_processor_kwargs"
]
=
request
.
mm_processor_kwargs
return
full_prompt
,
engine_
promp
t
return
full_prompt
,
engine_
inpu
t
async
def
_run_scoring
(
async
def
_run_scoring
(
self
,
self
,
...
...
vllm/entrypoints/pooling/score/utils.py
View file @
ba2f0acc
...
@@ -20,10 +20,14 @@ from vllm.entrypoints.chat_utils import (
...
@@ -20,10 +20,14 @@ from vllm.entrypoints.chat_utils import (
MultiModalItemTracker
,
MultiModalItemTracker
,
_parse_chat_message_content_parts
,
_parse_chat_message_content_parts
,
)
)
from
vllm.inputs
import
TokensPrompt
from
vllm.inputs
import
(
from
vllm.inputs.data
import
PromptType
,
TextPrompt
MultiModalDataDict
,
MultiModalUUIDDict
,
PromptType
,
TextPrompt
,
TokensPrompt
,
)
from
vllm.model_executor.models.interfaces
import
supports_score_template
from
vllm.model_executor.models.interfaces
import
supports_score_template
from
vllm.multimodal.inputs
import
MultiModalDataDict
,
MultiModalUUIDDict
from
vllm.outputs
import
PoolingRequestOutput
from
vllm.outputs
import
PoolingRequestOutput
from
vllm.renderers.hf
import
safe_apply_chat_template
from
vllm.renderers.hf
import
safe_apply_chat_template
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers
import
TokenizerLike
...
...
vllm/entrypoints/pooling/typing.py
View file @
ba2f0acc
...
@@ -32,7 +32,7 @@ from vllm.entrypoints.pooling.score.protocol import (
...
@@ -32,7 +32,7 @@ from vllm.entrypoints.pooling.score.protocol import (
ScoreRequest
,
ScoreRequest
,
ScoreResponse
,
ScoreResponse
,
)
)
from
vllm.inputs
import
Processor
Input
s
from
vllm.inputs
import
Engine
Input
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
PoolingCompletionLikeRequest
:
TypeAlias
=
(
PoolingCompletionLikeRequest
:
TypeAlias
=
(
...
@@ -74,7 +74,7 @@ class PoolingServeContext(Generic[PoolingRequestT]):
...
@@ -74,7 +74,7 @@ class PoolingServeContext(Generic[PoolingRequestT]):
created_time
:
int
=
field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
created_time
:
int
=
field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
lora_request
:
LoRARequest
|
None
=
None
lora_request
:
LoRARequest
|
None
=
None
engine_
promp
ts
:
list
[
Processor
Input
s
]
|
None
=
None
engine_
inpu
ts
:
list
[
Engine
Input
]
|
None
=
None
prompt_request_ids
:
list
[
str
]
|
None
=
None
prompt_request_ids
:
list
[
str
]
|
None
=
None
intermediates
:
Any
|
None
=
None
intermediates
:
Any
|
None
=
None
...
...
vllm/entrypoints/serve/disagg/protocol.py
View file @
ba2f0acc
...
@@ -33,19 +33,20 @@ class MultiModalFeatures(BaseModel):
...
@@ -33,19 +33,20 @@ class MultiModalFeatures(BaseModel):
"""Lightweight multimodal metadata produced by the render step.
"""Lightweight multimodal metadata produced by the render step.
Carries hashes (for cache lookup / identification) and placeholder
Carries hashes (for cache lookup / identification) and placeholder
positions so the downstream
`
`/generate`
`
service knows *where* in
positions so the downstream `/generate` service knows *where* in
the token sequence each multimodal item lives.
the token sequence each multimodal item lives.
.. note:: Phase 1 — metadata only.
Note:
Phase 2 should add ``mm_kwargs`` (processed tensor data) using a
Phase 1 — metadata only.
binary transport so the ``/generate`` side can skip re-processing.
Phase 2 should add `mm_kwargs` (processed tensor data) using a
The ``/generate`` endpoint must also be updated to inject these
binary transport so the ``/generate` side can skip re-processing.
features into ``ProcessorInputs`` before passing to
The `/generate` endpoint must also be updated to inject these
``InputProcessor.process_inputs``.
features into `EngineInput` before passing to
`InputProcessor.process_inputs`.
"""
"""
mm_hashes
:
dict
[
str
,
list
[
str
]]
mm_hashes
:
dict
[
str
,
list
[
str
]]
"""Per-modality item hashes, e.g.
`
`{"image": ["abc", "def"]}`
`
."""
"""Per-modality item hashes, e.g. `{"image": ["abc", "def"]}`."""
mm_placeholders
:
dict
[
str
,
list
[
PlaceholderRangeInfo
]]
mm_placeholders
:
dict
[
str
,
list
[
PlaceholderRangeInfo
]]
"""Per-modality placeholder ranges in the token sequence."""
"""Per-modality placeholder ranges in the token sequence."""
...
...
vllm/entrypoints/serve/disagg/serving.py
View file @
ba2f0acc
...
@@ -99,13 +99,11 @@ class ServingTokens(OpenAIServing):
...
@@ -99,13 +99,11 @@ class ServingTokens(OpenAIServing):
if
raw_request
:
if
raw_request
:
raw_request
.
state
.
request_metadata
=
request_metadata
raw_request
.
state
.
request_metadata
=
request_metadata
engine_
prompts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
(
engine_
input
,)
=
await
self
.
openai_serving_render
.
preprocess_completion
(
request
,
request
,
prompt_input
=
request
.
token_ids
,
prompt_input
=
request
.
token_ids
,
prompt_embeds
=
None
,
prompt_embeds
=
None
,
)
)
assert
len
(
engine_prompts
)
==
1
engine_prompt
=
engine_prompts
[
0
]
# Schedule the request and get the result generator.
# Schedule the request and get the result generator.
result_generator
:
AsyncGenerator
[
RequestOutput
,
None
]
|
None
=
None
result_generator
:
AsyncGenerator
[
RequestOutput
,
None
]
|
None
=
None
...
@@ -115,7 +113,7 @@ class ServingTokens(OpenAIServing):
...
@@ -115,7 +113,7 @@ class ServingTokens(OpenAIServing):
self
.
_log_inputs
(
self
.
_log_inputs
(
request_id
,
request_id
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
sampling_params
,
params
=
sampling_params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
...
@@ -127,7 +125,7 @@ class ServingTokens(OpenAIServing):
...
@@ -127,7 +125,7 @@ class ServingTokens(OpenAIServing):
)
)
result_generator
=
self
.
engine_client
.
generate
(
result_generator
=
self
.
engine_client
.
generate
(
engine_
promp
t
,
engine_
inpu
t
,
sampling_params
,
sampling_params
,
request_id
,
request_id
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
...
...
vllm/entrypoints/serve/render/serving.py
View file @
ba2f0acc
...
@@ -34,9 +34,15 @@ from vllm.entrypoints.utils import (
...
@@ -34,9 +34,15 @@ from vllm.entrypoints.utils import (
create_error_response
,
create_error_response
,
get_max_tokens
,
get_max_tokens
,
)
)
from
vllm.inputs.data
import
ProcessorInputs
,
PromptType
,
SingletonPrompt
,
TokensPrompt
from
vllm.inputs
import
(
EngineInput
,
MultiModalHashes
,
MultiModalPlaceholders
,
PromptType
,
SingletonPrompt
,
tokens_input
,
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.multimodal.inputs
import
MultiModalHashes
,
MultiModalPlaceholderDict
from
vllm.parser
import
ParserManager
from
vllm.parser
import
ParserManager
from
vllm.renderers
import
BaseRenderer
,
merge_kwargs
from
vllm.renderers
import
BaseRenderer
,
merge_kwargs
from
vllm.renderers.inputs.preprocess
import
(
from
vllm.renderers.inputs.preprocess
import
(
...
@@ -127,22 +133,22 @@ class OpenAIServingRender:
...
@@ -127,22 +133,22 @@ class OpenAIServingRender:
if
isinstance
(
result
,
ErrorResponse
):
if
isinstance
(
result
,
ErrorResponse
):
return
result
return
result
_
,
engine_
promp
ts
=
result
_
,
engine_
inpu
ts
=
result
if
len
(
engine_
promp
ts
)
!=
1
:
if
len
(
engine_
inpu
ts
)
!=
1
:
return
self
.
create_error_response
(
return
self
.
create_error_response
(
f
"Expected exactly 1 engine prompt, got
{
len
(
engine_
promp
ts
)
}
"
f
"Expected exactly 1 engine prompt, got
{
len
(
engine_
inpu
ts
)
}
"
)
)
engine_
promp
t
=
engine_
promp
ts
[
0
]
engine_
inpu
t
=
engine_
inpu
ts
[
0
]
prompt_components
=
extract_prompt_components
(
self
.
model_config
,
engine_
promp
t
)
prompt_components
=
extract_prompt_components
(
self
.
model_config
,
engine_
inpu
t
)
token_ids
=
prompt_components
.
token_ids
token_ids
=
prompt_components
.
token_ids
if
not
token_ids
:
if
not
token_ids
:
return
self
.
create_error_response
(
"No token_ids rendered"
)
return
self
.
create_error_response
(
"No token_ids rendered"
)
token_ids
=
list
(
token_ids
)
token_ids
=
list
(
token_ids
)
input_length
=
extract_prompt_len
(
self
.
model_config
,
engine_
promp
t
)
input_length
=
extract_prompt_len
(
self
.
model_config
,
engine_
inpu
t
)
max_tokens
=
get_max_tokens
(
max_tokens
=
get_max_tokens
(
self
.
model_config
.
max_model_len
,
self
.
model_config
.
max_model_len
,
request
.
max_completion_tokens
request
.
max_completion_tokens
...
@@ -159,7 +165,7 @@ class OpenAIServingRender:
...
@@ -159,7 +165,7 @@ class OpenAIServingRender:
return
GenerateRequest
(
return
GenerateRequest
(
request_id
=
request_id
,
request_id
=
request_id
,
token_ids
=
token_ids
,
token_ids
=
token_ids
,
features
=
self
.
_extract_mm_features
(
engine_
promp
t
),
features
=
self
.
_extract_mm_features
(
engine_
inpu
t
),
sampling_params
=
params
,
sampling_params
=
params
,
model
=
request
.
model
,
model
=
request
.
model
,
stream
=
bool
(
request
.
stream
),
stream
=
bool
(
request
.
stream
),
...
@@ -171,7 +177,7 @@ class OpenAIServingRender:
...
@@ -171,7 +177,7 @@ class OpenAIServingRender:
async
def
render_chat
(
async
def
render_chat
(
self
,
self
,
request
:
ChatCompletionRequest
,
request
:
ChatCompletionRequest
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Processor
Input
s
]]
|
ErrorResponse
:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Engine
Input
]]
|
ErrorResponse
:
"""Core preprocessing logic for chat requests (no model/engine check).
"""Core preprocessing logic for chat requests (no model/engine check).
Called directly by render_chat_request and delegated to by
Called directly by render_chat_request and delegated to by
...
@@ -184,7 +190,6 @@ class OpenAIServingRender:
...
@@ -184,7 +190,6 @@ class OpenAIServingRender:
if
is_mistral_tokenizer
(
tokenizer
):
if
is_mistral_tokenizer
(
tokenizer
):
# because of issues with pydantic we need to potentially
# because of issues with pydantic we need to potentially
# re-serialize the tool_calls field of the request
# re-serialize the tool_calls field of the request
# for more info: see comment in `maybe_serialize_tool_calls`
_mt
.
maybe_serialize_tool_calls
(
request
)
# type: ignore[arg-type]
_mt
.
maybe_serialize_tool_calls
(
request
)
# type: ignore[arg-type]
_mt
.
truncate_tool_call_ids
(
request
)
# type: ignore[arg-type]
_mt
.
truncate_tool_call_ids
(
request
)
# type: ignore[arg-type]
_mt
.
validate_request_params
(
request
)
_mt
.
validate_request_params
(
request
)
...
@@ -232,7 +237,7 @@ class OpenAIServingRender:
...
@@ -232,7 +237,7 @@ class OpenAIServingRender:
if
error_check_ret
is
not
None
:
if
error_check_ret
is
not
None
:
return
error_check_ret
return
error_check_ret
conversation
,
engine_
promp
ts
=
await
self
.
preprocess_chat
(
conversation
,
engine_
inpu
ts
=
await
self
.
preprocess_chat
(
request
,
request
,
request
.
messages
,
request
.
messages
,
default_template
=
self
.
chat_template
,
default_template
=
self
.
chat_template
,
...
@@ -244,11 +249,11 @@ class OpenAIServingRender:
...
@@ -244,11 +249,11 @@ class OpenAIServingRender:
else
:
else
:
# For GPT-OSS.
# For GPT-OSS.
should_include_tools
=
tool_dicts
is
not
None
should_include_tools
=
tool_dicts
is
not
None
conversation
,
engine_
promp
ts
=
self
.
_make_request_with_harmony
(
conversation
,
engine_
inpu
ts
=
self
.
_make_request_with_harmony
(
request
,
should_include_tools
request
,
should_include_tools
)
)
return
conversation
,
engine_
promp
ts
return
conversation
,
engine_
inpu
ts
async
def
render_completion_request
(
async
def
render_completion_request
(
self
,
self
,
...
@@ -266,16 +271,16 @@ class OpenAIServingRender:
...
@@ -266,16 +271,16 @@ class OpenAIServingRender:
if
isinstance
(
result
,
ErrorResponse
):
if
isinstance
(
result
,
ErrorResponse
):
return
result
return
result
generate_requests
:
list
[
GenerateRequest
]
=
[]
generate_requests
:
list
[
GenerateRequest
]
=
[]
for
engine_
promp
t
in
result
:
for
engine_
inpu
t
in
result
:
prompt_components
=
extract_prompt_components
(
prompt_components
=
extract_prompt_components
(
self
.
model_config
,
engine_
promp
t
self
.
model_config
,
engine_
inpu
t
)
)
token_ids
=
prompt_components
.
token_ids
token_ids
=
prompt_components
.
token_ids
if
not
token_ids
:
if
not
token_ids
:
return
self
.
create_error_response
(
"No token_ids rendered"
)
return
self
.
create_error_response
(
"No token_ids rendered"
)
token_ids
=
list
(
token_ids
)
token_ids
=
list
(
token_ids
)
input_length
=
extract_prompt_len
(
self
.
model_config
,
engine_
promp
t
)
input_length
=
extract_prompt_len
(
self
.
model_config
,
engine_
inpu
t
)
max_tokens
=
get_max_tokens
(
max_tokens
=
get_max_tokens
(
self
.
model_config
.
max_model_len
,
self
.
model_config
.
max_model_len
,
request
.
max_tokens
,
request
.
max_tokens
,
...
@@ -293,7 +298,7 @@ class OpenAIServingRender:
...
@@ -293,7 +298,7 @@ class OpenAIServingRender:
GenerateRequest
(
GenerateRequest
(
request_id
=
request_id
,
request_id
=
request_id
,
token_ids
=
token_ids
,
token_ids
=
token_ids
,
features
=
self
.
_extract_mm_features
(
engine_
promp
t
),
features
=
self
.
_extract_mm_features
(
engine_
inpu
t
),
sampling_params
=
params
,
sampling_params
=
params
,
model
=
request
.
model
,
model
=
request
.
model
,
stream
=
bool
(
request
.
stream
),
stream
=
bool
(
request
.
stream
),
...
@@ -308,7 +313,7 @@ class OpenAIServingRender:
...
@@ -308,7 +313,7 @@ class OpenAIServingRender:
async
def
render_completion
(
async
def
render_completion
(
self
,
self
,
request
:
CompletionRequest
,
request
:
CompletionRequest
,
)
->
list
[
Processor
Input
s
]
|
ErrorResponse
:
)
->
list
[
Engine
Input
]
|
ErrorResponse
:
"""Core preprocessing logic for completion requests (no model/engine check).
"""Core preprocessing logic for completion requests (no model/engine check).
Called directly by render_completion_request and delegated to by
Called directly by render_completion_request and delegated to by
...
@@ -326,28 +331,28 @@ class OpenAIServingRender:
...
@@ -326,28 +331,28 @@ class OpenAIServingRender:
"prompt_logprobs is not compatible with prompt embeds."
"prompt_logprobs is not compatible with prompt embeds."
)
)
engine_
promp
ts
=
await
self
.
preprocess_completion
(
engine_
inpu
ts
=
await
self
.
preprocess_completion
(
request
,
request
,
prompt_input
=
request
.
prompt
,
prompt_input
=
request
.
prompt
,
prompt_embeds
=
request
.
prompt_embeds
,
prompt_embeds
=
request
.
prompt_embeds
,
)
)
return
engine_
promp
ts
return
engine_
inpu
ts
@
staticmethod
@
staticmethod
def
_extract_mm_features
(
def
_extract_mm_features
(
engine_
prompt
:
Processor
Input
s
,
engine_
input
:
Engine
Input
,
)
->
MultiModalFeatures
|
None
:
)
->
MultiModalFeatures
|
None
:
"""Extract multimodal metadata from a rendered engine prompt.
"""Extract multimodal metadata from a rendered engine prompt.
Returns ``None`` for text-only prompts.
Returns ``None`` for text-only prompts.
"""
"""
if
engine_
promp
t
.
get
(
"type"
)
!=
"multimodal"
:
if
engine_
inpu
t
.
get
(
"type"
)
!=
"multimodal"
:
return
None
return
None
# At this point engine_
promp
t is a MultiModalInputs TypedDict.
# At this point engine_
inpu
t is a MultiModalInputs TypedDict.
mm_hashes
:
MultiModalHashes
=
engine_
promp
t
[
"mm_hashes"
]
# type: ignore[typeddict-item]
mm_hashes
:
MultiModalHashes
=
engine_
inpu
t
[
"mm_hashes"
]
# type: ignore[typeddict-item]
raw_placeholders
:
MultiModalPlaceholder
Dict
=
engine_
promp
t
[
"mm_placeholders"
]
# type: ignore[typeddict-item]
raw_placeholders
:
MultiModalPlaceholder
s
=
engine_
inpu
t
[
"mm_placeholders"
]
# type: ignore[typeddict-item]
mm_placeholders
=
{
mm_placeholders
=
{
modality
:
[
modality
:
[
...
@@ -401,13 +406,9 @@ class OpenAIServingRender:
...
@@ -401,13 +406,9 @@ class OpenAIServingRender:
# Render prompt token ids.
# Render prompt token ids.
prompt_token_ids
=
render_for_completion
(
messages
)
prompt_token_ids
=
render_for_completion
(
messages
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
prompt_token_ids
)
engine_input
=
tokens_input
(
prompt_token_ids
,
cache_salt
=
request
.
cache_salt
)
# Add cache_salt if provided in the request
if
request
.
cache_salt
is
not
None
:
engine_prompt
[
"cache_salt"
]
=
request
.
cache_salt
return
messages
,
[
engine_
promp
t
]
return
messages
,
[
engine_
inpu
t
]
def
create_error_response
(
def
create_error_response
(
self
,
self
,
...
@@ -450,7 +451,7 @@ class OpenAIServingRender:
...
@@ -450,7 +451,7 @@ class OpenAIServingRender:
request
:
Any
,
request
:
Any
,
prompt_input
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]]
|
None
,
prompt_input
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]]
|
None
,
prompt_embeds
:
bytes
|
list
[
bytes
]
|
None
,
prompt_embeds
:
bytes
|
list
[
bytes
]
|
None
,
)
->
list
[
Processor
Input
s
]:
)
->
list
[
Engine
Input
]:
"""Copied from OpenAIServing._preprocess_completion."""
"""Copied from OpenAIServing._preprocess_completion."""
prompts
=
list
[
SingletonPrompt
|
bytes
]()
prompts
=
list
[
SingletonPrompt
|
bytes
]()
if
prompt_embeds
is
not
None
:
# embeds take higher priority
if
prompt_embeds
is
not
None
:
# embeds take higher priority
...
@@ -463,7 +464,7 @@ class OpenAIServingRender:
...
@@ -463,7 +464,7 @@ class OpenAIServingRender:
self
,
self
,
request
:
Any
,
request
:
Any
,
prompts
:
Sequence
[
PromptType
|
bytes
],
prompts
:
Sequence
[
PromptType
|
bytes
],
)
->
list
[
Processor
Input
s
]:
)
->
list
[
Engine
Input
]:
"""Copied from OpenAIServing._preprocess_cmpl."""
"""Copied from OpenAIServing._preprocess_cmpl."""
renderer
=
self
.
renderer
renderer
=
self
.
renderer
model_config
=
self
.
model_config
model_config
=
self
.
model_config
...
@@ -497,7 +498,7 @@ class OpenAIServingRender:
...
@@ -497,7 +498,7 @@ class OpenAIServingRender:
default_template_kwargs
:
dict
[
str
,
Any
]
|
None
,
default_template_kwargs
:
dict
[
str
,
Any
]
|
None
,
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tool_parser
:
type
[
ToolParser
]
|
None
=
None
,
tool_parser
:
type
[
ToolParser
]
|
None
=
None
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Processor
Input
s
]]:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
Engine
Input
]]:
"""Copied from OpenAIServing._preprocess_chat."""
"""Copied from OpenAIServing._preprocess_chat."""
renderer
=
self
.
renderer
renderer
=
self
.
renderer
mm_config
=
self
.
model_config
.
multimodal_config
mm_config
=
self
.
model_config
.
multimodal_config
...
@@ -519,7 +520,7 @@ class OpenAIServingRender:
...
@@ -519,7 +520,7 @@ class OpenAIServingRender:
default_mm_processor_kwargs
=
getattr
(
request
,
"mm_processor_kwargs"
,
None
),
default_mm_processor_kwargs
=
getattr
(
request
,
"mm_processor_kwargs"
,
None
),
)
)
(
conversation
,),
(
engine_
promp
t
,)
=
await
renderer
.
render_chat_async
(
(
conversation
,),
(
engine_
inpu
t
,)
=
await
renderer
.
render_chat_async
(
[
messages
],
[
messages
],
chat_params
,
chat_params
,
tok_params
,
tok_params
,
...
@@ -546,4 +547,4 @@ class OpenAIServingRender:
...
@@ -546,4 +547,4 @@ class OpenAIServingRender:
tokenizer
=
renderer
.
get_tokenizer
()
tokenizer
=
renderer
.
get_tokenizer
()
request
=
tool_parser
(
tokenizer
).
adjust_request
(
request
=
request
)
# type: ignore[arg-type]
request
=
tool_parser
(
tokenizer
).
adjust_request
(
request
=
request
)
# type: ignore[arg-type]
return
conversation
,
[
engine_
promp
t
]
return
conversation
,
[
engine_
inpu
t
]
vllm/entrypoints/serve/tokenize/serving.py
View file @
ba2f0acc
...
@@ -20,7 +20,7 @@ from vllm.entrypoints.serve.tokenize.protocol import (
...
@@ -20,7 +20,7 @@ from vllm.entrypoints.serve.tokenize.protocol import (
TokenizeResponse
,
TokenizeResponse
,
TokenizerInfoResponse
,
TokenizerInfoResponse
,
)
)
from
vllm.inputs
import
TokensPrompt
,
token_input
s
from
vllm.inputs
import
TokensPrompt
,
token
s
_input
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers
import
TokenizerLike
...
@@ -79,7 +79,7 @@ class OpenAIServingTokenization(OpenAIServing):
...
@@ -79,7 +79,7 @@ class OpenAIServingTokenization(OpenAIServing):
if
error_check_ret
is
not
None
:
if
error_check_ret
is
not
None
:
return
error_check_ret
return
error_check_ret
_
,
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
_
,
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
request
,
request
,
request
.
messages
,
request
.
messages
,
default_template
=
self
.
chat_template
,
default_template
=
self
.
chat_template
,
...
@@ -88,22 +88,22 @@ class OpenAIServingTokenization(OpenAIServing):
...
@@ -88,22 +88,22 @@ class OpenAIServingTokenization(OpenAIServing):
tool_dicts
=
tool_dicts
,
tool_dicts
=
tool_dicts
,
)
)
else
:
else
:
engine_
promp
ts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
engine_
inpu
ts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
request
,
request
,
prompt_input
=
request
.
prompt
,
prompt_input
=
request
.
prompt
,
prompt_embeds
=
None
,
prompt_embeds
=
None
,
)
)
input_ids
:
list
[
int
]
=
[]
input_ids
:
list
[
int
]
=
[]
for
engine_
promp
t
in
engine_
promp
ts
:
for
engine_
inpu
t
in
engine_
inpu
ts
:
self
.
_log_inputs
(
self
.
_log_inputs
(
request_id
,
request_id
,
engine_
promp
t
,
engine_
inpu
t
,
params
=
None
,
params
=
None
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
prompt_components
=
self
.
_extract_prompt_components
(
engine_
promp
t
)
prompt_components
=
self
.
_extract_prompt_components
(
engine_
inpu
t
)
if
prompt_components
.
token_ids
is
not
None
:
if
prompt_components
.
token_ids
is
not
None
:
input_ids
.
extend
(
prompt_components
.
token_ids
)
input_ids
.
extend
(
prompt_components
.
token_ids
)
...
@@ -134,16 +134,16 @@ class OpenAIServingTokenization(OpenAIServing):
...
@@ -134,16 +134,16 @@ class OpenAIServingTokenization(OpenAIServing):
self
.
_log_inputs
(
self
.
_log_inputs
(
request_id
,
request_id
,
token_input
s
(
request
.
tokens
),
token
s
_input
(
request
.
tokens
),
params
=
None
,
params
=
None
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
)
)
engine
_prompt
=
await
self
.
renderer
.
tokenize_prompt_async
(
tok
_prompt
=
await
self
.
renderer
.
tokenize_prompt_async
(
TokensPrompt
(
prompt_token_ids
=
request
.
tokens
),
TokensPrompt
(
prompt_token_ids
=
request
.
tokens
),
request
.
build_tok_params
(
self
.
model_config
),
request
.
build_tok_params
(
self
.
model_config
),
)
)
prompt_text
=
engine
_prompt
[
"prompt"
]
# type: ignore[typeddict-item]
prompt_text
=
tok
_prompt
[
"prompt"
]
# type: ignore[typeddict-item]
return
DetokenizeResponse
(
prompt
=
prompt_text
)
return
DetokenizeResponse
(
prompt
=
prompt_text
)
...
...
Prev
1
2
3
4
5
6
…
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment