Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dcb31196
Unverified
Commit
dcb31196
authored
Dec 14, 2025
by
Cyrus Leung
Committed by
GitHub
Dec 14, 2025
Browse files
[Chore] Remove redundant `RequestPrompt` (#30612)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
f569c654
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
186 additions
and
251 deletions
+186
-251
tests/entrypoints/openai/test_chat_error.py
tests/entrypoints/openai/test_chat_error.py
+1
-2
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+13
-13
tests/entrypoints/openai/test_serving_responses.py
tests/entrypoints/openai/test_serving_responses.py
+3
-3
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+36
-19
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+72
-125
vllm/entrypoints/openai/serving_responses.py
vllm/entrypoints/openai/serving_responses.py
+10
-11
vllm/entrypoints/pooling/classify/serving.py
vllm/entrypoints/pooling/classify/serving.py
+1
-5
vllm/entrypoints/pooling/embed/serving.py
vllm/entrypoints/pooling/embed/serving.py
+20
-39
vllm/entrypoints/pooling/pooling/serving.py
vllm/entrypoints/pooling/pooling/serving.py
+2
-5
vllm/entrypoints/renderer.py
vllm/entrypoints/renderer.py
+18
-20
vllm/entrypoints/serve/disagg/serving.py
vllm/entrypoints/serve/disagg/serving.py
+3
-3
vllm/entrypoints/serve/tokenize/serving.py
vllm/entrypoints/serve/tokenize/serving.py
+7
-6
No files found.
tests/entrypoints/openai/test_chat_error.py
View file @
dcb31196
...
...
@@ -80,10 +80,9 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
return
dict
(
engine_prompt
),
{}
async
def
_fake_preprocess_chat
(
*
args
,
**
kwargs
):
# return conversation,
request_prompts,
engine_prompts
# return conversation, engine_prompts
return
(
[{
"role"
:
"user"
,
"content"
:
"Test"
}],
[[
1
,
2
,
3
]],
[{
"prompt_token_ids"
:
[
1
,
2
,
3
]}],
)
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
dcb31196
...
...
@@ -877,7 +877,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -905,7 +905,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages_2
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -927,7 +927,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -971,7 +971,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages_2
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -1008,7 +1008,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -1052,7 +1052,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages_2
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -1089,7 +1089,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -1133,7 +1133,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages_2
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -1183,7 +1183,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the third turn's input
req_3
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages_3
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_3
)
input_messages_3
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_3
)
verify_harmony_messages
(
input_messages_3
,
[
...
...
@@ -1246,7 +1246,7 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the fourth turn's input
req_4
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages_4
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_4
)
input_messages_4
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_4
)
verify_harmony_messages
(
input_messages_4
,
[
...
...
@@ -1295,7 +1295,7 @@ class TestServingChatWithHarmony:
},
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
verify_harmony_messages
(
input_messages
,
...
...
@@ -1327,7 +1327,7 @@ class TestServingChatWithHarmony:
},
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
verify_harmony_messages
(
input_messages
,
...
...
@@ -1357,7 +1357,7 @@ class TestServingChatWithHarmony:
},
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
verify_harmony_messages
(
input_messages
,
...
...
tests/entrypoints/openai/test_serving_responses.py
View file @
dcb31196
...
...
@@ -21,7 +21,7 @@ from vllm.entrypoints.openai.serving_responses import (
extract_tool_types
,
)
from
vllm.entrypoints.tool_server
import
ToolServer
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.inputs.data
import
TokensPrompt
class
MockConversationContext
(
ConversationContext
):
...
...
@@ -237,7 +237,7 @@ class TestValidateGeneratorInput:
"""Test _validate_generator_input with valid prompt length"""
# Create an engine prompt with valid length (less than max_model_len)
valid_prompt_token_ids
=
list
(
range
(
5
))
# 5 tokens < 100 max_model_len
engine_prompt
=
Engine
TokensPrompt
(
prompt_token_ids
=
valid_prompt_token_ids
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
valid_prompt_token_ids
)
# Call the method
result
=
serving_responses_instance
.
_validate_generator_input
(
engine_prompt
)
...
...
@@ -247,7 +247,7 @@ class TestValidateGeneratorInput:
# create an invalid engine prompt
invalid_prompt_token_ids
=
list
(
range
(
200
))
# 200 tokens >= 100 max_model_len
engine_prompt
=
Engine
TokensPrompt
(
prompt_token_ids
=
invalid_prompt_token_ids
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
invalid_prompt_token_ids
)
# Call the method
result
=
serving_responses_instance
.
_validate_generator_input
(
engine_prompt
)
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
dcb31196
...
...
@@ -61,7 +61,7 @@ from vllm.entrypoints.openai.tool_parsers import ToolParser
from
vllm.entrypoints.openai.tool_parsers.mistral_tool_parser
import
MistralToolCall
from
vllm.entrypoints.openai.utils
import
maybe_filter_parallel_tool_calls
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.inputs.data
import
TokensPrompt
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
...
...
@@ -234,11 +234,7 @@ class OpenAIServingChat(OpenAIServing):
)
if
error_check_ret
is
not
None
:
return
error_check_ret
(
conversation
,
request_prompts
,
engine_prompts
,
)
=
await
self
.
_preprocess_chat
(
conversation
,
engine_prompts
=
await
self
.
_preprocess_chat
(
request
,
tokenizer
,
request
.
messages
,
...
...
@@ -254,11 +250,7 @@ class OpenAIServingChat(OpenAIServing):
)
else
:
# For GPT-OSS.
(
conversation
,
request_prompts
,
engine_prompts
,
)
=
self
.
_make_request_with_harmony
(
request
)
conversation
,
engine_prompts
=
self
.
_make_request_with_harmony
(
request
)
except
(
ValueError
,
TypeError
,
RuntimeError
,
jinja2
.
TemplateError
)
as
e
:
logger
.
exception
(
"Error in preprocessing prompt inputs"
)
return
self
.
create_error_response
(
f
"
{
e
}
{
e
.
__cause__
}
"
)
...
...
@@ -278,7 +270,7 @@ class OpenAIServingChat(OpenAIServing):
generators
:
list
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
try
:
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
prompt_text
,
_
,
_
=
self
.
_get_prompt_components
(
request
_prompt
s
[
i
]
)
prompt_text
,
_
,
_
=
self
.
_get_prompt_components
(
engine
_prompt
)
# If we are creating sub requests for multiple prompts, ensure that they
# have unique request ids.
sub_request_id
=
(
...
...
@@ -313,7 +305,7 @@ class OpenAIServingChat(OpenAIServing):
self
.
_log_inputs
(
sub_request_id
,
request
_prompt
s
[
i
]
,
engine
_prompt
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
...
...
@@ -537,7 +529,7 @@ class OpenAIServingChat(OpenAIServing):
request_id
:
str
,
model_name
:
str
,
conversation
:
list
[
ConversationMessage
],
tokenizer
:
TokenizerLike
,
tokenizer
:
TokenizerLike
|
None
,
request_metadata
:
RequestResponseMetadata
,
)
->
AsyncGenerator
[
str
,
None
]:
created_time
=
int
(
time
.
time
())
...
...
@@ -591,6 +583,11 @@ class OpenAIServingChat(OpenAIServing):
try
:
if
self
.
reasoning_parser
:
if
tokenizer
is
None
:
raise
ValueError
(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
reasoning_parser
=
self
.
reasoning_parser
(
tokenizer
,
chat_template_kwargs
=
request
.
chat_template_kwargs
,
# type: ignore
...
...
@@ -604,6 +601,11 @@ class OpenAIServingChat(OpenAIServing):
# Prepare the tool parser if it's needed
try
:
if
tool_choice_auto
and
self
.
tool_parser
:
if
tokenizer
is
None
:
raise
ValueError
(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
tool_parsers
:
list
[
ToolParser
|
None
]
=
[
self
.
tool_parser
(
tokenizer
)
]
*
num_choices
...
...
@@ -1317,7 +1319,7 @@ class OpenAIServingChat(OpenAIServing):
request_id
:
str
,
model_name
:
str
,
conversation
:
list
[
ConversationMessage
],
tokenizer
:
TokenizerLike
,
tokenizer
:
TokenizerLike
|
None
,
request_metadata
:
RequestResponseMetadata
,
)
->
ErrorResponse
|
ChatCompletionResponse
:
created_time
=
int
(
time
.
time
())
...
...
@@ -1367,6 +1369,11 @@ class OpenAIServingChat(OpenAIServing):
reasoning
=
None
if
self
.
tool_parser
is
not
None
:
if
tokenizer
is
None
:
raise
ValueError
(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
tool_parser
=
self
.
tool_parser
(
tokenizer
)
# NOTE: We use token_ids for openai tool parser
tool_call_info
=
tool_parser
.
extract_tool_calls
(
...
...
@@ -1409,6 +1416,11 @@ class OpenAIServingChat(OpenAIServing):
if
self
.
reasoning_parser
:
try
:
if
tokenizer
is
None
:
raise
ValueError
(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
reasoning_parser
=
self
.
reasoning_parser
(
tokenizer
,
chat_template_kwargs
=
request
.
chat_template_kwargs
,
# type: ignore
...
...
@@ -1648,7 +1660,7 @@ class OpenAIServingChat(OpenAIServing):
self
,
logprobs
:
dict
[
int
,
Logprob
],
top_logprobs
:
int
|
None
,
tokenizer
:
TokenizerLike
,
tokenizer
:
TokenizerLike
|
None
,
should_return_as_token_id
:
bool
,
)
->
list
[
ChatCompletionLogProb
]:
return
[
...
...
@@ -1672,7 +1684,7 @@ class OpenAIServingChat(OpenAIServing):
self
,
token_ids
:
GenericSequence
[
int
],
top_logprobs
:
GenericSequence
[
dict
[
int
,
Logprob
]
|
None
],
tokenizer
:
TokenizerLike
,
tokenizer
:
TokenizerLike
|
None
,
num_output_top_logprobs
:
int
|
None
=
None
,
return_as_token_id
:
bool
|
None
=
None
,
)
->
ChatCompletionLogProbs
:
...
...
@@ -1690,6 +1702,11 @@ class OpenAIServingChat(OpenAIServing):
if
should_return_as_token_id
:
token
=
f
"token_id:
{
token_id
}
"
else
:
if
tokenizer
is
None
:
raise
ValueError
(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
token
=
tokenizer
.
decode
(
token_id
)
logprobs_content
.
append
(
...
...
@@ -1800,10 +1817,10 @@ class OpenAIServingChat(OpenAIServing):
# Render prompt token ids.
prompt_token_ids
=
render_for_completion
(
messages
)
engine_prompt
=
Engine
TokensPrompt
(
prompt_token_ids
=
prompt_token_ids
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
prompt_token_ids
)
# Add cache_salt if provided in the request
if
request
.
cache_salt
is
not
None
:
engine_prompt
[
"cache_salt"
]
=
request
.
cache_salt
return
messages
,
[
prompt_token_ids
],
[
engine_prompt
]
return
messages
,
[
engine_prompt
]
vllm/entrypoints/openai/serving_engine.py
View file @
dcb31196
...
...
@@ -5,60 +5,19 @@ import json
import
sys
import
time
import
traceback
from
collections.abc
import
AsyncGenerator
,
Callable
,
Iterable
,
Mapping
,
Sequence
from
collections.abc
import
AsyncGenerator
,
Callable
,
Iterable
,
Mapping
from
concurrent.futures
import
ThreadPoolExecutor
from
dataclasses
import
dataclass
,
field
from
http
import
HTTPStatus
from
typing
import
Any
,
ClassVar
,
Generic
,
TypeAlias
,
TypeVar
import
numpy
as
np
import
torch
from
fastapi
import
Request
from
pydantic
import
ConfigDict
,
TypeAdapter
from
starlette.datastructures
import
Headers
from
typing_extensions
import
TypeIs
from
vllm.entrypoints.context
import
(
HarmonyContext
,
ParsableContext
,
StreamingHarmonyContext
,
)
from
vllm.entrypoints.openai.protocol
import
(
FunctionCall
,
ResponseInputOutputItem
,
ResponsesRequest
,
)
from
vllm.entrypoints.pooling.classify.protocol
import
(
ClassificationChatRequest
,
ClassificationCompletionRequest
,
ClassificationRequest
,
ClassificationResponse
,
)
from
vllm.entrypoints.pooling.embed.protocol
import
(
EmbeddingChatRequest
,
EmbeddingCompletionRequest
,
EmbeddingRequest
,
EmbeddingResponse
,
)
from
vllm.entrypoints.pooling.pooling.protocol
import
(
IOProcessorRequest
,
PoolingResponse
,
)
from
vllm.entrypoints.pooling.score.protocol
import
(
RerankRequest
,
ScoreRequest
,
ScoreResponse
,
)
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
if
sys
.
version_info
>=
(
3
,
12
):
from
typing
import
TypedDict
else
:
from
typing_extensions
import
TypedDict
from
openai.types.responses
import
(
ToolChoiceFunction
,
)
from
pydantic
import
ConfigDict
,
TypeAdapter
from
starlette.datastructures
import
Headers
import
vllm.envs
as
envs
from
vllm.beam_search
import
BeamSearchSequence
,
create_sort_beams_key_function
...
...
@@ -72,7 +31,12 @@ from vllm.entrypoints.chat_utils import (
parse_chat_messages_futures
,
resolve_chat_template_content_format
,
)
from
vllm.entrypoints.context
import
ConversationContext
from
vllm.entrypoints.context
import
(
ConversationContext
,
HarmonyContext
,
ParsableContext
,
StreamingHarmonyContext
,
)
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionNamedToolChoiceParam
,
...
...
@@ -83,7 +47,10 @@ from vllm.entrypoints.openai.protocol import (
DetokenizeRequest
,
ErrorInfo
,
ErrorResponse
,
FunctionCall
,
FunctionDefinition
,
ResponseInputOutputItem
,
ResponsesRequest
,
TokenizeChatRequest
,
TokenizeCompletionRequest
,
TokenizeResponse
,
...
...
@@ -93,14 +60,34 @@ from vllm.entrypoints.openai.protocol import (
)
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.openai.tool_parsers
import
ToolParser
,
ToolParserManager
from
vllm.entrypoints.pooling.classify.protocol
import
(
ClassificationChatRequest
,
ClassificationCompletionRequest
,
ClassificationRequest
,
ClassificationResponse
,
)
from
vllm.entrypoints.pooling.embed.protocol
import
(
EmbeddingChatRequest
,
EmbeddingCompletionRequest
,
EmbeddingRequest
,
EmbeddingResponse
,
)
from
vllm.entrypoints.pooling.pooling.protocol
import
(
IOProcessorRequest
,
PoolingResponse
,
)
from
vllm.entrypoints.pooling.score.protocol
import
(
RerankRequest
,
ScoreRequest
,
ScoreResponse
,
)
from
vllm.entrypoints.renderer
import
BaseRenderer
,
CompletionRenderer
,
RenderConfig
from
vllm.entrypoints.responses_utils
import
(
construct_input_messages
,
)
from
vllm.entrypoints.serve.disagg.protocol
import
GenerateRequest
,
GenerateResponse
from
vllm.entrypoints.utils
import
_validate_truncation_size
from
vllm.inputs.data
import
PromptType
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.inputs.data
import
PromptType
,
TokensPrompt
from
vllm.inputs.parse
import
(
PromptComponents
,
get_prompt_components
,
...
...
@@ -109,10 +96,7 @@ from vllm.inputs.parse import (
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
,
PromptLogprobs
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal
import
(
# noqa: F401 - Required to resolve Pydantic error in RequestProcessingMixin
MultiModalDataDict
,
MultiModalUUIDDict
,
)
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.outputs
import
CompletionOutput
,
PoolingRequestOutput
,
RequestOutput
from
vllm.pooling_params
import
PoolingParams
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
...
...
@@ -185,34 +169,6 @@ AnyResponse: TypeAlias = (
)
class
TextTokensPrompt
(
TypedDict
):
prompt
:
str
prompt_token_ids
:
list
[
int
]
class
EmbedsPrompt
(
TypedDict
):
prompt_embeds
:
torch
.
Tensor
RequestPrompt
:
TypeAlias
=
list
[
int
]
|
str
|
TextTokensPrompt
|
EmbedsPrompt
def
is_text_tokens_prompt
(
prompt
:
RequestPrompt
)
->
TypeIs
[
TextTokensPrompt
]:
return
(
isinstance
(
prompt
,
dict
)
and
"prompt_token_ids"
in
prompt
and
"prompt_embeds"
not
in
prompt
)
def
is_embeds_prompt
(
prompt
:
RequestPrompt
)
->
TypeIs
[
EmbedsPrompt
]:
return
(
isinstance
(
prompt
,
dict
)
and
"prompt_token_ids"
not
in
prompt
and
"prompt_embeds"
in
prompt
)
RequestT
=
TypeVar
(
"RequestT"
,
bound
=
AnyRequest
)
...
...
@@ -223,8 +179,7 @@ class RequestProcessingMixin:
handling prompt preparation and engine input.
"""
request_prompts
:
Sequence
[
RequestPrompt
]
|
None
=
field
(
default_factory
=
list
)
engine_prompts
:
list
[
EngineTokensPrompt
]
|
None
=
field
(
default_factory
=
list
)
engine_prompts
:
list
[
TokensPrompt
]
|
None
=
field
(
default_factory
=
list
)
@
dataclass
(
kw_only
=
True
)
...
...
@@ -425,7 +380,7 @@ class OpenAIServing:
prompts_batch
,
lora_req_batch
=
zip
(
*
[
(
Engine
TokensPrompt
(
TokensPrompt
(
prompt_token_ids
=
beam
.
tokens
,
multi_modal_data
=
beam
.
multi_modal_data
,
mm_processor_kwargs
=
beam
.
mm_processor_kwargs
,
...
...
@@ -947,7 +902,7 @@ class OpenAIServing:
prompt
:
str
,
tokenizer
:
TokenizerLike
,
add_special_tokens
:
bool
,
)
->
Text
TokensPrompt
:
)
->
TokensPrompt
:
async_tokenizer
=
self
.
_get_async_tokenizer
(
tokenizer
)
if
(
...
...
@@ -988,7 +943,7 @@ class OpenAIServing:
request
:
AnyRequest
,
prompt_ids
:
list
[
int
],
tokenizer
:
TokenizerLike
|
None
,
)
->
Text
TokensPrompt
:
)
->
TokensPrompt
:
truncate_prompt_tokens
=
getattr
(
request
,
"truncate_prompt_tokens"
,
None
)
if
truncate_prompt_tokens
is
None
:
...
...
@@ -1011,7 +966,7 @@ class OpenAIServing:
request
:
AnyRequest
,
input_ids
:
list
[
int
],
input_text
:
str
,
)
->
Text
TokensPrompt
:
)
->
TokensPrompt
:
token_num
=
len
(
input_ids
)
# Note: EmbeddingRequest, ClassificationRequest,
...
...
@@ -1042,7 +997,7 @@ class OpenAIServing:
f
"
{
token_num
}
tokens in the input for
{
operation
}
. "
f
"Please reduce the length of the input."
)
return
Text
TokensPrompt
(
prompt
=
input_text
,
prompt_token_ids
=
input_ids
)
return
TokensPrompt
(
prompt
=
input_text
,
prompt_token_ids
=
input_ids
)
# Note: TokenizeRequest and DetokenizeRequest don't have max_tokens
# and do not require model context length validation
...
...
@@ -1050,7 +1005,7 @@ class OpenAIServing:
request
,
(
TokenizeCompletionRequest
,
TokenizeChatRequest
,
DetokenizeRequest
),
):
return
Text
TokensPrompt
(
prompt
=
input_text
,
prompt_token_ids
=
input_ids
)
return
TokensPrompt
(
prompt
=
input_text
,
prompt_token_ids
=
input_ids
)
# chat completion endpoint supports max_completion_tokens
if
isinstance
(
request
,
ChatCompletionRequest
):
...
...
@@ -1078,7 +1033,7 @@ class OpenAIServing:
f
" -
{
token_num
}
)."
)
return
Text
TokensPrompt
(
prompt
=
input_text
,
prompt_token_ids
=
input_ids
)
return
TokensPrompt
(
prompt
=
input_text
,
prompt_token_ids
=
input_ids
)
async
def
_tokenize_prompt_input_async
(
self
,
...
...
@@ -1086,7 +1041,7 @@ class OpenAIServing:
tokenizer
:
TokenizerLike
,
prompt_input
:
str
|
list
[
int
],
add_special_tokens
:
bool
=
True
,
)
->
Text
TokensPrompt
:
)
->
TokensPrompt
:
"""
A simpler implementation that tokenizes a single prompt input.
"""
...
...
@@ -1105,7 +1060,7 @@ class OpenAIServing:
tokenizer
:
TokenizerLike
,
prompt_inputs
:
Iterable
[
str
|
list
[
int
]],
add_special_tokens
:
bool
=
True
,
)
->
AsyncGenerator
[
Text
TokensPrompt
,
None
]:
)
->
AsyncGenerator
[
TokensPrompt
,
None
]:
"""
A simpler implementation that tokenizes multiple prompt inputs.
"""
...
...
@@ -1158,11 +1113,7 @@ class OpenAIServing:
chat_template_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
tool_parser
:
Callable
[[
TokenizerLike
],
ToolParser
]
|
None
=
None
,
add_special_tokens
:
bool
=
False
,
)
->
tuple
[
list
[
ConversationMessage
],
Sequence
[
RequestPrompt
],
list
[
EngineTokensPrompt
],
]:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
TokensPrompt
]]:
model_config
=
self
.
model_config
resolved_content_format
=
resolve_chat_template_content_format
(
...
...
@@ -1235,9 +1186,7 @@ class OpenAIServing:
"Prompt has to be a string"
,
"when the tokenizer is not initialised"
,
)
prompt_inputs
=
TextTokensPrompt
(
prompt
=
request_prompt
,
prompt_token_ids
=
[
1
]
)
prompt_inputs
=
TokensPrompt
(
prompt
=
request_prompt
,
prompt_token_ids
=
[
1
])
elif
isinstance
(
request_prompt
,
str
):
prompt_inputs
=
await
self
.
_tokenize_prompt_input_async
(
request
,
...
...
@@ -1250,14 +1199,15 @@ class OpenAIServing:
assert
is_list_of
(
request_prompt
,
int
),
(
"Prompt has to be either a string or a list of token ids"
)
prompt_inputs
=
Text
TokensPrompt
(
prompt_inputs
=
TokensPrompt
(
prompt
=
tokenizer
.
decode
(
request_prompt
),
prompt_token_ids
=
request_prompt
,
)
engine_prompt
=
EngineTokensPrompt
(
prompt_token_ids
=
prompt_inputs
[
"prompt_token_ids"
]
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
prompt_inputs
[
"prompt_token_ids"
])
if
"prompt"
in
prompt_inputs
:
engine_prompt
[
"prompt"
]
=
prompt_inputs
[
"prompt"
]
if
mm_data
is
not
None
:
engine_prompt
[
"multi_modal_data"
]
=
mm_data
...
...
@@ -1270,7 +1220,7 @@ class OpenAIServing:
if
hasattr
(
request
,
"cache_salt"
)
and
request
.
cache_salt
is
not
None
:
engine_prompt
[
"cache_salt"
]
=
request
.
cache_salt
return
conversation
,
[
request_prompt
],
[
engine_prompt
]
return
conversation
,
[
engine_prompt
]
async
def
_process_inputs
(
self
,
...
...
@@ -1302,7 +1252,7 @@ class OpenAIServing:
async
def
_render_next_turn
(
self
,
request
:
ResponsesRequest
,
tokenizer
:
Any
Tokenizer
,
tokenizer
:
Tokenizer
Like
|
None
,
messages
:
list
[
ResponseInputOutputItem
],
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
,
tool_parser
,
...
...
@@ -1313,7 +1263,7 @@ class OpenAIServing:
request_input
=
messages
,
)
_
,
request_prompts
,
engine_prompts
=
await
self
.
_preprocess_chat
(
_
,
engine_prompts
=
await
self
.
_preprocess_chat
(
request
,
tokenizer
,
new_messages
,
...
...
@@ -1322,20 +1272,20 @@ class OpenAIServing:
chat_template
=
chat_template
,
chat_template_content_format
=
chat_template_content_format
,
)
return
request_prompts
,
engine_prompts
return
engine_prompts
async
def
_generate_with_builtin_tools
(
self
,
request_id
:
str
,
request_prompt
:
RequestPrompt
,
engine_prompt
:
EngineTokensPrompt
,
engine_prompt
:
TokensPrompt
,
sampling_params
:
SamplingParams
,
context
:
ConversationContext
,
lora_request
:
LoRARequest
|
None
=
None
,
priority
:
int
=
0
,
**
kwargs
,
):
prompt_text
,
_
,
_
=
self
.
_get_prompt_components
(
request_prompt
)
prompt_text
,
_
,
_
=
self
.
_get_prompt_components
(
engine_prompt
)
orig_priority
=
priority
sub_request
=
0
while
True
:
...
...
@@ -1343,7 +1293,7 @@ class OpenAIServing:
sub_request_id
=
f
"
{
request_id
}
_
{
sub_request
}
"
self
.
_log_inputs
(
sub_request_id
,
request
_prompt
,
engine
_prompt
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
...
...
@@ -1388,10 +1338,9 @@ class OpenAIServing:
# Render the next prompt token ids.
if
isinstance
(
context
,
(
HarmonyContext
,
StreamingHarmonyContext
)):
prompt_token_ids
=
context
.
render_for_completion
()
engine_prompt
=
EngineTokensPrompt
(
prompt_token_ids
=
prompt_token_ids
)
request_prompt
=
prompt_token_ids
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
prompt_token_ids
)
elif
isinstance
(
context
,
ParsableContext
):
request_prompts
,
engine_prompts
=
await
self
.
_render_next_turn
(
engine_prompts
=
await
self
.
_render_next_turn
(
context
.
request
,
context
.
tokenizer
,
context
.
parser
.
response_messages
,
...
...
@@ -1401,8 +1350,7 @@ class OpenAIServing:
context
.
chat_template_content_format
,
)
engine_prompt
=
engine_prompts
[
0
]
request_prompt
=
request_prompts
[
0
]
prompt_text
,
_
,
_
=
self
.
_get_prompt_components
(
request_prompt
)
prompt_text
,
_
,
_
=
self
.
_get_prompt_components
(
engine_prompt
)
# Update the sampling params.
sampling_params
.
max_tokens
=
self
.
max_model_len
-
len
(
...
...
@@ -1412,19 +1360,13 @@ class OpenAIServing:
priority
=
orig_priority
-
1
sub_request
+=
1
def
_get_prompt_components
(
self
,
prompt
:
RequestPrompt
|
PromptType
,
)
->
PromptComponents
:
if
isinstance
(
prompt
,
list
):
return
PromptComponents
(
token_ids
=
prompt
)
return
get_prompt_components
(
prompt
)
# type: ignore[arg-type]
def
_get_prompt_components
(
self
,
prompt
:
PromptType
)
->
PromptComponents
:
return
get_prompt_components
(
prompt
)
def
_log_inputs
(
self
,
request_id
:
str
,
inputs
:
RequestPrompt
|
PromptType
,
inputs
:
PromptType
,
params
:
SamplingParams
|
PoolingParams
|
BeamSearchParams
|
None
,
lora_request
:
LoRARequest
|
None
,
)
->
None
:
...
...
@@ -1486,7 +1428,7 @@ class OpenAIServing:
@
staticmethod
def
_parse_tool_calls_from_content
(
request
:
ResponsesRequest
|
ChatCompletionRequest
,
tokenizer
:
TokenizerLike
,
tokenizer
:
TokenizerLike
|
None
,
enable_auto_tools
:
bool
,
tool_parser_cls
:
Callable
[[
TokenizerLike
],
ToolParser
]
|
None
,
content
:
str
|
None
=
None
,
...
...
@@ -1526,6 +1468,11 @@ class OpenAIServing:
and
enable_auto_tools
and
(
request
.
tool_choice
==
"auto"
or
request
.
tool_choice
is
None
)
):
if
tokenizer
is
None
:
raise
ValueError
(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
# Automatic Tool Call Parsing
try
:
tool_parser
=
tool_parser_cls
(
tokenizer
)
...
...
vllm/entrypoints/openai/serving_responses.py
View file @
dcb31196
...
...
@@ -107,7 +107,7 @@ from vllm.entrypoints.responses_utils import (
make_response_output_items_from_parsable_context
,
)
from
vllm.entrypoints.tool_server
import
ToolServer
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.inputs.data
import
TokensPrompt
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
as
SampleLogprob
from
vllm.logprobs
import
SampleLogprobs
...
...
@@ -258,7 +258,7 @@ class OpenAIServingResponses(OpenAIServing):
self
.
tool_server
=
tool_server
def
_validate_generator_input
(
self
,
engine_prompt
:
Engine
TokensPrompt
self
,
engine_prompt
:
TokensPrompt
)
->
ErrorResponse
|
None
:
"""Add validations to the input to the generator here."""
if
self
.
max_model_len
<=
len
(
engine_prompt
[
"prompt_token_ids"
]):
...
...
@@ -353,11 +353,11 @@ class OpenAIServingResponses(OpenAIServing):
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
()
if
self
.
use_harmony
:
messages
,
request_prompts
,
engine_prompts
=
(
self
.
_make_request_with_harmony
(
request
,
prev_response
)
messages
,
engine_prompts
=
self
.
_make_request_with_harmony
(
request
,
prev_response
)
else
:
messages
,
request_prompts
,
engine_prompts
=
await
self
.
_make_request
(
messages
,
engine_prompts
=
await
self
.
_make_request
(
request
,
prev_response
,
tokenizer
)
...
...
@@ -393,7 +393,7 @@ class OpenAIServingResponses(OpenAIServing):
assert
len
(
builtin_tool_list
)
==
0
available_tools
=
[]
try
:
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
)
:
for
engine_prompt
in
engine_prompts
:
maybe_error
=
self
.
_validate_generator_input
(
engine_prompt
)
if
maybe_error
is
not
None
:
return
maybe_error
...
...
@@ -449,7 +449,6 @@ class OpenAIServingResponses(OpenAIServing):
)
generator
=
self
.
_generate_with_builtin_tools
(
request_id
=
request
.
request_id
,
request_prompt
=
request_prompts
[
i
],
engine_prompt
=
engine_prompt
,
sampling_params
=
sampling_params
,
context
=
context
,
...
...
@@ -564,7 +563,7 @@ class OpenAIServingResponses(OpenAIServing):
prev_msg
=
self
.
msg_store
.
get
(
prev_response
.
id
)
if
prev_response
else
None
,
prev_response_output
=
prev_response
.
output
if
prev_response
else
None
,
)
_
,
request_prompts
,
engine_prompts
=
await
self
.
_preprocess_chat
(
_
,
engine_prompts
=
await
self
.
_preprocess_chat
(
request
,
tokenizer
,
messages
,
...
...
@@ -573,7 +572,7 @@ class OpenAIServingResponses(OpenAIServing):
chat_template
=
self
.
chat_template
,
chat_template_content_format
=
self
.
chat_template_content_format
,
)
return
messages
,
request_prompts
,
engine_prompts
return
messages
,
engine_prompts
def
_make_request_with_harmony
(
self
,
...
...
@@ -586,13 +585,13 @@ class OpenAIServingResponses(OpenAIServing):
)
messages
=
self
.
_construct_input_messages_with_harmony
(
request
,
prev_response
)
prompt_token_ids
=
render_for_completion
(
messages
)
engine_prompt
=
Engine
TokensPrompt
(
prompt_token_ids
=
prompt_token_ids
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
prompt_token_ids
)
# Add cache_salt if provided in the request
if
request
.
cache_salt
is
not
None
:
engine_prompt
[
"cache_salt"
]
=
request
.
cache_salt
return
messages
,
[
prompt_token_ids
],
[
engine_prompt
]
return
messages
,
[
engine_prompt
]
async
def
_initialize_tool_sessions
(
self
,
...
...
vllm/entrypoints/pooling/classify/serving.py
View file @
dcb31196
...
...
@@ -72,11 +72,7 @@ class ClassificationMixin(OpenAIServing):
if
ret
:
return
ret
(
_
,
_
,
engine_prompts
,
)
=
await
self
.
_preprocess_chat
(
_
,
engine_prompts
=
await
self
.
_preprocess_chat
(
cast
(
ChatCompletionRequest
,
chat_request
),
ctx
.
tokenizer
,
messages
,
...
...
vllm/entrypoints/pooling/embed/serving.py
View file @
dcb31196
...
...
@@ -20,7 +20,6 @@ from vllm.entrypoints.openai.serving_engine import (
EmbeddingServeContext
,
OpenAIServing
,
ServeContext
,
TextTokensPrompt
,
)
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.pooling.embed.protocol
import
(
...
...
@@ -32,7 +31,7 @@ from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingResponseData
,
)
from
vllm.entrypoints.renderer
import
RenderConfig
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.inputs.data
import
TokensPrompt
from
vllm.logger
import
init_logger
from
vllm.outputs
import
(
EmbeddingRequestOutput
,
...
...
@@ -83,11 +82,7 @@ class EmbeddingMixin(OpenAIServing):
renderer
=
self
.
_get_renderer
(
tokenizer
)
if
isinstance
(
ctx
.
request
,
EmbeddingChatRequest
):
(
_
,
_
,
ctx
.
engine_prompts
,
)
=
await
self
.
_preprocess_chat
(
_
,
ctx
.
engine_prompts
=
await
self
.
_preprocess_chat
(
ctx
.
request
,
tokenizer
,
ctx
.
request
.
messages
,
...
...
@@ -209,14 +204,13 @@ class EmbeddingMixin(OpenAIServing):
async
def
_process_chunked_request
(
self
,
ctx
:
EmbeddingServeContext
,
original_prompt
:
TextTokensPrompt
,
token_ids
:
list
[
int
]
,
pooling_params
,
trace_headers
,
prompt_idx
:
int
,
)
->
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]:
"""Process a single prompt using chunked processing."""
generators
:
list
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
token_ids
=
original_prompt
[
"prompt_token_ids"
]
# Split into chunks using max_position_embeddings
max_pos_embeddings
=
self
.
_get_max_position_embeddings
()
...
...
@@ -228,18 +222,12 @@ class EmbeddingMixin(OpenAIServing):
chunk_request_id
=
f
"
{
ctx
.
request_id
}
-prompt-
{
prompt_idx
}
-chunk-
{
chunk_idx
}
"
# Create engine prompt for this chunk
chunk_engine_prompt
=
EngineTokensPrompt
(
prompt_token_ids
=
chunk_tokens
)
# Create chunk request prompt for logging
chunk_text
=
""
chunk_request_prompt
=
TextTokensPrompt
(
prompt
=
chunk_text
,
prompt_token_ids
=
chunk_tokens
)
chunk_engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
chunk_tokens
)
# Log the chunk
self
.
_log_inputs
(
chunk_request_id
,
chunk_
request
_prompt
,
chunk_
engine
_prompt
,
params
=
pooling_params
,
lora_request
=
ctx
.
lora_request
,
)
...
...
@@ -263,7 +251,7 @@ class EmbeddingMixin(OpenAIServing):
request
,
input_ids
:
list
[
int
],
input_text
:
str
,
)
->
Text
TokensPrompt
:
)
->
TokensPrompt
:
"""Override to support chunked processing for embedding requests."""
token_num
=
len
(
input_ids
)
...
...
@@ -328,23 +316,15 @@ class EmbeddingMixin(OpenAIServing):
)
)
return
Text
TokensPrompt
(
prompt
=
input_text
,
prompt_token_ids
=
input_ids
)
return
TokensPrompt
(
prompt
=
input_text
,
prompt_token_ids
=
input_ids
)
# For other request types, use the parent's implementation
return
super
().
_validate_input
(
request
,
input_ids
,
input_text
)
def
_is_text_tokens_prompt
(
self
,
prompt
)
->
bool
:
"""Check if a prompt is a TextTokensPrompt (has prompt_token_ids)."""
return
(
isinstance
(
prompt
,
dict
)
and
"prompt_token_ids"
in
prompt
and
"prompt_embeds"
not
in
prompt
)
async
def
_create_single_prompt_generator
(
self
,
ctx
:
EmbeddingServeContext
,
engine_prompt
:
Engine
TokensPrompt
,
engine_prompt
:
TokensPrompt
,
pooling_params
:
PoolingParams
,
trace_headers
:
Mapping
[
str
,
str
]
|
None
,
prompt_index
:
int
,
...
...
@@ -413,14 +393,16 @@ class EmbeddingMixin(OpenAIServing):
for
i
,
engine_prompt
in
enumerate
(
ctx
.
engine_prompts
):
# Check if this specific prompt needs chunked processing
if
self
.
_is_text_tokens_prompt
(
engine_prompt
):
# Cast to TextTokensPrompt since we've verified
# prompt_token_ids
text_tokens_prompt
=
cast
(
TextTokensPrompt
,
engine_prompt
)
if
len
(
text_tokens_prompt
[
"prompt_token_ids"
])
>
max_pos_embeddings
:
if
"prompt_token_ids"
in
engine_prompt
:
prompt_token_ids
=
engine_prompt
[
"prompt_token_ids"
]
if
len
(
prompt_token_ids
)
>
max_pos_embeddings
:
# Use chunked processing for this prompt
chunk_generators
=
await
self
.
_process_chunked_request
(
ctx
,
text_tokens_prompt
,
pooling_params
,
trace_headers
,
i
ctx
,
prompt_token_ids
,
pooling_params
,
trace_headers
,
i
,
)
generators
.
extend
(
chunk_generators
)
continue
...
...
@@ -578,14 +560,13 @@ class EmbeddingMixin(OpenAIServing):
# Get original prompt token IDs for this prompt
original_prompt
=
ctx
.
engine_prompts
[
prompt_idx
]
if
not
self
.
_is_text_tokens_prompt
(
original_prompt
)
:
if
"prompt_token_ids"
not
in
original_prompt
:
return
self
.
create_error_response
(
f
"Chunked prompt
{
prompt_idx
}
is not a TextTokensPrompt"
f
"Chunked prompt
{
prompt_idx
}
does not contain "
"token IDs"
)
original_token_ids
=
cast
(
TextTokensPrompt
,
original_prompt
)[
"prompt_token_ids"
]
original_token_ids
=
original_prompt
[
"prompt_token_ids"
]
pooling_request_output
=
PoolingRequestOutput
(
request_id
=
aggregator
[
"request_id"
],
...
...
vllm/entrypoints/pooling/pooling/serving.py
View file @
dcb31196
...
...
@@ -137,11 +137,8 @@ class OpenAIServingPooling(OpenAIServing):
)
if
error_check_ret
is
not
None
:
return
error_check_ret
(
_
,
_
,
engine_prompts
,
)
=
await
self
.
_preprocess_chat
(
_
,
engine_prompts
=
await
self
.
_preprocess_chat
(
request
,
tokenizer
,
request
.
messages
,
...
...
vllm/entrypoints/renderer.py
View file @
dcb31196
...
...
@@ -12,9 +12,7 @@ import torch
from
pydantic
import
Field
from
vllm.config
import
ModelConfig
from
vllm.inputs.data
import
EmbedsPrompt
as
EngineEmbedsPrompt
from
vllm.inputs.data
import
TextPrompt
as
EngineTextPrompt
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.inputs.data
import
EmbedsPrompt
,
TextPrompt
,
TokensPrompt
from
vllm.inputs.parse
import
get_prompt_components
,
parse_raw_prompts
from
vllm.tokenizers
import
TokenizerLike
from
vllm.utils.async_utils
import
AsyncMicrobatchTokenizer
...
...
@@ -97,7 +95,7 @@ class BaseRenderer(ABC):
*
,
prompt_or_prompts
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]],
config
:
RenderConfig
,
)
->
list
[
Engine
TokensPrompt
]:
)
->
list
[
TokensPrompt
]:
"""
Convert text or token inputs into engine-ready TokensPrompt objects.
...
...
@@ -115,7 +113,7 @@ class BaseRenderer(ABC):
(e.g., tokenization and length handling).
Returns:
list[
Engine
TokensPrompt]: Engine-ready token prompts.
list[TokensPrompt]: Engine-ready token prompts.
Raises:
ValueError: If input formats are invalid or length limits exceeded.
...
...
@@ -129,7 +127,7 @@ class BaseRenderer(ABC):
prompt_or_prompts
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]]
|
None
=
None
,
prompt_embeds
:
bytes
|
list
[
bytes
]
|
None
=
None
,
config
:
RenderConfig
,
)
->
list
[
Engine
TokensPrompt
|
Engine
EmbedsPrompt
]:
)
->
list
[
TokensPrompt
|
EmbedsPrompt
]:
"""
Convert text/token and/or base64-encoded embeddings inputs into
engine-ready prompt objects using a unified RenderConfig.
...
...
@@ -146,7 +144,7 @@ class BaseRenderer(ABC):
(e.g., tokenization and length handling).
Returns:
list[Union[
Engine
TokensPrompt,
Engine
EmbedsPrompt]]:
list[Union[TokensPrompt, EmbedsPrompt]]:
Engine-ready prompt objects.
Raises:
...
...
@@ -161,14 +159,14 @@ class BaseRenderer(ABC):
prompt_embeds
:
bytes
|
list
[
bytes
],
truncate_prompt_tokens
:
Annotated
[
int
,
Field
(
ge
=
0
)]
|
None
=
None
,
cache_salt
:
str
|
None
=
None
,
)
->
list
[
Engine
EmbedsPrompt
]:
)
->
list
[
EmbedsPrompt
]:
"""Load and validate base64-encoded embeddings into prompt objects."""
if
not
self
.
model_config
.
enable_prompt_embeds
:
raise
ValueError
(
"You must set `--enable-prompt-embeds` to input `prompt_embeds`."
)
def
_load_and_validate_embed
(
embed
:
bytes
)
->
Engine
EmbedsPrompt
:
def
_load_and_validate_embed
(
embed
:
bytes
)
->
EmbedsPrompt
:
tensor
=
torch
.
load
(
io
.
BytesIO
(
pybase64
.
b64decode
(
embed
,
validate
=
True
)),
weights_only
=
True
,
...
...
@@ -185,7 +183,7 @@ class BaseRenderer(ABC):
assert
tensor
.
dim
()
==
2
if
truncate_prompt_tokens
is
not
None
:
tensor
=
tensor
[
-
truncate_prompt_tokens
:]
embeds_prompt
=
Engine
EmbedsPrompt
(
prompt_embeds
=
tensor
)
embeds_prompt
=
EmbedsPrompt
(
prompt_embeds
=
tensor
)
if
cache_salt
is
not
None
:
embeds_prompt
[
"cache_salt"
]
=
cache_salt
return
embeds_prompt
...
...
@@ -213,7 +211,7 @@ class CompletionRenderer(BaseRenderer):
*
,
prompt_or_prompts
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]],
config
:
RenderConfig
,
)
->
list
[
Engine
TokensPrompt
]:
)
->
list
[
TokensPrompt
]:
"""Implementation of prompt rendering for completion-style requests.
Uses async tokenizer pooling for improved performance. See base class
...
...
@@ -240,7 +238,7 @@ class CompletionRenderer(BaseRenderer):
prompt_or_prompts
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]]
|
None
=
None
,
prompt_embeds
:
bytes
|
list
[
bytes
]
|
None
=
None
,
config
:
RenderConfig
,
)
->
list
[
Engine
TokensPrompt
|
Engine
EmbedsPrompt
]:
)
->
list
[
TokensPrompt
|
EmbedsPrompt
]:
"""
Render text/token prompts and/or precomputed embedding prompts. At
least one of `prompt_or_prompts` or `prompt_embeds` must be provided.
...
...
@@ -249,7 +247,7 @@ class CompletionRenderer(BaseRenderer):
if
truncate_prompt_tokens
==
0
:
return
[]
rendered
:
list
[
Engine
TokensPrompt
|
Engine
EmbedsPrompt
]
=
[]
rendered
:
list
[
TokensPrompt
|
EmbedsPrompt
]
=
[]
if
prompt_embeds
is
not
None
:
rendered
.
extend
(
...
...
@@ -281,10 +279,10 @@ class CompletionRenderer(BaseRenderer):
async
def
_create_prompt
(
self
,
prompt_input
:
Engine
TextPrompt
|
Engine
TokensPrompt
,
prompt_input
:
TextPrompt
|
TokensPrompt
,
config
:
RenderConfig
,
truncate_prompt_tokens
:
int
|
None
,
)
->
Engine
TokensPrompt
:
)
->
TokensPrompt
:
prompt
,
prompt_token_ids
,
_
=
get_prompt_components
(
prompt_input
)
if
prompt_token_ids
is
not
None
:
...
...
@@ -317,7 +315,7 @@ class CompletionRenderer(BaseRenderer):
truncate_prompt_tokens
:
int
|
None
,
add_special_tokens
:
bool
,
cache_salt
:
str
|
None
,
)
->
Engine
TokensPrompt
:
)
->
TokensPrompt
:
"""Tokenize text input asynchronously."""
async_tokenizer
=
self
.
_get_async_tokenizer
()
...
...
@@ -350,7 +348,7 @@ class CompletionRenderer(BaseRenderer):
truncate_prompt_tokens
:
int
|
None
,
cache_salt
:
str
|
None
,
needs_detokenization
:
bool
|
None
=
False
,
)
->
Engine
TokensPrompt
:
)
->
TokensPrompt
:
"""Optionally detokenize token IDs and build a tokens prompt."""
token_ids
=
self
.
_maybe_apply_truncation
(
token_ids
,
truncate_prompt_tokens
)
...
...
@@ -392,8 +390,8 @@ class CompletionRenderer(BaseRenderer):
max_length
:
int
|
None
=
None
,
cache_salt
:
str
|
None
=
None
,
prompt
:
str
|
None
=
None
,
)
->
Engine
TokensPrompt
:
"""Create validated
Engine
TokensPrompt."""
)
->
TokensPrompt
:
"""Create validated TokensPrompt."""
if
max_length
is
not
None
and
len
(
token_ids
)
>
max_length
:
raise
ValueError
(
f
"This model's maximum context length is
{
max_length
}
tokens. "
...
...
@@ -401,7 +399,7 @@ class CompletionRenderer(BaseRenderer):
"Please reduce the length of the input messages."
)
tokens_prompt
=
Engine
TokensPrompt
(
prompt_token_ids
=
token_ids
)
tokens_prompt
=
TokensPrompt
(
prompt_token_ids
=
token_ids
)
if
cache_salt
is
not
None
:
tokens_prompt
[
"cache_salt"
]
=
cache_salt
if
prompt
is
not
None
:
...
...
vllm/entrypoints/serve/disagg/serving.py
View file @
dcb31196
...
...
@@ -27,7 +27,7 @@ from vllm.entrypoints.serve.disagg.protocol import (
GenerateResponse
,
GenerateResponseChoice
,
)
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.inputs.data
import
TokensPrompt
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
RequestOutput
...
...
@@ -99,7 +99,7 @@ class ServingTokens(OpenAIServing):
# TODO(NickLucche): Change to EngineCoreRequest once Renderer work is
# completed
engine_prompt
=
Engine
TokensPrompt
(
prompt_token_ids
=
request
.
token_ids
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
request
.
token_ids
)
if
request
.
features
is
not
None
:
engine_prompt
[
"multi_modal_data"
]
=
None
...
...
@@ -115,7 +115,7 @@ class ServingTokens(OpenAIServing):
self
.
_log_inputs
(
request_id
,
request
.
token_ids
,
TokensPrompt
(
prompt_token_ids
=
request
.
token_ids
)
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
...
...
vllm/entrypoints/serve/tokenize/serving.py
View file @
dcb31196
...
...
@@ -21,6 +21,7 @@ from vllm.entrypoints.openai.protocol import (
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.renderer
import
RenderConfig
from
vllm.inputs
import
TokensPrompt
from
vllm.logger
import
init_logger
from
vllm.tokenizers
import
TokenizerLike
...
...
@@ -80,11 +81,8 @@ class OpenAIServingTokenization(OpenAIServing):
)
if
error_check_ret
is
not
None
:
return
error_check_ret
(
_
,
_
,
engine_prompts
,
)
=
await
self
.
_preprocess_chat
(
_
,
engine_prompts
=
await
self
.
_preprocess_chat
(
request
,
tokenizer
,
request
.
messages
,
...
...
@@ -141,7 +139,10 @@ class OpenAIServingTokenization(OpenAIServing):
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
()
self
.
_log_inputs
(
request_id
,
request
.
tokens
,
params
=
None
,
lora_request
=
lora_request
request_id
,
TokensPrompt
(
prompt_token_ids
=
request
.
tokens
),
params
=
None
,
lora_request
=
lora_request
,
)
prompt_input
=
await
self
.
_tokenize_prompt_input_async
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment