Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
59192dfd
Unverified
Commit
59192dfd
authored
Mar 17, 2026
by
Sage
Committed by
GitHub
Mar 17, 2026
Browse files
[Frontend] Complete OpenAI render delegation (#37287)
Signed-off-by:
Sage Ahrac
<
sagiahrak@gmail.com
>
parent
56cb1baa
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
140 additions
and
244 deletions
+140
-244
tests/entrypoints/openai/test_serving_responses.py
tests/entrypoints/openai/test_serving_responses.py
+4
-0
vllm/entrypoints/openai/engine/serving.py
vllm/entrypoints/openai/engine/serving.py
+3
-226
vllm/entrypoints/openai/generate/api_router.py
vllm/entrypoints/openai/generate/api_router.py
+2
-0
vllm/entrypoints/openai/responses/serving.py
vllm/entrypoints/openai/responses/serving.py
+112
-3
vllm/entrypoints/pooling/__init__.py
vllm/entrypoints/pooling/__init__.py
+1
-0
vllm/entrypoints/pooling/pooling/serving.py
vllm/entrypoints/pooling/pooling/serving.py
+7
-4
vllm/entrypoints/serve/disagg/serving.py
vllm/entrypoints/serve/disagg/serving.py
+4
-1
vllm/entrypoints/serve/render/serving.py
vllm/entrypoints/serve/render/serving.py
+7
-10
No files found.
tests/entrypoints/openai/test_serving_responses.py
View file @
59192dfd
...
@@ -159,6 +159,7 @@ class TestInitializeToolSessions:
...
@@ -159,6 +159,7 @@ class TestInitializeToolSessions:
instance
=
OpenAIServingResponses
(
instance
=
OpenAIServingResponses
(
engine_client
=
engine_client
,
engine_client
=
engine_client
,
models
=
models
,
models
=
models
,
openai_serving_render
=
MagicMock
(),
request_logger
=
None
,
request_logger
=
None
,
chat_template
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
...
@@ -245,6 +246,7 @@ class TestValidateGeneratorInput:
...
@@ -245,6 +246,7 @@ class TestValidateGeneratorInput:
instance
=
OpenAIServingResponses
(
instance
=
OpenAIServingResponses
(
engine_client
=
engine_client
,
engine_client
=
engine_client
,
models
=
models
,
models
=
models
,
openai_serving_render
=
MagicMock
(),
request_logger
=
None
,
request_logger
=
None
,
chat_template
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
...
@@ -308,6 +310,7 @@ async def test_reasoning_tokens_counted_for_text_reasoning_model(monkeypatch):
...
@@ -308,6 +310,7 @@ async def test_reasoning_tokens_counted_for_text_reasoning_model(monkeypatch):
serving
=
OpenAIServingResponses
(
serving
=
OpenAIServingResponses
(
engine_client
=
engine_client
,
engine_client
=
engine_client
,
models
=
models
,
models
=
models
,
openai_serving_render
=
MagicMock
(),
request_logger
=
None
,
request_logger
=
None
,
chat_template
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
...
@@ -607,6 +610,7 @@ def _make_serving_instance_with_reasoning():
...
@@ -607,6 +610,7 @@ def _make_serving_instance_with_reasoning():
serving
=
OpenAIServingResponses
(
serving
=
OpenAIServingResponses
(
engine_client
=
engine_client
,
engine_client
=
engine_client
,
models
=
models
,
models
=
models
,
openai_serving_render
=
MagicMock
(),
request_logger
=
None
,
request_logger
=
None
,
chat_template
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
...
...
vllm/entrypoints/openai/engine/serving.py
View file @
59192dfd
...
@@ -4,7 +4,7 @@ import asyncio
...
@@ -4,7 +4,7 @@ import asyncio
import
contextlib
import
contextlib
import
json
import
json
import
time
import
time
from
collections.abc
import
AsyncGenerator
,
Callable
,
Mapping
,
Sequence
from
collections.abc
import
AsyncGenerator
,
Callable
,
Mapping
from
dataclasses
import
dataclass
,
field
from
dataclasses
import
dataclass
,
field
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
Any
,
ClassVar
,
Generic
,
Protocol
,
TypeAlias
,
TypeVar
from
typing
import
Any
,
ClassVar
,
Generic
,
Protocol
,
TypeAlias
,
TypeVar
...
@@ -22,9 +22,7 @@ from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
...
@@ -22,9 +22,7 @@ from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.chat_utils
import
(
from
vllm.entrypoints.chat_utils
import
(
ChatCompletionMessageParam
,
ChatTemplateContentFormatOption
,
ChatTemplateContentFormatOption
,
ConversationMessage
,
)
)
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.openai.chat_completion.protocol
import
(
from
vllm.entrypoints.openai.chat_completion.protocol
import
(
...
@@ -43,19 +41,9 @@ from vllm.entrypoints.openai.engine.protocol import (
...
@@ -43,19 +41,9 @@ from vllm.entrypoints.openai.engine.protocol import (
GenerationError
,
GenerationError
,
)
)
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.responses.context
import
(
ConversationContext
,
HarmonyContext
,
ParsableContext
,
StreamingHarmonyContext
,
)
from
vllm.entrypoints.openai.responses.protocol
import
(
from
vllm.entrypoints.openai.responses.protocol
import
(
ResponseInputOutputItem
,
ResponsesRequest
,
ResponsesRequest
,
)
)
from
vllm.entrypoints.openai.responses.utils
import
(
construct_input_messages
,
)
from
vllm.entrypoints.openai.speech_to_text.protocol
import
(
from
vllm.entrypoints.openai.speech_to_text.protocol
import
(
TranscriptionRequest
,
TranscriptionRequest
,
TranscriptionResponse
,
TranscriptionResponse
,
...
@@ -82,26 +70,22 @@ from vllm.entrypoints.serve.tokenize.protocol import (
...
@@ -82,26 +70,22 @@ from vllm.entrypoints.serve.tokenize.protocol import (
TokenizeCompletionRequest
,
TokenizeCompletionRequest
,
TokenizeResponse
,
TokenizeResponse
,
)
)
from
vllm.entrypoints.utils
import
create_error_response
,
get_max_tokens
from
vllm.entrypoints.utils
import
create_error_response
from
vllm.exceptions
import
VLLMValidationError
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs.data
import
(
from
vllm.inputs.data
import
(
ProcessorInputs
,
ProcessorInputs
,
PromptType
,
PromptType
,
SingletonPrompt
,
TokensPrompt
,
TokensPrompt
,
token_inputs
,
)
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
,
PromptLogprobs
from
vllm.logprobs
import
Logprob
,
PromptLogprobs
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.outputs
import
CompletionOutput
,
PoolingRequestOutput
,
RequestOutput
from
vllm.outputs
import
CompletionOutput
,
PoolingRequestOutput
,
RequestOutput
from
vllm.pooling_params
import
PoolingParams
from
vllm.pooling_params
import
PoolingParams
from
vllm.renderers
import
ChatParams
,
TokenizeParams
,
merge_kwargs
from
vllm.renderers
import
ChatParams
,
TokenizeParams
from
vllm.renderers.inputs.preprocess
import
(
from
vllm.renderers.inputs.preprocess
import
(
extract_prompt_components
,
extract_prompt_components
,
extract_prompt_len
,
extract_prompt_len
,
parse_model_prompt
,
prompt_to_seq
,
)
)
from
vllm.sampling_params
import
BeamSearchParams
,
SamplingParams
from
vllm.sampling_params
import
BeamSearchParams
,
SamplingParams
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers
import
TokenizerLike
...
@@ -116,7 +100,6 @@ from vllm.utils.async_utils import (
...
@@ -116,7 +100,6 @@ from vllm.utils.async_utils import (
collect_from_async_generator
,
collect_from_async_generator
,
merge_async_iterators
,
merge_async_iterators
,
)
)
from
vllm.utils.mistral
import
is_mistral_tokenizer
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -823,109 +806,6 @@ class OpenAIServing:
...
@@ -823,109 +806,6 @@ class OpenAIServing:
# Apply server defaults first, then request kwargs override.
# Apply server defaults first, then request kwargs override.
return
default_chat_template_kwargs
|
request_chat_template_kwargs
return
default_chat_template_kwargs
|
request_chat_template_kwargs
async
def
_preprocess_completion
(
self
,
request
:
RendererRequest
,
prompt_input
:
str
|
list
[
str
]
|
list
[
int
]
|
list
[
list
[
int
]]
|
None
,
prompt_embeds
:
bytes
|
list
[
bytes
]
|
None
,
)
->
list
[
ProcessorInputs
]:
prompts
=
list
[
SingletonPrompt
|
bytes
]()
if
prompt_embeds
is
not
None
:
# embeds take higher priority
prompts
.
extend
(
prompt_to_seq
(
prompt_embeds
))
if
prompt_input
is
not
None
:
prompts
.
extend
(
prompt_to_seq
(
prompt_input
))
return
await
self
.
_preprocess_cmpl
(
request
,
prompts
)
async
def
_preprocess_cmpl
(
self
,
request
:
RendererRequest
,
prompts
:
Sequence
[
PromptType
|
bytes
],
)
->
list
[
ProcessorInputs
]:
renderer
=
self
.
renderer
model_config
=
self
.
model_config
parsed_prompts
=
[
(
prompt
if
isinstance
(
prompt
,
bytes
)
else
parse_model_prompt
(
model_config
,
prompt
)
)
for
prompt
in
prompts
]
tok_params
=
request
.
build_tok_params
(
model_config
)
return
await
renderer
.
render_cmpl_async
(
parsed_prompts
,
tok_params
,
prompt_extras
=
{
k
:
v
for
k
in
(
"mm_processor_kwargs"
,
"cache_salt"
)
if
(
v
:
=
getattr
(
request
,
k
,
None
))
is
not
None
},
)
async
def
_preprocess_chat
(
self
,
request
:
RendererChatRequest
,
messages
:
list
[
ChatCompletionMessageParam
],
default_template
:
str
|
None
,
default_template_content_format
:
ChatTemplateContentFormatOption
,
default_template_kwargs
:
dict
[
str
,
Any
]
|
None
,
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tool_parser
:
Callable
[[
TokenizerLike
],
ToolParser
]
|
None
=
None
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]:
renderer
=
self
.
renderer
default_template_kwargs
=
merge_kwargs
(
default_template_kwargs
,
dict
(
tools
=
tool_dicts
,
tokenize
=
is_mistral_tokenizer
(
renderer
.
tokenizer
),
),
)
mm_config
=
self
.
model_config
.
multimodal_config
tok_params
=
request
.
build_tok_params
(
self
.
model_config
)
chat_params
=
request
.
build_chat_params
(
default_template
,
default_template_content_format
).
with_defaults
(
default_template_kwargs
,
default_media_io_kwargs
=
(
mm_config
.
media_io_kwargs
if
mm_config
else
None
),
default_mm_processor_kwargs
=
getattr
(
request
,
"mm_processor_kwargs"
,
None
),
)
(
conversation
,),
(
engine_prompt
,)
=
await
renderer
.
render_chat_async
(
[
messages
],
chat_params
,
tok_params
,
prompt_extras
=
{
k
:
v
for
k
in
(
"mm_processor_kwargs"
,
"cache_salt"
)
if
(
v
:
=
getattr
(
request
,
k
,
None
))
is
not
None
},
)
# tool parsing is done only if a tool_parser has been set and if
# tool_choice is not "none" (if tool_choice is "none" but a tool_parser
# is set, we want to prevent parsing a tool_call hallucinated by the LLM
if
tool_parser
is
not
None
:
tool_choice
=
getattr
(
request
,
"tool_choice"
,
"none"
)
if
tool_choice
!=
"none"
:
if
not
isinstance
(
request
,
ChatCompletionRequest
|
ResponsesRequest
):
msg
=
(
"Tool usage is only supported for Chat Completions API "
"or Responses API requests."
)
raise
NotImplementedError
(
msg
)
# TODO: Update adjust_request to accept ResponsesRequest
tokenizer
=
renderer
.
get_tokenizer
()
request
=
tool_parser
(
tokenizer
).
adjust_request
(
request
=
request
)
# type: ignore[arg-type]
return
conversation
,
[
engine_prompt
]
def
_extract_prompt_components
(
self
,
prompt
:
PromptType
|
ProcessorInputs
):
def
_extract_prompt_components
(
self
,
prompt
:
PromptType
|
ProcessorInputs
):
return
extract_prompt_components
(
self
.
model_config
,
prompt
)
return
extract_prompt_components
(
self
.
model_config
,
prompt
)
...
@@ -935,109 +815,6 @@ class OpenAIServing:
...
@@ -935,109 +815,6 @@ class OpenAIServing:
def
_extract_prompt_len
(
self
,
prompt
:
ProcessorInputs
):
def
_extract_prompt_len
(
self
,
prompt
:
ProcessorInputs
):
return
extract_prompt_len
(
self
.
model_config
,
prompt
)
return
extract_prompt_len
(
self
.
model_config
,
prompt
)
async
def
_render_next_turn
(
self
,
request
:
ResponsesRequest
,
messages
:
list
[
ResponseInputOutputItem
],
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
,
tool_parser
:
Callable
[[
TokenizerLike
],
ToolParser
]
|
None
,
chat_template
:
str
|
None
,
chat_template_content_format
:
ChatTemplateContentFormatOption
,
):
new_messages
=
construct_input_messages
(
request_input
=
messages
,
)
_
,
engine_prompts
=
await
self
.
_preprocess_chat
(
request
,
new_messages
,
default_template
=
chat_template
,
default_template_content_format
=
chat_template_content_format
,
default_template_kwargs
=
None
,
tool_dicts
=
tool_dicts
,
tool_parser
=
tool_parser
,
)
return
engine_prompts
async
def
_generate_with_builtin_tools
(
self
,
request_id
:
str
,
engine_prompt
:
ProcessorInputs
,
sampling_params
:
SamplingParams
,
context
:
ConversationContext
,
lora_request
:
LoRARequest
|
None
=
None
,
priority
:
int
=
0
,
trace_headers
:
Mapping
[
str
,
str
]
|
None
=
None
,
):
max_model_len
=
self
.
model_config
.
max_model_len
orig_priority
=
priority
sub_request
=
0
while
True
:
# Ensure that each sub-request has a unique request id.
sub_request_id
=
f
"
{
request_id
}
_
{
sub_request
}
"
self
.
_log_inputs
(
sub_request_id
,
engine_prompt
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
generator
=
self
.
engine_client
.
generate
(
engine_prompt
,
sampling_params
,
sub_request_id
,
lora_request
=
lora_request
,
trace_headers
=
trace_headers
,
priority
=
priority
,
)
async
for
res
in
generator
:
context
.
append_output
(
res
)
# NOTE(woosuk): The stop condition is handled by the engine.
yield
context
if
not
context
.
need_builtin_tool_call
():
# The model did not ask for a tool call, so we're done.
break
# Call the tool and update the context with the result.
tool_output
=
await
context
.
call_tool
()
context
.
append_tool_output
(
tool_output
)
# TODO: uncomment this and enable tool output streaming
# yield context
# Create inputs for the next turn.
# Render the next prompt token ids and update sampling_params.
if
isinstance
(
context
,
(
HarmonyContext
,
StreamingHarmonyContext
)):
token_ids
=
context
.
render_for_completion
()
engine_prompt
=
token_inputs
(
token_ids
)
sampling_params
.
max_tokens
=
max_model_len
-
len
(
token_ids
)
elif
isinstance
(
context
,
ParsableContext
):
(
engine_prompt
,)
=
await
self
.
_render_next_turn
(
context
.
request
,
context
.
parser
.
response_messages
,
context
.
tool_dicts
,
context
.
tool_parser_cls
,
context
.
chat_template
,
context
.
chat_template_content_format
,
)
sampling_params
.
max_tokens
=
get_max_tokens
(
max_model_len
,
context
.
request
.
max_output_tokens
,
self
.
_extract_prompt_len
(
engine_prompt
),
self
.
default_sampling_params
,
# type: ignore
self
.
override_max_tokens
,
# type: ignore
)
# OPTIMIZATION
priority
=
orig_priority
-
1
sub_request
+=
1
def
_log_inputs
(
def
_log_inputs
(
self
,
self
,
request_id
:
str
,
request_id
:
str
,
...
...
vllm/entrypoints/openai/generate/api_router.py
View file @
59192dfd
...
@@ -80,6 +80,7 @@ async def init_generate_state(
...
@@ -80,6 +80,7 @@ async def init_generate_state(
OpenAIServingResponses
(
OpenAIServingResponses
(
engine_client
,
engine_client
,
state
.
openai_serving_models
,
state
.
openai_serving_models
,
state
.
openai_serving_render
,
request_logger
=
request_logger
,
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
chat_template_content_format
=
args
.
chat_template_content_format
,
...
@@ -157,6 +158,7 @@ async def init_generate_state(
...
@@ -157,6 +158,7 @@ async def init_generate_state(
ServingTokens
(
ServingTokens
(
engine_client
,
engine_client
,
state
.
openai_serving_models
,
state
.
openai_serving_models
,
state
.
openai_serving_render
,
request_logger
=
request_logger
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
args
.
return_tokens_as_token_ids
,
return_tokens_as_token_ids
=
args
.
return_tokens_as_token_ids
,
enable_prompt_tokens_details
=
args
.
enable_prompt_tokens_details
,
enable_prompt_tokens_details
=
args
.
enable_prompt_tokens_details
,
...
...
vllm/entrypoints/openai/responses/serving.py
View file @
59192dfd
...
@@ -5,11 +5,11 @@ import asyncio
...
@@ -5,11 +5,11 @@ import asyncio
import
time
import
time
import
uuid
import
uuid
from
collections
import
deque
from
collections
import
deque
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
,
Callable
,
Sequence
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
,
Callable
,
Mapping
,
Sequence
from
contextlib
import
AsyncExitStack
from
contextlib
import
AsyncExitStack
from
copy
import
copy
from
copy
import
copy
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
Final
from
typing
import
Any
,
Final
from
fastapi
import
Request
from
fastapi
import
Request
from
openai.types.responses
import
(
from
openai.types.responses
import
(
...
@@ -86,6 +86,7 @@ from vllm.entrypoints.openai.responses.protocol import (
...
@@ -86,6 +86,7 @@ from vllm.entrypoints.openai.responses.protocol import (
ResponseCompletedEvent
,
ResponseCompletedEvent
,
ResponseCreatedEvent
,
ResponseCreatedEvent
,
ResponseInProgressEvent
,
ResponseInProgressEvent
,
ResponseInputOutputItem
,
ResponseInputOutputMessage
,
ResponseInputOutputMessage
,
ResponseReasoningPartAddedEvent
,
ResponseReasoningPartAddedEvent
,
ResponseReasoningPartDoneEvent
,
ResponseReasoningPartDoneEvent
,
...
@@ -105,16 +106,19 @@ from vllm.entrypoints.openai.responses.utils import (
...
@@ -105,16 +106,19 @@ from vllm.entrypoints.openai.responses.utils import (
construct_tool_dicts
,
construct_tool_dicts
,
extract_tool_types
,
extract_tool_types
,
)
)
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.entrypoints.utils
import
get_max_tokens
from
vllm.exceptions
import
VLLMValidationError
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs.data
import
ProcessorInputs
,
token_inputs
from
vllm.inputs.data
import
ProcessorInputs
,
token_inputs
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
as
SampleLogprob
from
vllm.logprobs
import
Logprob
as
SampleLogprob
from
vllm.logprobs
import
SampleLogprobs
from
vllm.logprobs
import
SampleLogprobs
from
vllm.lora.request
import
LoRARequest
from
vllm.outputs
import
CompletionOutput
from
vllm.outputs
import
CompletionOutput
from
vllm.parser
import
ParserManager
from
vllm.parser
import
ParserManager
from
vllm.sampling_params
import
SamplingParams
,
StructuredOutputsParams
from
vllm.sampling_params
import
SamplingParams
,
StructuredOutputsParams
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tool_parsers
import
ToolParser
from
vllm.utils
import
random_uuid
from
vllm.utils
import
random_uuid
from
vllm.utils.collection_utils
import
as_list
from
vllm.utils.collection_utils
import
as_list
...
@@ -165,6 +169,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -165,6 +169,7 @@ class OpenAIServingResponses(OpenAIServing):
self
,
self
,
engine_client
:
EngineClient
,
engine_client
:
EngineClient
,
models
:
OpenAIServingModels
,
models
:
OpenAIServingModels
,
openai_serving_render
:
OpenAIServingRender
,
*
,
*
,
request_logger
:
RequestLogger
|
None
,
request_logger
:
RequestLogger
|
None
,
chat_template
:
str
|
None
,
chat_template
:
str
|
None
,
...
@@ -185,6 +190,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -185,6 +190,7 @@ class OpenAIServingResponses(OpenAIServing):
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
)
)
self
.
openai_serving_render
=
openai_serving_render
self
.
chat_template
=
chat_template
self
.
chat_template
=
chat_template
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
self
.
enable_log_outputs
=
enable_log_outputs
self
.
enable_log_outputs
=
enable_log_outputs
...
@@ -587,7 +593,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -587,7 +593,7 @@ class OpenAIServingResponses(OpenAIServing):
prev_response_output
=
prev_response
.
output
if
prev_response
else
None
,
prev_response_output
=
prev_response
.
output
if
prev_response
else
None
,
)
)
_
,
engine_prompts
=
await
self
.
_
preprocess_chat
(
_
,
engine_prompts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
request
,
request
,
messages
,
messages
,
default_template
=
self
.
chat_template
,
default_template
=
self
.
chat_template
,
...
@@ -598,6 +604,109 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -598,6 +604,109 @@ class OpenAIServingResponses(OpenAIServing):
)
)
return
messages
,
engine_prompts
return
messages
,
engine_prompts
async
def
_render_next_turn
(
self
,
request
:
ResponsesRequest
,
messages
:
list
[
ResponseInputOutputItem
],
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
,
tool_parser
:
Callable
[[
TokenizerLike
],
ToolParser
]
|
None
,
chat_template
:
str
|
None
,
chat_template_content_format
:
ChatTemplateContentFormatOption
,
):
new_messages
=
construct_input_messages
(
request_input
=
messages
,
)
_
,
engine_prompts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
request
,
new_messages
,
default_template
=
chat_template
,
default_template_content_format
=
chat_template_content_format
,
default_template_kwargs
=
None
,
tool_dicts
=
tool_dicts
,
tool_parser
=
tool_parser
,
)
return
engine_prompts
async
def
_generate_with_builtin_tools
(
self
,
request_id
:
str
,
engine_prompt
:
ProcessorInputs
,
sampling_params
:
SamplingParams
,
context
:
ConversationContext
,
lora_request
:
LoRARequest
|
None
=
None
,
priority
:
int
=
0
,
trace_headers
:
Mapping
[
str
,
str
]
|
None
=
None
,
):
max_model_len
=
self
.
model_config
.
max_model_len
orig_priority
=
priority
sub_request
=
0
while
True
:
# Ensure that each sub-request has a unique request id.
sub_request_id
=
f
"
{
request_id
}
_
{
sub_request
}
"
self
.
_log_inputs
(
sub_request_id
,
engine_prompt
,
params
=
sampling_params
,
lora_request
=
lora_request
,
)
generator
=
self
.
engine_client
.
generate
(
engine_prompt
,
sampling_params
,
sub_request_id
,
lora_request
=
lora_request
,
trace_headers
=
trace_headers
,
priority
=
priority
,
)
async
for
res
in
generator
:
context
.
append_output
(
res
)
# NOTE(woosuk): The stop condition is handled by the engine.
yield
context
if
not
context
.
need_builtin_tool_call
():
# The model did not ask for a tool call, so we're done.
break
# Call the tool and update the context with the result.
tool_output
=
await
context
.
call_tool
()
context
.
append_tool_output
(
tool_output
)
# TODO: uncomment this and enable tool output streaming
# yield context
# Create inputs for the next turn.
# Render the next prompt token ids and update sampling_params.
if
isinstance
(
context
,
(
HarmonyContext
,
StreamingHarmonyContext
)):
token_ids
=
context
.
render_for_completion
()
engine_prompt
=
token_inputs
(
token_ids
)
sampling_params
.
max_tokens
=
max_model_len
-
len
(
token_ids
)
elif
isinstance
(
context
,
ParsableContext
):
(
engine_prompt
,)
=
await
self
.
_render_next_turn
(
context
.
request
,
context
.
parser
.
response_messages
,
context
.
tool_dicts
,
context
.
tool_parser_cls
,
context
.
chat_template
,
context
.
chat_template_content_format
,
)
sampling_params
.
max_tokens
=
get_max_tokens
(
max_model_len
,
context
.
request
.
max_output_tokens
,
self
.
_extract_prompt_len
(
engine_prompt
),
self
.
default_sampling_params
,
# type: ignore
self
.
override_max_tokens
,
# type: ignore
)
# OPTIMIZATION
priority
=
orig_priority
-
1
sub_request
+=
1
def
_make_request_with_harmony
(
def
_make_request_with_harmony
(
self
,
self
,
request
:
ResponsesRequest
,
request
:
ResponsesRequest
,
...
...
vllm/entrypoints/pooling/__init__.py
View file @
59192dfd
...
@@ -68,6 +68,7 @@ def init_pooling_state(
...
@@ -68,6 +68,7 @@ def init_pooling_state(
OpenAIServingPooling
(
OpenAIServingPooling
(
engine_client
,
engine_client
,
state
.
openai_serving_models
,
state
.
openai_serving_models
,
state
.
openai_serving_render
,
request_logger
=
request_logger
,
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
chat_template_content_format
=
args
.
chat_template_content_format
,
...
...
vllm/entrypoints/pooling/pooling/serving.py
View file @
59192dfd
...
@@ -32,6 +32,7 @@ from vllm.entrypoints.pooling.utils import (
...
@@ -32,6 +32,7 @@ from vllm.entrypoints.pooling.utils import (
encode_pooling_output_base64
,
encode_pooling_output_base64
,
encode_pooling_output_float
,
encode_pooling_output_float
,
)
)
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.inputs
import
ProcessorInputs
from
vllm.inputs
import
ProcessorInputs
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.outputs
import
PoolingRequestOutput
from
vllm.outputs
import
PoolingRequestOutput
...
@@ -47,6 +48,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -47,6 +48,7 @@ class OpenAIServingPooling(OpenAIServing):
self
,
self
,
engine_client
:
EngineClient
,
engine_client
:
EngineClient
,
models
:
OpenAIServingModels
,
models
:
OpenAIServingModels
,
openai_serving_render
:
OpenAIServingRender
,
*
,
*
,
request_logger
:
RequestLogger
|
None
,
request_logger
:
RequestLogger
|
None
,
chat_template
:
str
|
None
,
chat_template
:
str
|
None
,
...
@@ -59,6 +61,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -59,6 +61,7 @@ class OpenAIServingPooling(OpenAIServing):
request_logger
=
request_logger
,
request_logger
=
request_logger
,
)
)
self
.
openai_serving_render
=
openai_serving_render
self
.
chat_template
=
chat_template
self
.
chat_template
=
chat_template
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
self
.
trust_request_chat_template
=
trust_request_chat_template
self
.
trust_request_chat_template
=
trust_request_chat_template
...
@@ -101,12 +104,12 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -101,12 +104,12 @@ class OpenAIServingPooling(OpenAIServing):
raw_prompts
=
await
self
.
io_processor
.
pre_process_async
(
raw_prompts
=
await
self
.
io_processor
.
pre_process_async
(
prompt
=
validated_prompt
,
request_id
=
request_id
prompt
=
validated_prompt
,
request_id
=
request_id
)
)
engine_prompts
=
await
self
.
_
preprocess_cmpl
(
engine_prompts
=
await
self
.
openai_serving_render
.
preprocess_cmpl
(
request
,
request
,
prompt_to_seq
(
raw_prompts
),
prompt_to_seq
(
raw_prompts
),
)
)
elif
isinstance
(
request
,
PoolingChatRequest
):
elif
isinstance
(
request
,
PoolingChatRequest
):
error_check_ret
=
self
.
_
validate_chat_template
(
error_check_ret
=
self
.
openai_serving_render
.
validate_chat_template
(
request_chat_template
=
request
.
chat_template
,
request_chat_template
=
request
.
chat_template
,
chat_template_kwargs
=
request
.
chat_template_kwargs
,
chat_template_kwargs
=
request
.
chat_template_kwargs
,
trust_request_chat_template
=
self
.
trust_request_chat_template
,
trust_request_chat_template
=
self
.
trust_request_chat_template
,
...
@@ -114,7 +117,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -114,7 +117,7 @@ class OpenAIServingPooling(OpenAIServing):
if
error_check_ret
is
not
None
:
if
error_check_ret
is
not
None
:
return
error_check_ret
return
error_check_ret
_
,
engine_prompts
=
await
self
.
_
preprocess_chat
(
_
,
engine_prompts
=
await
self
.
openai_serving_render
.
preprocess_chat
(
request
,
request
,
request
.
messages
,
request
.
messages
,
default_template
=
self
.
chat_template
,
default_template
=
self
.
chat_template
,
...
@@ -122,7 +125,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -122,7 +125,7 @@ class OpenAIServingPooling(OpenAIServing):
default_template_kwargs
=
None
,
default_template_kwargs
=
None
,
)
)
elif
isinstance
(
request
,
PoolingCompletionRequest
):
elif
isinstance
(
request
,
PoolingCompletionRequest
):
engine_prompts
=
await
self
.
_
preprocess_completion
(
engine_prompts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
request
,
request
,
prompt_input
=
request
.
input
,
prompt_input
=
request
.
input
,
prompt_embeds
=
None
,
prompt_embeds
=
None
,
...
...
vllm/entrypoints/serve/disagg/serving.py
View file @
59192dfd
...
@@ -29,6 +29,7 @@ from vllm.entrypoints.serve.disagg.protocol import (
...
@@ -29,6 +29,7 @@ from vllm.entrypoints.serve.disagg.protocol import (
GenerateResponse
,
GenerateResponse
,
GenerateResponseChoice
,
GenerateResponseChoice
,
)
)
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
...
@@ -45,6 +46,7 @@ class ServingTokens(OpenAIServing):
...
@@ -45,6 +46,7 @@ class ServingTokens(OpenAIServing):
self
,
self
,
engine_client
:
EngineClient
,
engine_client
:
EngineClient
,
models
:
OpenAIServingModels
,
models
:
OpenAIServingModels
,
openai_serving_render
:
OpenAIServingRender
,
*
,
*
,
request_logger
:
RequestLogger
|
None
,
request_logger
:
RequestLogger
|
None
,
force_no_detokenize
:
bool
=
False
,
force_no_detokenize
:
bool
=
False
,
...
@@ -58,6 +60,7 @@ class ServingTokens(OpenAIServing):
...
@@ -58,6 +60,7 @@ class ServingTokens(OpenAIServing):
request_logger
=
request_logger
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
)
)
self
.
openai_serving_render
=
openai_serving_render
self
.
enable_prompt_tokens_details
=
enable_prompt_tokens_details
self
.
enable_prompt_tokens_details
=
enable_prompt_tokens_details
self
.
enable_log_outputs
=
enable_log_outputs
self
.
enable_log_outputs
=
enable_log_outputs
self
.
force_no_detokenize
=
force_no_detokenize
self
.
force_no_detokenize
=
force_no_detokenize
...
@@ -96,7 +99,7 @@ class ServingTokens(OpenAIServing):
...
@@ -96,7 +99,7 @@ class ServingTokens(OpenAIServing):
if
raw_request
:
if
raw_request
:
raw_request
.
state
.
request_metadata
=
request_metadata
raw_request
.
state
.
request_metadata
=
request_metadata
engine_prompts
=
await
self
.
_
preprocess_completion
(
engine_prompts
=
await
self
.
openai_serving_render
.
preprocess_completion
(
request
,
request
,
prompt_input
=
request
.
token_ids
,
prompt_input
=
request
.
token_ids
,
prompt_embeds
=
None
,
prompt_embeds
=
None
,
...
...
vllm/entrypoints/serve/render/serving.py
View file @
59192dfd
...
@@ -24,6 +24,7 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
...
@@ -24,6 +24,7 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
parse_chat_inputs_to_harmony_messages
,
parse_chat_inputs_to_harmony_messages
,
render_for_completion
,
render_for_completion
,
)
)
from
vllm.entrypoints.openai.responses.protocol
import
ResponsesRequest
from
vllm.entrypoints.serve.disagg.protocol
import
(
from
vllm.entrypoints.serve.disagg.protocol
import
(
GenerateRequest
,
GenerateRequest
,
MultiModalFeatures
,
MultiModalFeatures
,
...
@@ -459,9 +460,9 @@ class OpenAIServingRender:
...
@@ -459,9 +460,9 @@ class OpenAIServingRender:
prompts
.
extend
(
prompt_to_seq
(
prompt_embeds
))
prompts
.
extend
(
prompt_to_seq
(
prompt_embeds
))
if
prompt_input
is
not
None
:
if
prompt_input
is
not
None
:
prompts
.
extend
(
prompt_to_seq
(
prompt_input
))
prompts
.
extend
(
prompt_to_seq
(
prompt_input
))
return
await
self
.
_
preprocess_cmpl
(
request
,
prompts
)
return
await
self
.
preprocess_cmpl
(
request
,
prompts
)
async
def
_
preprocess_cmpl
(
async
def
preprocess_cmpl
(
self
,
self
,
request
:
Any
,
request
:
Any
,
prompts
:
Sequence
[
PromptType
|
bytes
],
prompts
:
Sequence
[
PromptType
|
bytes
],
...
@@ -500,11 +501,7 @@ class OpenAIServingRender:
...
@@ -500,11 +501,7 @@ class OpenAIServingRender:
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tool_dicts
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
tool_parser
:
Callable
[[
TokenizerLike
],
ToolParser
]
|
None
=
None
,
tool_parser
:
Callable
[[
TokenizerLike
],
ToolParser
]
|
None
=
None
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]:
"""Copied from OpenAIServing._preprocess_chat.
"""Copied from OpenAIServing._preprocess_chat."""
Differences: isinstance check is ChatCompletionRequest-only
(ResponsesRequest not supported here); TODO comment dropped accordingly.
"""
renderer
=
self
.
renderer
renderer
=
self
.
renderer
mm_config
=
self
.
model_config
.
multimodal_config
mm_config
=
self
.
model_config
.
multimodal_config
...
@@ -542,11 +539,11 @@ class OpenAIServingRender:
...
@@ -542,11 +539,11 @@ class OpenAIServingRender:
if
tool_parser
is
not
None
:
if
tool_parser
is
not
None
:
tool_choice
=
getattr
(
request
,
"tool_choice"
,
"none"
)
tool_choice
=
getattr
(
request
,
"tool_choice"
,
"none"
)
if
tool_choice
!=
"none"
:
if
tool_choice
!=
"none"
:
if
not
isinstance
(
request
,
ChatCompletionRequest
):
if
not
isinstance
(
request
,
ChatCompletionRequest
|
ResponsesRequest
):
msg
=
(
msg
=
(
"Tool usage is only supported "
"Tool usage is only supported "
"
for ChatCompletion
Request, but got
"
"for Chat
Completion
s API or Responses API requests,
"
f
"
{
type
(
request
).
__name__
}
"
f
"
but got
{
type
(
request
).
__name__
}
"
)
)
raise
NotImplementedError
(
msg
)
raise
NotImplementedError
(
msg
)
tokenizer
=
renderer
.
get_tokenizer
()
tokenizer
=
renderer
.
get_tokenizer
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment