Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a2268617
Unverified
Commit
a2268617
authored
Mar 13, 2026
by
Sage
Committed by
GitHub
Mar 13, 2026
Browse files
[Frontend] Delegate preprocessing to `OpenAIServingRender` (#36483)
Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
parent
a4ad9db5
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing 10 changed files with 203 additions and 196 deletions
+203
-196
tests/entrypoints/openai/test_chat_error.py
tests/entrypoints/openai/test_chat_error.py
+14
-1
tests/entrypoints/openai/test_completion_error.py
tests/entrypoints/openai/test_completion_error.py
+11
-0
tests/entrypoints/openai/test_lora_resolvers.py
tests/entrypoints/openai/test_lora_resolvers.py
+11
-1
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+68
-14
tests/v1/engine/test_async_llm.py
tests/v1/engine/test_async_llm.py
+14
-0
vllm/entrypoints/anthropic/serving.py
vllm/entrypoints/anthropic/serving.py
+6
-1
vllm/entrypoints/openai/chat_completion/serving.py
vllm/entrypoints/openai/chat_completion/serving.py
+12
-128
vllm/entrypoints/openai/completion/serving.py
vllm/entrypoints/openai/completion/serving.py
+11
-21
vllm/entrypoints/openai/generate/api_router.py
vllm/entrypoints/openai/generate/api_router.py
+26
-21
vllm/entrypoints/serve/render/serving.py
vllm/entrypoints/serve/render/serving.py
+30
-9
No files found.
tests/entrypoints/openai/test_chat_error.py
View file @
a2268617
...
...
@@ -13,6 +13,7 @@ from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from
vllm.entrypoints.openai.engine.protocol
import
GenerationError
from
vllm.entrypoints.openai.models.protocol
import
BaseModelPath
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.renderers.hf
import
HfRenderer
from
vllm.tokenizers.registry
import
tokenizer_args_from_config
...
...
@@ -84,10 +85,20 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
engine_client
=
engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
serving_render
=
OpenAIServingRender
(
model_config
=
engine
.
model_config
,
renderer
=
engine
.
renderer
,
io_processor
=
engine
.
io_processor
,
model_registry
=
models
.
registry
,
request_logger
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
)
serving_chat
=
OpenAIServingChat
(
engine
,
models
,
response_role
=
"assistant"
,
openai_serving_render
=
serving_render
,
request_logger
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
...
...
@@ -100,7 +111,9 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
[{
"prompt_token_ids"
:
[
1
,
2
,
3
]}],
)
serving_chat
.
_preprocess_chat
=
AsyncMock
(
side_effect
=
_fake_preprocess_chat
)
serving_chat
.
openai_serving_render
.
_preprocess_chat
=
AsyncMock
(
side_effect
=
_fake_preprocess_chat
)
return
serving_chat
...
...
tests/entrypoints/openai/test_completion_error.py
View file @
a2268617
...
...
@@ -13,6 +13,7 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from
vllm.entrypoints.openai.engine.protocol
import
GenerationError
from
vllm.entrypoints.openai.models.protocol
import
BaseModelPath
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.renderers.hf
import
HfRenderer
from
vllm.tokenizers.registry
import
tokenizer_args_from_config
...
...
@@ -74,9 +75,19 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
engine_client
=
engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
serving_render
=
OpenAIServingRender
(
model_config
=
engine
.
model_config
,
renderer
=
engine
.
renderer
,
io_processor
=
engine
.
io_processor
,
model_registry
=
models
.
registry
,
request_logger
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
)
return
OpenAIServingCompletion
(
engine
,
models
,
openai_serving_render
=
serving_render
,
request_logger
=
None
,
)
...
...
tests/entrypoints/openai/test_lora_resolvers.py
View file @
a2268617
...
...
@@ -14,6 +14,7 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.models.protocol
import
BaseModelPath
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.resolver
import
LoRAResolver
,
LoRAResolverRegistry
from
vllm.renderers.hf
import
HfRenderer
...
...
@@ -145,8 +146,17 @@ def mock_serving_setup():
base_model_paths
=
BASE_MODEL_PATHS
,
)
serving_render
=
OpenAIServingRender
(
model_config
=
mock_engine
.
model_config
,
renderer
=
mock_engine
.
renderer
,
io_processor
=
mock_engine
.
io_processor
,
model_registry
=
models
.
registry
,
request_logger
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
)
serving_completion
=
OpenAIServingCompletion
(
mock_engine
,
models
,
request_logger
=
None
mock_engine
,
models
,
openai_serving_render
=
serving_render
,
request_logger
=
None
)
return
mock_engine
,
serving_completion
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
a2268617
...
...
@@ -21,8 +21,13 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse
,
RequestResponseMetadata
,
)
from
vllm.entrypoints.openai.models.serving
import
BaseModelPath
,
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
(
BaseModelPath
,
OpenAIModelRegistry
,
OpenAIServingModels
,
)
from
vllm.entrypoints.openai.parser.harmony_utils
import
get_encoding
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs
import
TokensPrompt
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
...
...
@@ -557,15 +562,32 @@ def _build_renderer(model_config: MockModelConfig):
)
def _build_serving_render(
    engine, model_registry: OpenAIModelRegistry
) -> OpenAIServingRender:
    """Create an ``OpenAIServingRender`` wired to ``engine`` for these tests.

    The model config, renderer and IO processor are read from ``engine``;
    ``model_registry`` is forwarded unchanged. Request logging is disabled,
    and the module-level ``CHAT_TEMPLATE`` is used with the ``"auto"``
    content format.
    """
    # Read the engine attributes in the same order the constructor call
    # would evaluate them.
    model_config = engine.model_config
    renderer = engine.renderer
    io_processor = engine.io_processor

    serving_render = OpenAIServingRender(
        model_config=model_config,
        renderer=renderer,
        io_processor=io_processor,
        model_registry=model_registry,
        request_logger=None,
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
    )
    return serving_render
def
_build_serving_chat
(
engine
:
AsyncLLM
)
->
OpenAIServingChat
:
models
=
OpenAIServingModels
(
engine_client
=
engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
openai_serving_render
=
_build_serving_render
(
engine
,
models
.
registry
)
serving_chat
=
OpenAIServingChat
(
engine
,
models
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
...
...
@@ -586,10 +608,13 @@ async def _async_serving_chat_init():
engine
=
MockEngine
()
models
=
OpenAIServingModels
(
engine
,
BASE_MODEL_PATHS
)
openai_serving_render
=
_build_serving_render
(
engine
,
models
.
registry
)
serving_completion
=
OpenAIServingChat
(
engine
,
models
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
...
...
@@ -1182,7 +1207,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -1209,7 +1236,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -1230,7 +1259,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -1274,7 +1305,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -1311,7 +1344,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -1355,7 +1390,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -1392,7 +1429,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -1436,7 +1475,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -1486,7 +1527,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the third turn's input
req_3
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_3
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_3
)
input_messages_3
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_3
)
)
verify_harmony_messages
(
input_messages_3
,
[
...
...
@@ -1549,7 +1592,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the fourth turn's input
req_4
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_4
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_4
)
input_messages_4
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_4
)
)
verify_harmony_messages
(
input_messages_4
,
[
...
...
@@ -1598,7 +1643,9 @@ class TestServingChatWithHarmony:
},
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
...
...
@@ -1629,7 +1676,9 @@ class TestServingChatWithHarmony:
},
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
...
...
@@ -1658,7 +1707,9 @@ class TestServingChatWithHarmony:
},
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
...
...
@@ -1689,11 +1740,14 @@ async def test_tool_choice_validation_without_parser():
engine_client
=
mock_engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
openai_serving_render
=
_build_serving_render
(
mock_engine
,
models
.
registry
)
# Create serving_chat without tool_parser (enable_auto_tools=False)
serving_chat
=
OpenAIServingChat
(
mock_engine
,
models
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
...
...
tests/v1/engine/test_async_llm.py
View file @
a2268617
...
...
@@ -508,11 +508,25 @@ async def test_header_dp_rank_argument():
base_model_paths
=
BASE_MODEL_PATHS
,
)
# Create render serving instance (required by OpenAIServingChat)
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
serving_render
=
OpenAIServingRender
(
model_config
=
engine
.
model_config
,
renderer
=
engine
.
renderer
,
io_processor
=
engine
.
io_processor
,
model_registry
=
models
.
registry
,
request_logger
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
)
# Create serving chat instance
serving_chat
=
OpenAIServingChat
(
engine_client
=
engine
,
models
=
models
,
response_role
=
"assistant"
,
openai_serving_render
=
serving_render
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
...
...
vllm/entrypoints/anthropic/serving.py
View file @
a2268617
...
...
@@ -10,7 +10,7 @@ import logging
import
time
import
uuid
from
collections.abc
import
AsyncGenerator
from
typing
import
Any
from
typing
import
TYPE_CHECKING
,
Any
from
fastapi
import
Request
...
...
@@ -43,6 +43,9 @@ from vllm.entrypoints.openai.engine.protocol import (
)
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
if
TYPE_CHECKING
:
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -59,6 +62,7 @@ class AnthropicServingMessages(OpenAIServingChat):
models
:
OpenAIServingModels
,
response_role
:
str
,
*
,
openai_serving_render
:
"OpenAIServingRender"
,
request_logger
:
RequestLogger
|
None
,
chat_template
:
str
|
None
,
chat_template_content_format
:
ChatTemplateContentFormatOption
,
...
...
@@ -73,6 +77,7 @@ class AnthropicServingMessages(OpenAIServingChat):
engine_client
=
engine_client
,
models
=
models
,
response_role
=
response_role
,
openai_serving_render
=
openai_serving_render
,
request_logger
=
request_logger
,
chat_template
=
chat_template
,
chat_template_content_format
=
chat_template_content_format
,
...
...
vllm/entrypoints/openai/chat_completion/serving.py
View file @
a2268617
...
...
@@ -6,12 +6,11 @@ import json
import
time
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
Any
,
Final
from
typing
import
TYPE_CHECKING
,
Any
,
Final
import
partial_json_parser
import
regex
as
re
from
fastapi
import
Request
from
openai_harmony
import
Message
as
OpenAIMessage
from
partial_json_parser.core.options
import
Allow
from
vllm.engine.protocol
import
EngineClient
...
...
@@ -56,17 +55,13 @@ from vllm.entrypoints.openai.engine.serving import (
)
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.parser.harmony_utils
import
(
get_developer_message
,
get_stop_tokens_for_assistant_actions
,
get_streamable_parser_for_assistant
,
get_system_message
,
parse_chat_inputs_to_harmony_messages
,
parse_chat_output
,
render_for_completion
,
)
from
vllm.entrypoints.openai.utils
import
maybe_filter_parallel_tool_calls
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.inputs.data
import
ProcessorInputs
,
TokensPrompt
from
vllm.inputs.data
import
ProcessorInputs
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
...
...
@@ -80,7 +75,9 @@ from vllm.tool_parsers.mistral_tool_parser import MistralToolCall
from
vllm.tool_parsers.utils
import
partial_json_loads
from
vllm.utils.collection_utils
import
as_list
from
vllm.utils.mistral
import
is_mistral_tokenizer
from
vllm.utils.mistral
import
mt
as
_mt
if
TYPE_CHECKING
:
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
logger
=
init_logger
(
__name__
)
...
...
@@ -92,6 +89,7 @@ class OpenAIServingChat(OpenAIServing):
models
:
OpenAIServingModels
,
response_role
:
str
,
*
,
openai_serving_render
:
"OpenAIServingRender"
,
request_logger
:
RequestLogger
|
None
,
chat_template
:
str
|
None
,
chat_template_content_format
:
ChatTemplateContentFormatOption
,
...
...
@@ -114,6 +112,7 @@ class OpenAIServingChat(OpenAIServing):
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
)
self
.
openai_serving_render
=
openai_serving_render
self
.
response_role
=
response_role
self
.
chat_template
=
chat_template
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
...
...
@@ -186,7 +185,10 @@ class OpenAIServingChat(OpenAIServing):
request
:
ChatCompletionRequest
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]
|
ErrorResponse
:
"""
render chat request by validating and preprocessing inputs.
Validate the model and preprocess a chat completion request.
Delegates preprocessing logic to OpenAIServingRender, adding the
engine-aware checks (LoRA model validation, engine health).
Returns:
A tuple of (conversation, engine_prompts) on success,
...
...
@@ -203,78 +205,7 @@ class OpenAIServingChat(OpenAIServing):
if
self
.
engine_client
.
errored
:
raise
self
.
engine_client
.
dead_error
tokenizer
=
self
.
renderer
.
tokenizer
tool_parser
=
self
.
tool_parser
if
is_mistral_tokenizer
(
tokenizer
):
# because of issues with pydantic we need to potentially
# re-serialize the tool_calls field of the request
# for more info: see comment in `maybe_serialize_tool_calls`
_mt
.
maybe_serialize_tool_calls
(
request
)
# type: ignore[arg-type]
_mt
.
truncate_tool_call_ids
(
request
)
# type: ignore[arg-type]
_mt
.
validate_request_params
(
request
)
# Check if tool parsing is unavailable (common condition)
tool_parsing_unavailable
=
(
tool_parser
is
None
and
not
is_mistral_tokenizer
(
tokenizer
)
and
not
self
.
use_harmony
)
# Validate tool_choice when tool parsing is required but unavailable
if
tool_parsing_unavailable
and
request
.
tool_choice
not
in
(
None
,
"none"
,
):
if
request
.
tool_choice
==
"auto"
and
not
self
.
enable_auto_tools
:
# for hf tokenizers, "auto" tools requires
# --enable-auto-tool-choice and --tool-call-parser
return
self
.
create_error_response
(
'"auto" tool choice requires '
"--enable-auto-tool-choice and --tool-call-parser to be set"
)
elif
request
.
tool_choice
!=
"auto"
:
# "required" or named tool requires tool parser
return
self
.
create_error_response
(
f
'tool_choice="
{
request
.
tool_choice
}
" requires '
"--tool-call-parser to be set"
)
if
request
.
tools
is
None
or
(
request
.
tool_choice
==
"none"
and
self
.
exclude_tools_when_tool_choice_none
):
tool_dicts
=
None
else
:
tool_dicts
=
[
tool
.
model_dump
()
for
tool
in
request
.
tools
]
if
not
self
.
use_harmony
:
# Common case.
error_check_ret
=
self
.
_validate_chat_template
(
request_chat_template
=
request
.
chat_template
,
chat_template_kwargs
=
request
.
chat_template_kwargs
,
trust_request_chat_template
=
self
.
trust_request_chat_template
,
)
if
error_check_ret
is
not
None
:
return
error_check_ret
conversation
,
engine_prompts
=
await
self
.
_preprocess_chat
(
request
,
request
.
messages
,
default_template
=
self
.
chat_template
,
default_template_content_format
=
self
.
chat_template_content_format
,
default_template_kwargs
=
self
.
default_chat_template_kwargs
,
tool_dicts
=
tool_dicts
,
tool_parser
=
tool_parser
,
)
else
:
# For GPT-OSS.
should_include_tools
=
tool_dicts
is
not
None
conversation
,
engine_prompts
=
self
.
_make_request_with_harmony
(
request
,
should_include_tools
)
return
conversation
,
engine_prompts
return
await
self
.
openai_serving_render
.
render_chat
(
request
)
async
def
create_chat_completion
(
self
,
...
...
@@ -1875,50 +1806,3 @@ class OpenAIServingChat(OpenAIServing):
)
]
)
def _make_request_with_harmony(
    self,
    request: ChatCompletionRequest,
    should_include_tools: bool = True,
):
    """Build the Harmony message list and engine prompt for a chat request.

    Args:
        request: The chat completion request to convert.
        should_include_tools: Whether custom tools are advertised in the
            system message and forwarded to the developer message.

    Returns:
        A tuple ``(messages, [engine_prompt])``: the Harmony messages that
        were rendered, and a one-element list holding the tokens prompt
        (with ``cache_salt`` set when the request provides one).

    Raises:
        ValueError: If ``request.reasoning_effort`` is ``"none"``.
    """
    # because of issues with pydantic we need to potentially
    # re-serialize the tool_calls field of the request
    # for more info: see comment in `maybe_serialize_tool_calls`
    _mt.maybe_serialize_tool_calls(request)  # type: ignore[arg-type]

    # NOTE: In Chat Completion API, browsing is enabled by default
    # if the model supports it. TODO: Support browsing.
    assert not self.supports_browsing
    assert not self.supports_code_interpreter

    reasoning_effort = request.reasoning_effort
    if reasoning_effort == "none":
        raise ValueError(f"Harmony does not support {reasoning_effort=}")

    # The system message always comes first.
    harmony_messages: list[OpenAIMessage] = [
        get_system_message(
            reasoning_effort=reasoning_effort,
            browser_description=None,
            python_description=None,
            with_custom_tools=should_include_tools,
        )
    ]

    # A developer message is added whenever the request carries tools; the
    # tool definitions themselves are only passed when they should be shown.
    if request.tools:
        developer_tools = request.tools if should_include_tools else None
        harmony_messages.append(
            get_developer_message(
                tools=developer_tools  # type: ignore[arg-type]
            )
        )

    # Conversation messages supplied by the caller.
    harmony_messages += parse_chat_inputs_to_harmony_messages(request.messages)

    # Render prompt token ids.
    engine_prompt = TokensPrompt(
        prompt_token_ids=render_for_completion(harmony_messages)
    )

    # Add cache_salt if provided in the request
    salt = request.cache_salt
    if salt is not None:
        engine_prompt["cache_salt"] = salt

    return harmony_messages, [engine_prompt]
vllm/entrypoints/openai/completion/serving.py
View file @
a2268617
...
...
@@ -5,7 +5,7 @@ import asyncio
import
time
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
cast
from
typing
import
TYPE_CHECKING
,
cast
from
fastapi
import
Request
...
...
@@ -42,6 +42,9 @@ from vllm.tokenizers import TokenizerLike
from
vllm.utils.async_utils
import
merge_async_iterators
from
vllm.utils.collection_utils
import
as_list
if
TYPE_CHECKING
:
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
logger
=
init_logger
(
__name__
)
...
...
@@ -51,6 +54,7 @@ class OpenAIServingCompletion(OpenAIServing):
engine_client
:
EngineClient
,
models
:
OpenAIServingModels
,
*
,
openai_serving_render
:
"OpenAIServingRender"
,
request_logger
:
RequestLogger
|
None
,
return_tokens_as_token_ids
:
bool
=
False
,
enable_prompt_tokens_details
:
bool
=
False
,
...
...
@@ -63,6 +67,7 @@ class OpenAIServingCompletion(OpenAIServing):
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
)
self
.
openai_serving_render
=
openai_serving_render
self
.
enable_prompt_tokens_details
=
enable_prompt_tokens_details
self
.
enable_force_include_usage
=
enable_force_include_usage
...
...
@@ -79,7 +84,10 @@ class OpenAIServingCompletion(OpenAIServing):
request
:
CompletionRequest
,
)
->
list
[
ProcessorInputs
]
|
ErrorResponse
:
"""
render completion request by validating and preprocessing inputs.
Validate the model and preprocess a completion request.
Delegates preprocessing logic to OpenAIServingRender, adding the
engine-aware checks (LoRA model validation, engine health).
Returns:
A list of engine_prompts on success,
...
...
@@ -95,25 +103,7 @@ class OpenAIServingCompletion(OpenAIServing):
if
self
.
engine_client
.
errored
:
raise
self
.
engine_client
.
dead_error
# Return error for unsupported features.
if
request
.
suffix
is
not
None
:
return
self
.
create_error_response
(
"suffix is not currently supported"
)
if
request
.
echo
and
request
.
prompt_embeds
is
not
None
:
return
self
.
create_error_response
(
"Echo is unsupported with prompt embeds."
)
if
request
.
prompt_logprobs
is
not
None
and
request
.
prompt_embeds
is
not
None
:
return
self
.
create_error_response
(
"prompt_logprobs is not compatible with prompt embeds."
)
engine_prompts
=
await
self
.
_preprocess_completion
(
request
,
prompt_input
=
request
.
prompt
,
prompt_embeds
=
request
.
prompt_embeds
,
)
return
engine_prompts
return
await
self
.
openai_serving_render
.
render_completion
(
request
)
async
def
create_completion
(
self
,
...
...
vllm/entrypoints/openai/generate/api_router.py
View file @
a2268617
...
...
@@ -72,6 +72,29 @@ async def init_generate_state(
tool_server
=
None
resolved_chat_template
=
load_chat_template
(
args
.
chat_template
)
# Render endpoints are always backed by OpenAIServingRender so that
# /v1/chat/completions/render and /v1/completions/render work on both
# generate-mode and render-only servers.
# It is created first so that OpenAIServingChat and OpenAIServingCompletion
# can delegate their preprocessing logic to it.
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
state
.
openai_serving_render
=
OpenAIServingRender
(
model_config
=
engine_client
.
model_config
,
renderer
=
engine_client
.
renderer
,
io_processor
=
engine_client
.
io_processor
,
model_registry
=
state
.
openai_serving_models
.
registry
,
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
trust_request_chat_template
=
args
.
trust_request_chat_template
,
enable_auto_tools
=
args
.
enable_auto_tool_choice
,
exclude_tools_when_tool_choice_none
=
args
.
exclude_tools_when_tool_choice_none
,
tool_parser
=
args
.
tool_call_parser
,
default_chat_template_kwargs
=
args
.
default_chat_template_kwargs
,
log_error_stack
=
args
.
log_error_stack
,
)
state
.
openai_serving_responses
=
(
OpenAIServingResponses
(
engine_client
,
...
...
@@ -96,6 +119,7 @@ async def init_generate_state(
engine_client
,
state
.
openai_serving_models
,
args
.
response_role
,
openai_serving_render
=
state
.
openai_serving_render
,
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
...
...
@@ -120,6 +144,7 @@ async def init_generate_state(
OpenAIServingCompletion
(
engine_client
,
state
.
openai_serving_models
,
openai_serving_render
=
state
.
openai_serving_render
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
args
.
return_tokens_as_token_ids
,
enable_prompt_tokens_details
=
args
.
enable_prompt_tokens_details
,
...
...
@@ -133,6 +158,7 @@ async def init_generate_state(
engine_client
,
state
.
openai_serving_models
,
args
.
response_role
,
openai_serving_render
=
state
.
openai_serving_render
,
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
...
...
@@ -159,24 +185,3 @@ async def init_generate_state(
if
"generate"
in
supported_tasks
else
None
)
# Render endpoints are always backed by OpenAIServingRender so that
# /v1/chat/completions/render and /v1/completions/render work on both
# generate-mode and render-only servers.
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
state
.
openai_serving_render
=
OpenAIServingRender
(
model_config
=
engine_client
.
model_config
,
renderer
=
engine_client
.
renderer
,
io_processor
=
engine_client
.
io_processor
,
model_registry
=
state
.
openai_serving_models
.
registry
,
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
trust_request_chat_template
=
args
.
trust_request_chat_template
,
enable_auto_tools
=
args
.
enable_auto_tool_choice
,
exclude_tools_when_tool_choice_none
=
args
.
exclude_tools_when_tool_choice_none
,
tool_parser
=
args
.
tool_call_parser
,
default_chat_template_kwargs
=
args
.
default_chat_template_kwargs
,
log_error_stack
=
args
.
log_error_stack
,
)
vllm/entrypoints/serve/render/serving.py
View file @
a2268617
...
...
@@ -87,15 +87,26 @@ class OpenAIServingRender:
self
,
request
:
ChatCompletionRequest
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]
|
ErrorResponse
:
"""
Copied from OpenAIServingChat.render_chat_
request.
"""
Validate the model and preprocess a chat completion
request.
Differences: engine_client.errored check removed (no engine client).
This is the authoritative implementation used directly by the
GPU-less render server and delegated to by OpenAIServingChat.
"""
error_check_ret
=
await
self
.
_check_model
(
request
)
if
error_check_ret
is
not
None
:
logger
.
error
(
"Error with model %s"
,
error_check_ret
)
return
error_check_ret
return
await
self
.
render_chat
(
request
)
async
def
render_chat
(
self
,
request
:
ChatCompletionRequest
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]
|
ErrorResponse
:
"""Core preprocessing logic for chat requests (no model/engine check).
Called directly by render_chat_request and delegated to by
OpenAIServingChat.render_chat_request after its engine-aware checks.
"""
tokenizer
=
self
.
renderer
.
tokenizer
tool_parser
=
self
.
tool_parser
...
...
@@ -173,14 +184,25 @@ class OpenAIServingRender:
self
,
request
:
CompletionRequest
,
)
->
list
[
ProcessorInputs
]
|
ErrorResponse
:
"""
Copied from OpenAIServingCompletion.render_
completion
_
request.
"""
Validate the model and preprocess a
completion
request.
Differences: engine_client.errored check removed (no engine client).
This is the authoritative implementation used directly by the
GPU-less render server and delegated to by OpenAIServingCompletion.
"""
error_check_ret
=
await
self
.
_check_model
(
request
)
if
error_check_ret
is
not
None
:
return
error_check_ret
return
await
self
.
render_completion
(
request
)
async
def
render_completion
(
self
,
request
:
CompletionRequest
,
)
->
list
[
ProcessorInputs
]
|
ErrorResponse
:
"""Core preprocessing logic for completion requests (no model/engine check).
Called directly by render_completion_request and delegated to by
OpenAIServingCompletion.render_completion_request after its engine-aware checks.
"""
# Return error for unsupported features.
if
request
.
suffix
is
not
None
:
return
self
.
create_error_response
(
"suffix is not currently supported"
)
...
...
@@ -206,7 +228,7 @@ class OpenAIServingRender:
request
:
ChatCompletionRequest
,
should_include_tools
:
bool
=
True
,
):
"""
Copied from OpenAIServingChat._make_request_with_harmony
."""
"""
Build Harmony (GPT-OSS) messages and engine prompt from a chat request
."""
messages
:
list
[
OpenAIMessage
]
=
[]
# because of issues with pydantic we need to potentially
...
...
@@ -219,11 +241,10 @@ class OpenAIServingRender:
# if the model supports it. TODO: Support browsing.
assert
not
self
.
supports_browsing
assert
not
self
.
supports_code_interpreter
assert
request
.
reasoning_effort
!=
"none"
,
(
"Harmony does not support reasoning_effort='none'"
)
if
(
reasoning_effort
:
=
request
.
reasoning_effort
)
==
"none"
:
raise
ValueError
(
f
"Harmony does not support
{
reasoning_effort
=
}
"
)
sys_msg
=
get_system_message
(
reasoning_effort
=
request
.
reasoning_effort
,
reasoning_effort
=
reasoning_effort
,
browser_description
=
None
,
python_description
=
None
,
with_custom_tools
=
should_include_tools
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment