Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a2268617
Unverified
Commit
a2268617
authored
Mar 13, 2026
by
Sage
Committed by
GitHub
Mar 13, 2026
Browse files
[Frontend] Delegate preprocessing to `OpenAIServingRender` (#36483)
Signed-off-by:
Sage Ahrac
<
sagiahrak@gmail.com
>
parent
a4ad9db5
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
203 additions
and
196 deletions
+203
-196
tests/entrypoints/openai/test_chat_error.py
tests/entrypoints/openai/test_chat_error.py
+14
-1
tests/entrypoints/openai/test_completion_error.py
tests/entrypoints/openai/test_completion_error.py
+11
-0
tests/entrypoints/openai/test_lora_resolvers.py
tests/entrypoints/openai/test_lora_resolvers.py
+11
-1
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+68
-14
tests/v1/engine/test_async_llm.py
tests/v1/engine/test_async_llm.py
+14
-0
vllm/entrypoints/anthropic/serving.py
vllm/entrypoints/anthropic/serving.py
+6
-1
vllm/entrypoints/openai/chat_completion/serving.py
vllm/entrypoints/openai/chat_completion/serving.py
+12
-128
vllm/entrypoints/openai/completion/serving.py
vllm/entrypoints/openai/completion/serving.py
+11
-21
vllm/entrypoints/openai/generate/api_router.py
vllm/entrypoints/openai/generate/api_router.py
+26
-21
vllm/entrypoints/serve/render/serving.py
vllm/entrypoints/serve/render/serving.py
+30
-9
No files found.
tests/entrypoints/openai/test_chat_error.py
View file @
a2268617
...
@@ -13,6 +13,7 @@ from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
...
@@ -13,6 +13,7 @@ from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from
vllm.entrypoints.openai.engine.protocol
import
GenerationError
from
vllm.entrypoints.openai.engine.protocol
import
GenerationError
from
vllm.entrypoints.openai.models.protocol
import
BaseModelPath
from
vllm.entrypoints.openai.models.protocol
import
BaseModelPath
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.renderers.hf
import
HfRenderer
from
vllm.renderers.hf
import
HfRenderer
from
vllm.tokenizers.registry
import
tokenizer_args_from_config
from
vllm.tokenizers.registry
import
tokenizer_args_from_config
...
@@ -84,10 +85,20 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
...
@@ -84,10 +85,20 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
engine_client
=
engine
,
engine_client
=
engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
)
serving_render
=
OpenAIServingRender
(
model_config
=
engine
.
model_config
,
renderer
=
engine
.
renderer
,
io_processor
=
engine
.
io_processor
,
model_registry
=
models
.
registry
,
request_logger
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
)
serving_chat
=
OpenAIServingChat
(
serving_chat
=
OpenAIServingChat
(
engine
,
engine
,
models
,
models
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
openai_serving_render
=
serving_render
,
request_logger
=
None
,
request_logger
=
None
,
chat_template
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
...
@@ -100,7 +111,9 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
...
@@ -100,7 +111,9 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
[{
"prompt_token_ids"
:
[
1
,
2
,
3
]}],
[{
"prompt_token_ids"
:
[
1
,
2
,
3
]}],
)
)
serving_chat
.
_preprocess_chat
=
AsyncMock
(
side_effect
=
_fake_preprocess_chat
)
serving_chat
.
openai_serving_render
.
_preprocess_chat
=
AsyncMock
(
side_effect
=
_fake_preprocess_chat
)
return
serving_chat
return
serving_chat
...
...
tests/entrypoints/openai/test_completion_error.py
View file @
a2268617
...
@@ -13,6 +13,7 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
...
@@ -13,6 +13,7 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from
vllm.entrypoints.openai.engine.protocol
import
GenerationError
from
vllm.entrypoints.openai.engine.protocol
import
GenerationError
from
vllm.entrypoints.openai.models.protocol
import
BaseModelPath
from
vllm.entrypoints.openai.models.protocol
import
BaseModelPath
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.renderers.hf
import
HfRenderer
from
vllm.renderers.hf
import
HfRenderer
from
vllm.tokenizers.registry
import
tokenizer_args_from_config
from
vllm.tokenizers.registry
import
tokenizer_args_from_config
...
@@ -74,9 +75,19 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
...
@@ -74,9 +75,19 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
engine_client
=
engine
,
engine_client
=
engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
)
serving_render
=
OpenAIServingRender
(
model_config
=
engine
.
model_config
,
renderer
=
engine
.
renderer
,
io_processor
=
engine
.
io_processor
,
model_registry
=
models
.
registry
,
request_logger
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
)
return
OpenAIServingCompletion
(
return
OpenAIServingCompletion
(
engine
,
engine
,
models
,
models
,
openai_serving_render
=
serving_render
,
request_logger
=
None
,
request_logger
=
None
,
)
)
...
...
tests/entrypoints/openai/test_lora_resolvers.py
View file @
a2268617
...
@@ -14,6 +14,7 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
...
@@ -14,6 +14,7 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.models.protocol
import
BaseModelPath
from
vllm.entrypoints.openai.models.protocol
import
BaseModelPath
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.resolver
import
LoRAResolver
,
LoRAResolverRegistry
from
vllm.lora.resolver
import
LoRAResolver
,
LoRAResolverRegistry
from
vllm.renderers.hf
import
HfRenderer
from
vllm.renderers.hf
import
HfRenderer
...
@@ -145,8 +146,17 @@ def mock_serving_setup():
...
@@ -145,8 +146,17 @@ def mock_serving_setup():
base_model_paths
=
BASE_MODEL_PATHS
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
)
serving_render
=
OpenAIServingRender
(
model_config
=
mock_engine
.
model_config
,
renderer
=
mock_engine
.
renderer
,
io_processor
=
mock_engine
.
io_processor
,
model_registry
=
models
.
registry
,
request_logger
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
)
serving_completion
=
OpenAIServingCompletion
(
serving_completion
=
OpenAIServingCompletion
(
mock_engine
,
models
,
request_logger
=
None
mock_engine
,
models
,
openai_serving_render
=
serving_render
,
request_logger
=
None
)
)
return
mock_engine
,
serving_completion
return
mock_engine
,
serving_completion
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
a2268617
...
@@ -21,8 +21,13 @@ from vllm.entrypoints.openai.engine.protocol import (
...
@@ -21,8 +21,13 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse
,
ErrorResponse
,
RequestResponseMetadata
,
RequestResponseMetadata
,
)
)
from
vllm.entrypoints.openai.models.serving
import
BaseModelPath
,
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
(
BaseModelPath
,
OpenAIModelRegistry
,
OpenAIServingModels
,
)
from
vllm.entrypoints.openai.parser.harmony_utils
import
get_encoding
from
vllm.entrypoints.openai.parser.harmony_utils
import
get_encoding
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.exceptions
import
VLLMValidationError
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs
import
TokensPrompt
from
vllm.inputs
import
TokensPrompt
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
...
@@ -557,15 +562,32 @@ def _build_renderer(model_config: MockModelConfig):
...
@@ -557,15 +562,32 @@ def _build_renderer(model_config: MockModelConfig):
)
)
def _build_serving_render(
    engine, model_registry: OpenAIModelRegistry
) -> OpenAIServingRender:
    """Create the ``OpenAIServingRender`` used by the chat-serving tests.

    Args:
        engine: Object exposing ``model_config``, ``renderer`` and
            ``io_processor`` attributes; all three are forwarded unchanged.
        model_registry: Registry handed to the render layer.

    Returns:
        An ``OpenAIServingRender`` configured with the module-level
        ``CHAT_TEMPLATE``, the ``"auto"`` content format and no request
        logger.
    """
    # Collect the constructor arguments first so the wiring between the
    # engine and the render layer is visible in one place.
    render_kwargs = {
        "model_config": engine.model_config,
        "renderer": engine.renderer,
        "io_processor": engine.io_processor,
        "model_registry": model_registry,
        "request_logger": None,
        "chat_template": CHAT_TEMPLATE,
        "chat_template_content_format": "auto",
    }
    return OpenAIServingRender(**render_kwargs)
def
_build_serving_chat
(
engine
:
AsyncLLM
)
->
OpenAIServingChat
:
def
_build_serving_chat
(
engine
:
AsyncLLM
)
->
OpenAIServingChat
:
models
=
OpenAIServingModels
(
models
=
OpenAIServingModels
(
engine_client
=
engine
,
engine_client
=
engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
)
openai_serving_render
=
_build_serving_render
(
engine
,
models
.
registry
)
serving_chat
=
OpenAIServingChat
(
serving_chat
=
OpenAIServingChat
(
engine
,
engine
,
models
,
models
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
request_logger
=
None
,
...
@@ -586,10 +608,13 @@ async def _async_serving_chat_init():
...
@@ -586,10 +608,13 @@ async def _async_serving_chat_init():
engine
=
MockEngine
()
engine
=
MockEngine
()
models
=
OpenAIServingModels
(
engine
,
BASE_MODEL_PATHS
)
models
=
OpenAIServingModels
(
engine
,
BASE_MODEL_PATHS
)
openai_serving_render
=
_build_serving_render
(
engine
,
models
.
registry
)
serving_completion
=
OpenAIServingChat
(
serving_completion
=
OpenAIServingChat
(
engine
,
engine
,
models
,
models
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
request_logger
=
None
,
...
@@ -1182,7 +1207,9 @@ class TestServingChatWithHarmony:
...
@@ -1182,7 +1207,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
[
[
...
@@ -1209,7 +1236,9 @@ class TestServingChatWithHarmony:
...
@@ -1209,7 +1236,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_2
,
input_messages_2
,
[
[
...
@@ -1230,7 +1259,9 @@ class TestServingChatWithHarmony:
...
@@ -1230,7 +1259,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
[
[
...
@@ -1274,7 +1305,9 @@ class TestServingChatWithHarmony:
...
@@ -1274,7 +1305,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_2
,
input_messages_2
,
[
[
...
@@ -1311,7 +1344,9 @@ class TestServingChatWithHarmony:
...
@@ -1311,7 +1344,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
[
[
...
@@ -1355,7 +1390,9 @@ class TestServingChatWithHarmony:
...
@@ -1355,7 +1390,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_2
,
input_messages_2
,
[
[
...
@@ -1392,7 +1429,9 @@ class TestServingChatWithHarmony:
...
@@ -1392,7 +1429,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
[
[
...
@@ -1436,7 +1475,9 @@ class TestServingChatWithHarmony:
...
@@ -1436,7 +1475,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_2
,
input_messages_2
,
[
[
...
@@ -1486,7 +1527,9 @@ class TestServingChatWithHarmony:
...
@@ -1486,7 +1527,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the third turn's input
# Test the Harmony messages for the third turn's input
req_3
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req_3
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_3
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_3
)
input_messages_3
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_3
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_3
,
input_messages_3
,
[
[
...
@@ -1549,7 +1592,9 @@ class TestServingChatWithHarmony:
...
@@ -1549,7 +1592,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the fourth turn's input
# Test the Harmony messages for the fourth turn's input
req_4
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req_4
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_4
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_4
)
input_messages_4
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_4
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_4
,
input_messages_4
,
[
[
...
@@ -1598,7 +1643,9 @@ class TestServingChatWithHarmony:
...
@@ -1598,7 +1643,9 @@ class TestServingChatWithHarmony:
},
},
]
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
...
@@ -1629,7 +1676,9 @@ class TestServingChatWithHarmony:
...
@@ -1629,7 +1676,9 @@ class TestServingChatWithHarmony:
},
},
]
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
...
@@ -1658,7 +1707,9 @@ class TestServingChatWithHarmony:
...
@@ -1658,7 +1707,9 @@ class TestServingChatWithHarmony:
},
},
]
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
...
@@ -1689,11 +1740,14 @@ async def test_tool_choice_validation_without_parser():
...
@@ -1689,11 +1740,14 @@ async def test_tool_choice_validation_without_parser():
engine_client
=
mock_engine
,
engine_client
=
mock_engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
)
openai_serving_render
=
_build_serving_render
(
mock_engine
,
models
.
registry
)
# Create serving_chat without tool_parser (enable_auto_tools=False)
# Create serving_chat without tool_parser (enable_auto_tools=False)
serving_chat
=
OpenAIServingChat
(
serving_chat
=
OpenAIServingChat
(
mock_engine
,
mock_engine
,
models
,
models
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
request_logger
=
None
,
...
...
tests/v1/engine/test_async_llm.py
View file @
a2268617
...
@@ -508,11 +508,25 @@ async def test_header_dp_rank_argument():
...
@@ -508,11 +508,25 @@ async def test_header_dp_rank_argument():
base_model_paths
=
BASE_MODEL_PATHS
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
)
# Create render serving instance (required by OpenAIServingChat)
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
serving_render
=
OpenAIServingRender
(
model_config
=
engine
.
model_config
,
renderer
=
engine
.
renderer
,
io_processor
=
engine
.
io_processor
,
model_registry
=
models
.
registry
,
request_logger
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
)
# Create serving chat instance
# Create serving chat instance
serving_chat
=
OpenAIServingChat
(
serving_chat
=
OpenAIServingChat
(
engine_client
=
engine
,
engine_client
=
engine
,
models
=
models
,
models
=
models
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
openai_serving_render
=
serving_render
,
chat_template
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
request_logger
=
None
,
...
...
vllm/entrypoints/anthropic/serving.py
View file @
a2268617
...
@@ -10,7 +10,7 @@ import logging
...
@@ -10,7 +10,7 @@ import logging
import
time
import
time
import
uuid
import
uuid
from
collections.abc
import
AsyncGenerator
from
collections.abc
import
AsyncGenerator
from
typing
import
Any
from
typing
import
TYPE_CHECKING
,
Any
from
fastapi
import
Request
from
fastapi
import
Request
...
@@ -43,6 +43,9 @@ from vllm.entrypoints.openai.engine.protocol import (
...
@@ -43,6 +43,9 @@ from vllm.entrypoints.openai.engine.protocol import (
)
)
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
if
TYPE_CHECKING
:
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -59,6 +62,7 @@ class AnthropicServingMessages(OpenAIServingChat):
...
@@ -59,6 +62,7 @@ class AnthropicServingMessages(OpenAIServingChat):
models
:
OpenAIServingModels
,
models
:
OpenAIServingModels
,
response_role
:
str
,
response_role
:
str
,
*
,
*
,
openai_serving_render
:
"OpenAIServingRender"
,
request_logger
:
RequestLogger
|
None
,
request_logger
:
RequestLogger
|
None
,
chat_template
:
str
|
None
,
chat_template
:
str
|
None
,
chat_template_content_format
:
ChatTemplateContentFormatOption
,
chat_template_content_format
:
ChatTemplateContentFormatOption
,
...
@@ -73,6 +77,7 @@ class AnthropicServingMessages(OpenAIServingChat):
...
@@ -73,6 +77,7 @@ class AnthropicServingMessages(OpenAIServingChat):
engine_client
=
engine_client
,
engine_client
=
engine_client
,
models
=
models
,
models
=
models
,
response_role
=
response_role
,
response_role
=
response_role
,
openai_serving_render
=
openai_serving_render
,
request_logger
=
request_logger
,
request_logger
=
request_logger
,
chat_template
=
chat_template
,
chat_template
=
chat_template
,
chat_template_content_format
=
chat_template_content_format
,
chat_template_content_format
=
chat_template_content_format
,
...
...
vllm/entrypoints/openai/chat_completion/serving.py
View file @
a2268617
...
@@ -6,12 +6,11 @@ import json
...
@@ -6,12 +6,11 @@ import json
import
time
import
time
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
collections.abc
import
Sequence
as
GenericSequence
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
Any
,
Final
from
typing
import
TYPE_CHECKING
,
Any
,
Final
import
partial_json_parser
import
partial_json_parser
import
regex
as
re
import
regex
as
re
from
fastapi
import
Request
from
fastapi
import
Request
from
openai_harmony
import
Message
as
OpenAIMessage
from
partial_json_parser.core.options
import
Allow
from
partial_json_parser.core.options
import
Allow
from
vllm.engine.protocol
import
EngineClient
from
vllm.engine.protocol
import
EngineClient
...
@@ -56,17 +55,13 @@ from vllm.entrypoints.openai.engine.serving import (
...
@@ -56,17 +55,13 @@ from vllm.entrypoints.openai.engine.serving import (
)
)
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
OpenAIServingModels
from
vllm.entrypoints.openai.parser.harmony_utils
import
(
from
vllm.entrypoints.openai.parser.harmony_utils
import
(
get_developer_message
,
get_stop_tokens_for_assistant_actions
,
get_stop_tokens_for_assistant_actions
,
get_streamable_parser_for_assistant
,
get_streamable_parser_for_assistant
,
get_system_message
,
parse_chat_inputs_to_harmony_messages
,
parse_chat_output
,
parse_chat_output
,
render_for_completion
,
)
)
from
vllm.entrypoints.openai.utils
import
maybe_filter_parallel_tool_calls
from
vllm.entrypoints.openai.utils
import
maybe_filter_parallel_tool_calls
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.entrypoints.utils
import
get_max_tokens
,
should_include_usage
from
vllm.inputs.data
import
ProcessorInputs
,
TokensPrompt
from
vllm.inputs.data
import
ProcessorInputs
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.logprobs
import
Logprob
from
vllm.logprobs
import
Logprob
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
...
@@ -80,7 +75,9 @@ from vllm.tool_parsers.mistral_tool_parser import MistralToolCall
...
@@ -80,7 +75,9 @@ from vllm.tool_parsers.mistral_tool_parser import MistralToolCall
from
vllm.tool_parsers.utils
import
partial_json_loads
from
vllm.tool_parsers.utils
import
partial_json_loads
from
vllm.utils.collection_utils
import
as_list
from
vllm.utils.collection_utils
import
as_list
from
vllm.utils.mistral
import
is_mistral_tokenizer
from
vllm.utils.mistral
import
is_mistral_tokenizer
from
vllm.utils.mistral
import
mt
as
_mt
if
TYPE_CHECKING
:
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -92,6 +89,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -92,6 +89,7 @@ class OpenAIServingChat(OpenAIServing):
models
:
OpenAIServingModels
,
models
:
OpenAIServingModels
,
response_role
:
str
,
response_role
:
str
,
*
,
*
,
openai_serving_render
:
"OpenAIServingRender"
,
request_logger
:
RequestLogger
|
None
,
request_logger
:
RequestLogger
|
None
,
chat_template
:
str
|
None
,
chat_template
:
str
|
None
,
chat_template_content_format
:
ChatTemplateContentFormatOption
,
chat_template_content_format
:
ChatTemplateContentFormatOption
,
...
@@ -114,6 +112,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -114,6 +112,7 @@ class OpenAIServingChat(OpenAIServing):
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
)
)
self
.
openai_serving_render
=
openai_serving_render
self
.
response_role
=
response_role
self
.
response_role
=
response_role
self
.
chat_template
=
chat_template
self
.
chat_template
=
chat_template
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
self
.
chat_template_content_format
:
Final
=
chat_template_content_format
...
@@ -186,7 +185,10 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -186,7 +185,10 @@ class OpenAIServingChat(OpenAIServing):
request
:
ChatCompletionRequest
,
request
:
ChatCompletionRequest
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]
|
ErrorResponse
:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]
|
ErrorResponse
:
"""
"""
render chat request by validating and preprocessing inputs.
Validate the model and preprocess a chat completion request.
Delegates preprocessing logic to OpenAIServingRender, adding the
engine-aware checks (LoRA model validation, engine health).
Returns:
Returns:
A tuple of (conversation, engine_prompts) on success,
A tuple of (conversation, engine_prompts) on success,
...
@@ -203,78 +205,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -203,78 +205,7 @@ class OpenAIServingChat(OpenAIServing):
if
self
.
engine_client
.
errored
:
if
self
.
engine_client
.
errored
:
raise
self
.
engine_client
.
dead_error
raise
self
.
engine_client
.
dead_error
tokenizer
=
self
.
renderer
.
tokenizer
return
await
self
.
openai_serving_render
.
render_chat
(
request
)
tool_parser
=
self
.
tool_parser
if
is_mistral_tokenizer
(
tokenizer
):
# because of issues with pydantic we need to potentially
# re-serialize the tool_calls field of the request
# for more info: see comment in `maybe_serialize_tool_calls`
_mt
.
maybe_serialize_tool_calls
(
request
)
# type: ignore[arg-type]
_mt
.
truncate_tool_call_ids
(
request
)
# type: ignore[arg-type]
_mt
.
validate_request_params
(
request
)
# Check if tool parsing is unavailable (common condition)
tool_parsing_unavailable
=
(
tool_parser
is
None
and
not
is_mistral_tokenizer
(
tokenizer
)
and
not
self
.
use_harmony
)
# Validate tool_choice when tool parsing is required but unavailable
if
tool_parsing_unavailable
and
request
.
tool_choice
not
in
(
None
,
"none"
,
):
if
request
.
tool_choice
==
"auto"
and
not
self
.
enable_auto_tools
:
# for hf tokenizers, "auto" tools requires
# --enable-auto-tool-choice and --tool-call-parser
return
self
.
create_error_response
(
'"auto" tool choice requires '
"--enable-auto-tool-choice and --tool-call-parser to be set"
)
elif
request
.
tool_choice
!=
"auto"
:
# "required" or named tool requires tool parser
return
self
.
create_error_response
(
f
'tool_choice="
{
request
.
tool_choice
}
" requires '
"--tool-call-parser to be set"
)
if
request
.
tools
is
None
or
(
request
.
tool_choice
==
"none"
and
self
.
exclude_tools_when_tool_choice_none
):
tool_dicts
=
None
else
:
tool_dicts
=
[
tool
.
model_dump
()
for
tool
in
request
.
tools
]
if
not
self
.
use_harmony
:
# Common case.
error_check_ret
=
self
.
_validate_chat_template
(
request_chat_template
=
request
.
chat_template
,
chat_template_kwargs
=
request
.
chat_template_kwargs
,
trust_request_chat_template
=
self
.
trust_request_chat_template
,
)
if
error_check_ret
is
not
None
:
return
error_check_ret
conversation
,
engine_prompts
=
await
self
.
_preprocess_chat
(
request
,
request
.
messages
,
default_template
=
self
.
chat_template
,
default_template_content_format
=
self
.
chat_template_content_format
,
default_template_kwargs
=
self
.
default_chat_template_kwargs
,
tool_dicts
=
tool_dicts
,
tool_parser
=
tool_parser
,
)
else
:
# For GPT-OSS.
should_include_tools
=
tool_dicts
is
not
None
conversation
,
engine_prompts
=
self
.
_make_request_with_harmony
(
request
,
should_include_tools
)
return
conversation
,
engine_prompts
async
def
create_chat_completion
(
async
def
create_chat_completion
(
self
,
self
,
...
@@ -1875,50 +1806,3 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -1875,50 +1806,3 @@ class OpenAIServingChat(OpenAIServing):
)
)
]
]
)
)
def
_make_request_with_harmony
(
self
,
request
:
ChatCompletionRequest
,
should_include_tools
:
bool
=
True
,
):
messages
:
list
[
OpenAIMessage
]
=
[]
# because of issues with pydantic we need to potentially
# re-serialize the tool_calls field of the request
# for more info: see comment in `maybe_serialize_tool_calls`
_mt
.
maybe_serialize_tool_calls
(
request
)
# type: ignore[arg-type]
# Add system message.
# NOTE: In Chat Completion API, browsing is enabled by default
# if the model supports it. TODO: Support browsing.
assert
not
self
.
supports_browsing
assert
not
self
.
supports_code_interpreter
if
(
reasoning_effort
:
=
request
.
reasoning_effort
)
==
"none"
:
raise
ValueError
(
f
"Harmony does not support
{
reasoning_effort
=
}
"
)
sys_msg
=
get_system_message
(
reasoning_effort
=
reasoning_effort
,
browser_description
=
None
,
python_description
=
None
,
with_custom_tools
=
should_include_tools
,
)
messages
.
append
(
sys_msg
)
# Add developer message.
if
request
.
tools
:
dev_msg
=
get_developer_message
(
tools
=
request
.
tools
if
should_include_tools
else
None
# type: ignore[arg-type]
)
messages
.
append
(
dev_msg
)
# Add user message.
messages
.
extend
(
parse_chat_inputs_to_harmony_messages
(
request
.
messages
))
# Render prompt token ids.
prompt_token_ids
=
render_for_completion
(
messages
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
prompt_token_ids
)
# Add cache_salt if provided in the request
if
request
.
cache_salt
is
not
None
:
engine_prompt
[
"cache_salt"
]
=
request
.
cache_salt
return
messages
,
[
engine_prompt
]
vllm/entrypoints/openai/completion/serving.py
View file @
a2268617
...
@@ -5,7 +5,7 @@ import asyncio
...
@@ -5,7 +5,7 @@ import asyncio
import
time
import
time
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
collections.abc
import
Sequence
as
GenericSequence
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
cast
from
typing
import
TYPE_CHECKING
,
cast
from
fastapi
import
Request
from
fastapi
import
Request
...
@@ -42,6 +42,9 @@ from vllm.tokenizers import TokenizerLike
...
@@ -42,6 +42,9 @@ from vllm.tokenizers import TokenizerLike
from
vllm.utils.async_utils
import
merge_async_iterators
from
vllm.utils.async_utils
import
merge_async_iterators
from
vllm.utils.collection_utils
import
as_list
from
vllm.utils.collection_utils
import
as_list
if
TYPE_CHECKING
:
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -51,6 +54,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -51,6 +54,7 @@ class OpenAIServingCompletion(OpenAIServing):
engine_client
:
EngineClient
,
engine_client
:
EngineClient
,
models
:
OpenAIServingModels
,
models
:
OpenAIServingModels
,
*
,
*
,
openai_serving_render
:
"OpenAIServingRender"
,
request_logger
:
RequestLogger
|
None
,
request_logger
:
RequestLogger
|
None
,
return_tokens_as_token_ids
:
bool
=
False
,
return_tokens_as_token_ids
:
bool
=
False
,
enable_prompt_tokens_details
:
bool
=
False
,
enable_prompt_tokens_details
:
bool
=
False
,
...
@@ -63,6 +67,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -63,6 +67,7 @@ class OpenAIServingCompletion(OpenAIServing):
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
)
)
self
.
openai_serving_render
=
openai_serving_render
self
.
enable_prompt_tokens_details
=
enable_prompt_tokens_details
self
.
enable_prompt_tokens_details
=
enable_prompt_tokens_details
self
.
enable_force_include_usage
=
enable_force_include_usage
self
.
enable_force_include_usage
=
enable_force_include_usage
...
@@ -79,7 +84,10 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -79,7 +84,10 @@ class OpenAIServingCompletion(OpenAIServing):
request
:
CompletionRequest
,
request
:
CompletionRequest
,
)
->
list
[
ProcessorInputs
]
|
ErrorResponse
:
)
->
list
[
ProcessorInputs
]
|
ErrorResponse
:
"""
"""
render completion request by validating and preprocessing inputs.
Validate the model and preprocess a completion request.
Delegates preprocessing logic to OpenAIServingRender, adding the
engine-aware checks (LoRA model validation, engine health).
Returns:
Returns:
A list of engine_prompts on success,
A list of engine_prompts on success,
...
@@ -95,25 +103,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -95,25 +103,7 @@ class OpenAIServingCompletion(OpenAIServing):
if
self
.
engine_client
.
errored
:
if
self
.
engine_client
.
errored
:
raise
self
.
engine_client
.
dead_error
raise
self
.
engine_client
.
dead_error
# Return error for unsupported features.
return
await
self
.
openai_serving_render
.
render_completion
(
request
)
if
request
.
suffix
is
not
None
:
return
self
.
create_error_response
(
"suffix is not currently supported"
)
if
request
.
echo
and
request
.
prompt_embeds
is
not
None
:
return
self
.
create_error_response
(
"Echo is unsupported with prompt embeds."
)
if
request
.
prompt_logprobs
is
not
None
and
request
.
prompt_embeds
is
not
None
:
return
self
.
create_error_response
(
"prompt_logprobs is not compatible with prompt embeds."
)
engine_prompts
=
await
self
.
_preprocess_completion
(
request
,
prompt_input
=
request
.
prompt
,
prompt_embeds
=
request
.
prompt_embeds
,
)
return
engine_prompts
async
def
create_completion
(
async
def
create_completion
(
self
,
self
,
...
...
vllm/entrypoints/openai/generate/api_router.py
View file @
a2268617
...
@@ -72,6 +72,29 @@ async def init_generate_state(
...
@@ -72,6 +72,29 @@ async def init_generate_state(
tool_server
=
None
tool_server
=
None
resolved_chat_template
=
load_chat_template
(
args
.
chat_template
)
resolved_chat_template
=
load_chat_template
(
args
.
chat_template
)
# Render endpoints are always backed by OpenAIServingRender so that
# /v1/chat/completions/render and /v1/completions/render work on both
# generate-mode and render-only servers.
# It is created first so that OpenAIServingChat and OpenAIServingCompletion
# can delegate their preprocessing logic to it.
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
state
.
openai_serving_render
=
OpenAIServingRender
(
model_config
=
engine_client
.
model_config
,
renderer
=
engine_client
.
renderer
,
io_processor
=
engine_client
.
io_processor
,
model_registry
=
state
.
openai_serving_models
.
registry
,
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
trust_request_chat_template
=
args
.
trust_request_chat_template
,
enable_auto_tools
=
args
.
enable_auto_tool_choice
,
exclude_tools_when_tool_choice_none
=
args
.
exclude_tools_when_tool_choice_none
,
tool_parser
=
args
.
tool_call_parser
,
default_chat_template_kwargs
=
args
.
default_chat_template_kwargs
,
log_error_stack
=
args
.
log_error_stack
,
)
state
.
openai_serving_responses
=
(
state
.
openai_serving_responses
=
(
OpenAIServingResponses
(
OpenAIServingResponses
(
engine_client
,
engine_client
,
...
@@ -96,6 +119,7 @@ async def init_generate_state(
...
@@ -96,6 +119,7 @@ async def init_generate_state(
engine_client
,
engine_client
,
state
.
openai_serving_models
,
state
.
openai_serving_models
,
args
.
response_role
,
args
.
response_role
,
openai_serving_render
=
state
.
openai_serving_render
,
request_logger
=
request_logger
,
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
chat_template_content_format
=
args
.
chat_template_content_format
,
...
@@ -120,6 +144,7 @@ async def init_generate_state(
...
@@ -120,6 +144,7 @@ async def init_generate_state(
OpenAIServingCompletion
(
OpenAIServingCompletion
(
engine_client
,
engine_client
,
state
.
openai_serving_models
,
state
.
openai_serving_models
,
openai_serving_render
=
state
.
openai_serving_render
,
request_logger
=
request_logger
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
args
.
return_tokens_as_token_ids
,
return_tokens_as_token_ids
=
args
.
return_tokens_as_token_ids
,
enable_prompt_tokens_details
=
args
.
enable_prompt_tokens_details
,
enable_prompt_tokens_details
=
args
.
enable_prompt_tokens_details
,
...
@@ -133,6 +158,7 @@ async def init_generate_state(
...
@@ -133,6 +158,7 @@ async def init_generate_state(
engine_client
,
engine_client
,
state
.
openai_serving_models
,
state
.
openai_serving_models
,
args
.
response_role
,
args
.
response_role
,
openai_serving_render
=
state
.
openai_serving_render
,
request_logger
=
request_logger
,
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
chat_template_content_format
=
args
.
chat_template_content_format
,
...
@@ -159,24 +185,3 @@ async def init_generate_state(
...
@@ -159,24 +185,3 @@ async def init_generate_state(
if
"generate"
in
supported_tasks
if
"generate"
in
supported_tasks
else
None
else
None
)
)
# Render endpoints are always backed by OpenAIServingRender so that
# /v1/chat/completions/render and /v1/completions/render work on both
# generate-mode and render-only servers.
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
state
.
openai_serving_render
=
OpenAIServingRender
(
model_config
=
engine_client
.
model_config
,
renderer
=
engine_client
.
renderer
,
io_processor
=
engine_client
.
io_processor
,
model_registry
=
state
.
openai_serving_models
.
registry
,
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
trust_request_chat_template
=
args
.
trust_request_chat_template
,
enable_auto_tools
=
args
.
enable_auto_tool_choice
,
exclude_tools_when_tool_choice_none
=
args
.
exclude_tools_when_tool_choice_none
,
tool_parser
=
args
.
tool_call_parser
,
default_chat_template_kwargs
=
args
.
default_chat_template_kwargs
,
log_error_stack
=
args
.
log_error_stack
,
)
vllm/entrypoints/serve/render/serving.py
View file @
a2268617
...
@@ -87,15 +87,26 @@ class OpenAIServingRender:
...
@@ -87,15 +87,26 @@ class OpenAIServingRender:
self
,
self
,
request
:
ChatCompletionRequest
,
request
:
ChatCompletionRequest
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]
|
ErrorResponse
:
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]
|
ErrorResponse
:
"""
Copied from OpenAIServingChat.render_chat_
request.
"""
Validate the model and preprocess a chat completion
request.
Differences: engine_client.errored check removed (no engine client).
This is the authoritative implementation used directly by the
GPU-less render server and delegated to by OpenAIServingChat.
"""
"""
error_check_ret
=
await
self
.
_check_model
(
request
)
error_check_ret
=
await
self
.
_check_model
(
request
)
if
error_check_ret
is
not
None
:
if
error_check_ret
is
not
None
:
logger
.
error
(
"Error with model %s"
,
error_check_ret
)
logger
.
error
(
"Error with model %s"
,
error_check_ret
)
return
error_check_ret
return
error_check_ret
return
await
self
.
render_chat
(
request
)
async
def
render_chat
(
self
,
request
:
ChatCompletionRequest
,
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ProcessorInputs
]]
|
ErrorResponse
:
"""Core preprocessing logic for chat requests (no model/engine check).
Called directly by render_chat_request and delegated to by
OpenAIServingChat.render_chat_request after its engine-aware checks.
"""
tokenizer
=
self
.
renderer
.
tokenizer
tokenizer
=
self
.
renderer
.
tokenizer
tool_parser
=
self
.
tool_parser
tool_parser
=
self
.
tool_parser
...
@@ -173,14 +184,25 @@ class OpenAIServingRender:
...
@@ -173,14 +184,25 @@ class OpenAIServingRender:
self
,
self
,
request
:
CompletionRequest
,
request
:
CompletionRequest
,
)
->
list
[
ProcessorInputs
]
|
ErrorResponse
:
)
->
list
[
ProcessorInputs
]
|
ErrorResponse
:
"""
Copied from OpenAIServingCompletion.render_
completion
_
request.
"""
Validate the model and preprocess a
completion
request.
Differences: engine_client.errored check removed (no engine client).
This is the authoritative implementation used directly by the
GPU-less render server and delegated to by OpenAIServingCompletion.
"""
"""
error_check_ret
=
await
self
.
_check_model
(
request
)
error_check_ret
=
await
self
.
_check_model
(
request
)
if
error_check_ret
is
not
None
:
if
error_check_ret
is
not
None
:
return
error_check_ret
return
error_check_ret
return
await
self
.
render_completion
(
request
)
async
def
render_completion
(
self
,
request
:
CompletionRequest
,
)
->
list
[
ProcessorInputs
]
|
ErrorResponse
:
"""Core preprocessing logic for completion requests (no model/engine check).
Called directly by render_completion_request and delegated to by
OpenAIServingCompletion.render_completion_request after its engine-aware checks.
"""
# Return error for unsupported features.
# Return error for unsupported features.
if
request
.
suffix
is
not
None
:
if
request
.
suffix
is
not
None
:
return
self
.
create_error_response
(
"suffix is not currently supported"
)
return
self
.
create_error_response
(
"suffix is not currently supported"
)
...
@@ -206,7 +228,7 @@ class OpenAIServingRender:
...
@@ -206,7 +228,7 @@ class OpenAIServingRender:
request
:
ChatCompletionRequest
,
request
:
ChatCompletionRequest
,
should_include_tools
:
bool
=
True
,
should_include_tools
:
bool
=
True
,
):
):
"""
Copied from OpenAIServingChat._make_request_with_harmony
."""
"""
Build Harmony (GPT-OSS) messages and engine prompt from a chat request
."""
messages
:
list
[
OpenAIMessage
]
=
[]
messages
:
list
[
OpenAIMessage
]
=
[]
# because of issues with pydantic we need to potentially
# because of issues with pydantic we need to potentially
...
@@ -219,11 +241,10 @@ class OpenAIServingRender:
...
@@ -219,11 +241,10 @@ class OpenAIServingRender:
# if the model supports it. TODO: Support browsing.
# if the model supports it. TODO: Support browsing.
assert
not
self
.
supports_browsing
assert
not
self
.
supports_browsing
assert
not
self
.
supports_code_interpreter
assert
not
self
.
supports_code_interpreter
assert
request
.
reasoning_effort
!=
"none"
,
(
if
(
reasoning_effort
:
=
request
.
reasoning_effort
)
==
"none"
:
"Harmony does not support reasoning_effort='none'"
raise
ValueError
(
f
"Harmony does not support
{
reasoning_effort
=
}
"
)
)
sys_msg
=
get_system_message
(
sys_msg
=
get_system_message
(
reasoning_effort
=
request
.
reasoning_effort
,
reasoning_effort
=
reasoning_effort
,
browser_description
=
None
,
browser_description
=
None
,
python_description
=
None
,
python_description
=
None
,
with_custom_tools
=
should_include_tools
,
with_custom_tools
=
should_include_tools
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment