[gpt-oss][1/N] EZ: refactor serving_responses for modularity (#26948)

Signed-off-by: Andrew Xia <axia@meta.com>

[gpt-oss][1/N] EZ: refactor serving_responses for modularity (#26948)
Signed-off-by: Andrew Xia <axia@meta.com>
e6ba2000 · Andrew Xia · GitHub · aa255ff5 · e6ba2000
Unverified commit e6ba2000 — authored Oct 16, 2025 by Andrew Xia; committed by GitHub on Oct 16, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 70 additions and 54 deletions

vllm/entrypoints/openai/serving_responses.py vllm/entrypoints/openai/serving_responses.py +70 -54

No files found.
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -227,6 +227,29 @@ class OpenAIServingResponses(OpenAIServing):
            )
        return None
+    def _validate_create_responses_input(
+        self, request: ResponsesRequest
+    ) -> ErrorResponse | None:
+        if self.use_harmony and request.is_include_output_logprobs():
+            return self.create_error_response(
+                err_type="invalid_request_error",
+                message="logprobs are not supported with gpt-oss models",
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+        if request.store and not self.enable_store and request.background:
+            return self.create_error_response(
+                err_type="invalid_request_error",
+                message=(
+                    "This vLLM engine does not support `store=True` and "
+                    "therefore does not support the background mode. To "
+                    "enable these features, set the environment variable "
+                    "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
+                    "the vLLM server."
+                ),
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+        return None
    async def create_responses(
        self,
        request: ResponsesRequest,
@@ -240,6 +263,9 @@ class OpenAIServingResponses(OpenAIServing):
        if error_check_ret is not None:
            logger.error("Error with model %s", error_check_ret)
            return error_check_ret
+        maybe_validation_error = self._validate_create_responses_input(request)
+        if maybe_validation_error is not None:
+            return maybe_validation_error
        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
@@ -248,18 +274,6 @@ class OpenAIServingResponses(OpenAIServing):
            raise self.engine_client.dead_error
        if request.store and not self.enable_store:
-            if request.background:
-                return self.create_error_response(
-                    err_type="invalid_request_error",
-                    message=(
-                        "This vLLM engine does not support `store=True` and "
-                        "therefore does not support the background mode. To "
-                        "enable these features, set the environment variable "
-                        "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
-                        "the vLLM server."
-                    ),
-                    status_code=HTTPStatus.BAD_REQUEST,
-                )
            # Disable the store option.
            # NOTE(woosuk): Although returning an error is possible, we opted
            # to implicitly disable store and process the request anyway, as
@@ -267,12 +281,6 @@ class OpenAIServingResponses(OpenAIServing):
            # (i.e., their request's `store=True` just because it's the default
            # value).
            request.store = False
-        if self.use_harmony and request.is_include_output_logprobs():
-            return self.create_error_response(
-                err_type="invalid_request_error",
-                message="logprobs are not supported with gpt-oss models",
-                status_code=HTTPStatus.BAD_REQUEST,
-            )
        # Handle the previous response ID.
        prev_response_id = request.previous_response_id
@@ -849,6 +857,47 @@ class OpenAIServingResponses(OpenAIServing):
            messages.extend(request.input)  # type: ignore
        return messages
+    def _construct_harmony_system_input_message(
+        self, request: ResponsesRequest, with_custom_tools: bool, tool_types: list[str]
+    ) -> OpenAIHarmonyMessage:
+        reasoning_effort = request.reasoning.effort if request.reasoning else None
+        enable_browser = (
+            "web_search_preview" in tool_types
+            and self.tool_server is not None
+            and self.tool_server.has_tool("browser")
+        )
+        enable_code_interpreter = (
+            "code_interpreter" in tool_types
+            and self.tool_server is not None
+            and self.tool_server.has_tool("python")
+        )
+        enable_container = (
+            "container" in tool_types
+            and self.tool_server is not None
+            and self.tool_server.has_tool("container")
+        )
+        sys_msg = get_system_message(
+            reasoning_effort=reasoning_effort,
+            browser_description=(
+                self.tool_server.get_tool_description("browser")
+                if enable_browser and self.tool_server is not None
+                else None
+            ),
+            python_description=(
+                self.tool_server.get_tool_description("python")
+                if enable_code_interpreter and self.tool_server is not None
+                else None
+            ),
+            container_description=(
+                self.tool_server.get_tool_description("container")
+                if enable_container and self.tool_server is not None
+                else None
+            ),
+            instructions=request.instructions,
+            with_custom_tools=with_custom_tools,
+        )
+        return sys_msg
    def _construct_input_messages_with_harmony(
        self,
        request: ResponsesRequest,
@@ -857,9 +906,7 @@ class OpenAIServingResponses(OpenAIServing):
        messages: list[OpenAIHarmonyMessage] = []
        if prev_response is None:
            # New conversation.
-            reasoning_effort = request.reasoning.effort if request.reasoning else None
            tool_types = [tool.type for tool in request.tools]
            # Allow the MCP Tool type to enable built in tools if the
            # server_label is allowlisted in
            # envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS
@@ -870,41 +917,10 @@ class OpenAIServingResponses(OpenAIServing):
                        and tool.server_label in envs.GPT_OSS_SYSTEM_TOOL_MCP_LABELS
                    ):
                        tool_types.append(tool.server_label)
-            enable_browser = (
-                "web_search_preview" in tool_types
-                and self.tool_server is not None
-                and self.tool_server.has_tool("browser")
-            )
-            enable_code_interpreter = (
-                "code_interpreter" in tool_types
-                and self.tool_server is not None
-                and self.tool_server.has_tool("python")
-            )
-            enable_container = (
-                "container" in tool_types
-                and self.tool_server is not None
-                and self.tool_server.has_tool("container")
-            )
            with_custom_tools = has_custom_tools(tool_types)
-            sys_msg = get_system_message(
-                reasoning_effort=reasoning_effort,
+            sys_msg = self._construct_harmony_system_input_message(
-                browser_description=(
+                request, with_custom_tools, tool_types
-                    self.tool_server.get_tool_description("browser")
-                    if enable_browser and self.tool_server is not None
-                    else None
-                ),
-                python_description=(
-                    self.tool_server.get_tool_description("python")
-                    if enable_code_interpreter and self.tool_server is not None
-                    else None
-                ),
-                container_description=(
-                    self.tool_server.get_tool_description("container")
-                    if enable_container and self.tool_server is not None
-                    else None
-                ),
-                instructions=request.instructions,
-                with_custom_tools=with_custom_tools,
            )
            messages.append(sys_msg)
            if with_custom_tools: