[gpt-oss] Add IncompleteDetails to ResponsesResponse (#24561)

Signed-off-by: Andrew Xia <axia@meta.com>

[gpt-oss] Add IncompleteDetails to ResponsesResponse (#24561)
Signed-off-by: Andrew Xia <axia@meta.com>
25aba2b6 · Andrew Xia · GitHub · 94b03f88 · 25aba2b6 · 25aba2b6
Unverified Commit 25aba2b6 authored Sep 15, 2025 by Andrew Xia Committed by GitHub Sep 15, 2025
7 changed files
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
    assert response.status == "completed"
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_max_tokens(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the first paragraph of Moby Dick?",
+        reasoning={"effort": "low"},
+        max_output_tokens=30,
+    )
+    assert response is not None
+    assert response.status == "incomplete"
+    assert response.incomplete_details.reason == "max_output_tokens"
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_chat(client: OpenAI, model_name: str):

--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext):
        available_tools: list[str],
    ):
        self._messages = messages
+        self.finish_reason: Optional[str] = None
        self.available_tools = available_tools
        self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
        self.called_tools: set[str] = set()
@@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext):
        if self.parser.current_channel in {"analysis", "commentary"}:
            self.num_reasoning_tokens += 1
-    def append_output(self, output) -> None:
+    def append_output(self, output: Union[RequestOutput,
+                                          list[Message]]) -> None:
        if isinstance(output, RequestOutput):
            output_token_ids = output.outputs[0].token_ids
            self.parser = get_streamable_parser_for_assistant()
@@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext):
            # Move current turn to previous turn for next turn's calculations
            self.previous_turn = self.current_turn.copy()
            output_msgs = self.parser.messages
+            # The responses finish reason is set in the last message
+            self.finish_reason = output.outputs[0].finish_reason
        else:
            # Tool output.
            output_msgs = output
@@ -157,18 +161,18 @@ class HarmonyContext(ConversationContext):
    def _update_prefill_token_usage(self, output: RequestOutput) -> None:
        """Update token usage statistics for the prefill phase of generation.
        The prefill phase processes the input prompt tokens. This method:
        1. Counts the prompt tokens for this turn
        2. Calculates tool output tokens for multi-turn conversations
        3. Updates cached token counts
        4. Tracks state for next turn calculations
        Tool output tokens are calculated as:
-        current_prompt_tokens - last_turn_prompt_tokens - 
+        current_prompt_tokens - last_turn_prompt_tokens -
        last_turn_output_tokens
        This represents tokens added between turns (typically tool responses).
        Args:
            output: The RequestOutput containing prompt token information
        """
@@ -214,18 +218,18 @@ class HarmonyContext(ConversationContext):
    def _update_decode_token_usage(self, output: RequestOutput) -> int:
        """Update token usage statistics for the decode phase of generation.
        The decode phase processes the generated output tokens. This method:
        1. Counts output tokens from all completion outputs
        2. Updates the total output token count
        3. Tracks tokens generated in the current turn
        In streaming mode, this is called for each token generated.
        In non-streaming mode, this is called once with all output tokens.
        Args:
            output: The RequestOutput containing generated token information
        Returns:
            int: Number of output tokens processed in this call
        """
@@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext):
    def messages(self) -> list:
        return self.parser.messages
-    def append_output(self, output) -> None:
+    def append_output(self, output: Union[RequestOutput,
+                                          list[Message]]) -> None:
        if isinstance(output, RequestOutput):
            # append_output is called for each output token in streaming case,
            # so we only want to add the prompt tokens once for each message.

--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -387,7 +387,9 @@ def parse_remaining_state(
            id=f"msg_{random_uuid()}",
            content=[output_text],
            role="assistant",
-            status="completed",
+            # if the parser still has messages (ie if the generator got cut
+            # abruptly), this should be incomplete
+            status="incomplete",
            type="message",
        )
        return [text_item]

--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -30,7 +30,7 @@ except ImportError:  # For newer openai versions (>= 1.100.0)
    from openai.types.responses import (ResponseFormatTextConfig as
                                        ResponseTextConfig)
-from openai.types.responses.response import ToolChoice
+from openai.types.responses.response import IncompleteDetails, ToolChoice
 from openai.types.responses.tool import Tool
 from openai.types.shared import Metadata, Reasoning
 from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
    created_at: int = Field(default_factory=lambda: int(time.time()))
    # error: Optional[ResponseError] = None
-    # incomplete_details: Optional[IncompleteDetails] = None
+    incomplete_details: Optional[IncompleteDetails] = None
    instructions: Optional[str] = None
    metadata: Optional[Metadata] = None
    model: str
@@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel):
        status: ResponseStatus,
        usage: Optional[ResponseUsage] = None,
    ) -> "ResponsesResponse":
+        incomplete_details: Optional[IncompleteDetails] = None
+        if status == 'incomplete':
+            incomplete_details = IncompleteDetails(reason='max_output_tokens')
+        # TODO: implement the other reason for incomplete_details,
+        # which is content_filter
+        # incomplete_details = IncompleteDetails(reason='content_filter')
        return cls(
            id=request.request_id,
            created_at=created_time,
+            incomplete_details=incomplete_details,
            instructions=request.instructions,
            metadata=request.metadata,
            model=model_name,
@@ -2109,7 +2118,7 @@ class DetokenizeResponse(OpenAIBaseModel):
 class TokenizerInfoResponse(OpenAIBaseModel):
    """
-    Response containing tokenizer configuration 
+    Response containing tokenizer configuration
    equivalent to tokenizer_config.json
    """
@@ -2199,7 +2208,7 @@ class TranscriptionRequest(OpenAIBaseModel):
    to_language: Optional[str] = None
    """The language of the output audio we transcribe to.
-    Please note that this is not currently used by supported models at this 
+    Please note that this is not currently used by supported models at this
    time, but it is a placeholder for future use, matching translation api.
    """

--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent,
                                    ResponseReasoningItem,
                                    ResponseReasoningTextDeltaEvent,
                                    ResponseReasoningTextDoneEvent,
-                                    response_text_delta_event)
+                                    ResponseStatus, response_text_delta_event)
 from openai.types.responses.response_output_text import (Logprob,
                                                         LogprobTopLogprob)
 # yapf: enable
@@ -461,10 +461,22 @@ class OpenAIServingResponses(OpenAIServing):
                # TODO: Use a vllm-specific Validation Error
                return self.create_error_response(str(e))
+        # NOTE: Implementation of status is still WIP, but for now
+        # we guarantee that if the status is not "completed", it is accurate.
+        # "completed" is implemented as the "catch-all" for now.
+        status: ResponseStatus = "completed"
        if self.use_harmony:
            assert isinstance(context, HarmonyContext)
            output = self._make_response_output_items_with_harmony(context)
            num_tool_output_tokens = context.num_tool_output_tokens
+            if len(output) > 0:
+                if context.finish_reason == "length":
+                    status = "incomplete"
+                elif context.finish_reason == "abort":
+                    status = "cancelled"
+            else:
+                status = "incomplete"
        else:
            assert isinstance(context, SimpleContext)
            final_res = context.last_output
@@ -501,7 +513,7 @@ class OpenAIServingResponses(OpenAIServing):
            model_name=model_name,
            created_time=created_time,
            output=output,
-            status="completed",
+            status=status,
            usage=usage,
        )
@@ -658,7 +670,7 @@ class OpenAIServingResponses(OpenAIServing):
        self,
        context: HarmonyContext,
    ) -> list[ResponseOutputItem]:
-        output_items = []
+        output_items: list[ResponseOutputItem] = []
        num_init_messages = context.num_init_messages
        for msg in context.messages[num_init_messages:]:
            output_items.extend(parse_output_message(msg))

--- a/vllm/v1/core/sched/utils.py
+++ b/vllm/v1/core/sched/utils.py
@@ -10,19 +10,19 @@ from vllm.v1.request import Request, RequestStatus
 def remove_all(lst: list, items_to_remove: set) -> list:
    """Remove all items from a list that are in the items_to_remove set.
    This method optimizes for the common case of removing a single item,
    falling back to list comprehension for multiple items.
    Args:
        lst: The list to remove items from
        items_to_remove: Set of items to remove
    Returns:
        Either the modified original list (for single item removal) or
        a new list (for multiple item removal). Callers should use the
        returned value.
    Note:
        For single item removal, this modifies the original list in-place
        and returns it. For multiple items, it creates and returns a new list.

--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -373,17 +373,17 @@ class OutputProcessor:
        1) Compute stats for logging
        2) Detokenize
        3) Create and handle RequestOutput objects:
-            * If there is a queue (for usage with AsyncLLM), 
+            * If there is a queue (for usage with AsyncLLM),
              put the RequestOutput objects into the queue for
              handling by the per-request generate() tasks.
-            * If there is no queue (for usage with LLMEngine), 
+            * If there is no queue (for usage with LLMEngine),
              return a list of RequestOutput objects.
        NOTE FOR DEVELOPERS
        vLLM V1 minimizes the number of python loops over the full
-        batch to ensure system overheads are minimized. This is the 
+        batch to ensure system overheads are minimized. This is the
        only function that should loop over EngineCoreOutputs.
        If you need to touch every element of the batch, do it from