[gpt-oss] Add IncompleteDetails to ResponsesRepsonse (#24561)

Signed-off-by: Andrew Xia <axia@meta.com>

[gpt-oss] Add IncompleteDetails to ResponsesRepsonse (#24561)
Signed-off-by: Andrew Xia <axia@meta.com>
25aba2b6 · Andrew Xia · GitHub · 94b03f88 · 25aba2b6 · 25aba2b6
Unverified Commit 25aba2b6 authored Sep 15, 2025 by Andrew Xia Committed by GitHub Sep 15, 2025
7 changed files
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
    assert response.status == "completed"


+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_max_tokens(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the first paragraph of Moby Dick?",
+        reasoning={"effort": "low"},
+        max_output_tokens=30,
+    )
+    assert response is not None
+    assert response.status == "incomplete"
+    assert response.incomplete_details.reason == "max_output_tokens"
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_chat(client: OpenAI, model_name: str):

--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext):
        available_tools: list[str],
    ):
        self._messages = messages
+        self.finish_reason: Optional[str] = None
        self.available_tools = available_tools
        self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
        self.called_tools: set[str] = set()
@@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext):
        if self.parser.current_channel in {"analysis", "commentary"}:
            self.num_reasoning_tokens += 1

-    def append_output(self, output) -> None:
+    def append_output(self, output: Union[RequestOutput,
+                                          list[Message]]) -> None:
        if isinstance(output, RequestOutput):
            output_token_ids = output.outputs[0].token_ids
            self.parser = get_streamable_parser_for_assistant()
@@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext):
            # Move current turn to previous turn for next turn's calculations
            self.previous_turn = self.current_turn.copy()
            output_msgs = self.parser.messages
+            # The responses finish reason is set in the last message
+            self.finish_reason = output.outputs[0].finish_reason
        else:
            # Tool output.
            output_msgs = output
@@ -157,18 +161,18 @@ class HarmonyContext(ConversationContext):

    def _update_prefill_token_usage(self, output: RequestOutput) -> None:
        """Update token usage statistics for the prefill phase of generation.
-        
+
        The prefill phase processes the input prompt tokens. This method:
        1. Counts the prompt tokens for this turn
        2. Calculates tool output tokens for multi-turn conversations
        3. Updates cached token counts
        4. Tracks state for next turn calculations
-        
+
        Tool output tokens are calculated as:
-        current_prompt_tokens - last_turn_prompt_tokens - 
+        current_prompt_tokens - last_turn_prompt_tokens -
        last_turn_output_tokens
        This represents tokens added between turns (typically tool responses).
-        
+
        Args:
            output: The RequestOutput containing prompt token information
        """
@@ -214,18 +218,18 @@ class HarmonyContext(ConversationContext):

    def _update_decode_token_usage(self, output: RequestOutput) -> int:
        """Update token usage statistics for the decode phase of generation.
-        
+
        The decode phase processes the generated output tokens. This method:
        1. Counts output tokens from all completion outputs
        2. Updates the total output token count
        3. Tracks tokens generated in the current turn
-        
+
        In streaming mode, this is called for each token generated.
        In non-streaming mode, this is called once with all output tokens.
-        
+
        Args:
            output: The RequestOutput containing generated token information
-            
+
        Returns:
            int: Number of output tokens processed in this call
        """
@@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext):
    def messages(self) -> list:
        return self.parser.messages

-    def append_output(self, output) -> None:
+    def append_output(self, output: Union[RequestOutput,
+                                          list[Message]]) -> None:
        if isinstance(output, RequestOutput):
            # append_output is called for each output token in streaming case,
            # so we only want to add the prompt tokens once for each message.

--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -387,7 +387,9 @@ def parse_remaining_state(
            id=f"msg_{random_uuid()}",
            content=[output_text],
            role="assistant",
-            status="completed",
+            # if the parser still has messages (ie if the generator got cut
+            # abruptly), this should be incomplete
+            status="incomplete",
            type="message",
        )
        return [text_item]

--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -30,7 +30,7 @@ except ImportError:  # For newer openai versions (>= 1.100.0)
    from openai.types.responses import (ResponseFormatTextConfig as
                                        ResponseTextConfig)

-from openai.types.responses.response import ToolChoice
+from openai.types.responses.response import IncompleteDetails, ToolChoice
 from openai.types.responses.tool import Tool
 from openai.types.shared import Metadata, Reasoning
 from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
    created_at: int = Field(default_factory=lambda: int(time.time()))
    # error: Optional[ResponseError] = None
-    # incomplete_details: Optional[IncompleteDetails] = None
+    incomplete_details: Optional[IncompleteDetails] = None
    instructions: Optional[str] = None
    metadata: Optional[Metadata] = None
    model: str
@@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel):
        status: ResponseStatus,
        usage: Optional[ResponseUsage] = None,
    ) -> "ResponsesResponse":
+
+        incomplete_details: Optional[IncompleteDetails] = None
+        if status == 'incomplete':
+            incomplete_details = IncompleteDetails(reason='max_output_tokens')
+        # TODO: implement the other reason for incomplete_details,
+        # which is content_filter
+        # incomplete_details = IncompleteDetails(reason='content_filter')
+
        return cls(
            id=request.request_id,
            created_at=created_time,
+            incomplete_details=incomplete_details,
            instructions=request.instructions,
            metadata=request.metadata,
            model=model_name,
@@ -2109,7 +2118,7 @@ class DetokenizeResponse(OpenAIBaseModel):

 class TokenizerInfoResponse(OpenAIBaseModel):
    """
-    Response containing tokenizer configuration 
+    Response containing tokenizer configuration
    equivalent to tokenizer_config.json
    """

@@ -2199,7 +2208,7 @@ class TranscriptionRequest(OpenAIBaseModel):
    to_language: Optional[str] = None
    """The language of the output audio we transcribe to.

-    Please note that this is not currently used by supported models at this 
+    Please note that this is not currently used by supported models at this
    time, but it is a placeholder for future use, matching translation api.
    """


--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent,
                                    ResponseReasoningItem,
                                    ResponseReasoningTextDeltaEvent,
                                    ResponseReasoningTextDoneEvent,
-                                    response_text_delta_event)
+                                    ResponseStatus, response_text_delta_event)
 from openai.types.responses.response_output_text import (Logprob,
                                                         LogprobTopLogprob)
 # yapf: enable
@@ -461,10 +461,22 @@ class OpenAIServingResponses(OpenAIServing):
                # TODO: Use a vllm-specific Validation Error
                return self.create_error_response(str(e))

+        # NOTE: Implementation of stauts is still WIP, but for now
+        # we guarantee that if the status is not "completed", it is accurate.
+        # "completed" is implemented as the "catch-all" for now.
+        status: ResponseStatus = "completed"
+
        if self.use_harmony:
            assert isinstance(context, HarmonyContext)
            output = self._make_response_output_items_with_harmony(context)
            num_tool_output_tokens = context.num_tool_output_tokens
+            if len(output) > 0:
+                if context.finish_reason == "length":
+                    status = "incomplete"
+                elif context.finish_reason == "abort":
+                    status = "cancelled"
+            else:
+                status = "incomplete"
        else:
            assert isinstance(context, SimpleContext)
            final_res = context.last_output
@@ -501,7 +513,7 @@ class OpenAIServingResponses(OpenAIServing):
            model_name=model_name,
            created_time=created_time,
            output=output,
-            status="completed",
+            status=status,
            usage=usage,
        )

@@ -658,7 +670,7 @@ class OpenAIServingResponses(OpenAIServing):
        self,
        context: HarmonyContext,
    ) -> list[ResponseOutputItem]:
-        output_items = []
+        output_items: list[ResponseOutputItem] = []
        num_init_messages = context.num_init_messages
        for msg in context.messages[num_init_messages:]:
            output_items.extend(parse_output_message(msg))

--- a/vllm/v1/core/sched/utils.py
+++ b/vllm/v1/core/sched/utils.py
@@ -10,19 +10,19 @@ from vllm.v1.request import Request, RequestStatus

 def remove_all(lst: list, items_to_remove: set) -> list:
    """Remove all items from a list that are in the items_to_remove set.
-    
+
    This method optimizes for the common case of removing a single item,
    falling back to list comprehension for multiple items.
-    
+
    Args:
        lst: The list to remove items from
        items_to_remove: Set of items to remove
-    
+
    Returns:
        Either the modified original list (for single item removal) or
        a new list (for multiple item removal). Callers should use the
        returned value.
-    
+
    Note:
        For single item removal, this modifies the original list in-place
        and returns it. For multiple items, it creates and returns a new list.

--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -373,17 +373,17 @@ class OutputProcessor:
        1) Compute stats for logging
        2) Detokenize
        3) Create and handle RequestOutput objects:
-            * If there is a queue (for usage with AsyncLLM), 
+            * If there is a queue (for usage with AsyncLLM),
              put the RequestOutput objects into the queue for
              handling by the per-request generate() tasks.

-            * If there is no queue (for usage with LLMEngine), 
+            * If there is no queue (for usage with LLMEngine),
              return a list of RequestOutput objects.

        NOTE FOR DEVELOPERS

        vLLM V1 minimizes the number of python loops over the full
-        batch to ensure system overheads are minimized. This is the 
+        batch to ensure system overheads are minimized. This is the
        only function that should loop over EngineCoreOutputs.

        If you need to touch every element of the batch, do it from