Unverified Commit 25aba2b6 authored by Andrew Xia's avatar Andrew Xia Committed by GitHub
Browse files

[gpt-oss] Add IncompleteDetails to ResponsesRepsonse (#24561)


Signed-off-by: default avatarAndrew Xia <axia@meta.com>
parent 94b03f88
...@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str): ...@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
assert response.status == "completed" assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="What is the first paragraph of Moby Dick?",
reasoning={"effort": "low"},
max_output_tokens=30,
)
assert response is not None
assert response.status == "incomplete"
assert response.incomplete_details.reason == "max_output_tokens"
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat(client: OpenAI, model_name: str): async def test_chat(client: OpenAI, model_name: str):
......
...@@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext): ...@@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext):
available_tools: list[str], available_tools: list[str],
): ):
self._messages = messages self._messages = messages
self.finish_reason: Optional[str] = None
self.available_tools = available_tools self.available_tools = available_tools
self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {} self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
self.called_tools: set[str] = set() self.called_tools: set[str] = set()
...@@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext): ...@@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext):
if self.parser.current_channel in {"analysis", "commentary"}: if self.parser.current_channel in {"analysis", "commentary"}:
self.num_reasoning_tokens += 1 self.num_reasoning_tokens += 1
def append_output(self, output) -> None: def append_output(self, output: Union[RequestOutput,
list[Message]]) -> None:
if isinstance(output, RequestOutput): if isinstance(output, RequestOutput):
output_token_ids = output.outputs[0].token_ids output_token_ids = output.outputs[0].token_ids
self.parser = get_streamable_parser_for_assistant() self.parser = get_streamable_parser_for_assistant()
...@@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext): ...@@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext):
# Move current turn to previous turn for next turn's calculations # Move current turn to previous turn for next turn's calculations
self.previous_turn = self.current_turn.copy() self.previous_turn = self.current_turn.copy()
output_msgs = self.parser.messages output_msgs = self.parser.messages
# The responses finish reason is set in the last message
self.finish_reason = output.outputs[0].finish_reason
else: else:
# Tool output. # Tool output.
output_msgs = output output_msgs = output
...@@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext): ...@@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext):
def messages(self) -> list: def messages(self) -> list:
return self.parser.messages return self.parser.messages
def append_output(self, output) -> None: def append_output(self, output: Union[RequestOutput,
list[Message]]) -> None:
if isinstance(output, RequestOutput): if isinstance(output, RequestOutput):
# append_output is called for each output token in streaming case, # append_output is called for each output token in streaming case,
# so we only want to add the prompt tokens once for each message. # so we only want to add the prompt tokens once for each message.
......
...@@ -387,7 +387,9 @@ def parse_remaining_state( ...@@ -387,7 +387,9 @@ def parse_remaining_state(
id=f"msg_{random_uuid()}", id=f"msg_{random_uuid()}",
content=[output_text], content=[output_text],
role="assistant", role="assistant",
status="completed", # if the parser still has messages (ie if the generator got cut
# abruptly), this should be incomplete
status="incomplete",
type="message", type="message",
) )
return [text_item] return [text_item]
......
...@@ -30,7 +30,7 @@ except ImportError: # For newer openai versions (>= 1.100.0) ...@@ -30,7 +30,7 @@ except ImportError: # For newer openai versions (>= 1.100.0)
from openai.types.responses import (ResponseFormatTextConfig as from openai.types.responses import (ResponseFormatTextConfig as
ResponseTextConfig) ResponseTextConfig)
from openai.types.responses.response import ToolChoice from openai.types.responses.response import IncompleteDetails, ToolChoice
from openai.types.responses.tool import Tool from openai.types.responses.tool import Tool
from openai.types.shared import Metadata, Reasoning from openai.types.shared import Metadata, Reasoning
from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
...@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel): ...@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel):
id: str = Field(default_factory=lambda: f"resp_{random_uuid()}") id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
created_at: int = Field(default_factory=lambda: int(time.time())) created_at: int = Field(default_factory=lambda: int(time.time()))
# error: Optional[ResponseError] = None # error: Optional[ResponseError] = None
# incomplete_details: Optional[IncompleteDetails] = None incomplete_details: Optional[IncompleteDetails] = None
instructions: Optional[str] = None instructions: Optional[str] = None
metadata: Optional[Metadata] = None metadata: Optional[Metadata] = None
model: str model: str
...@@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel): ...@@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel):
status: ResponseStatus, status: ResponseStatus,
usage: Optional[ResponseUsage] = None, usage: Optional[ResponseUsage] = None,
) -> "ResponsesResponse": ) -> "ResponsesResponse":
incomplete_details: Optional[IncompleteDetails] = None
if status == 'incomplete':
incomplete_details = IncompleteDetails(reason='max_output_tokens')
# TODO: implement the other reason for incomplete_details,
# which is content_filter
# incomplete_details = IncompleteDetails(reason='content_filter')
return cls( return cls(
id=request.request_id, id=request.request_id,
created_at=created_time, created_at=created_time,
incomplete_details=incomplete_details,
instructions=request.instructions, instructions=request.instructions,
metadata=request.metadata, metadata=request.metadata,
model=model_name, model=model_name,
......
...@@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent, ...@@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent,
ResponseReasoningItem, ResponseReasoningItem,
ResponseReasoningTextDeltaEvent, ResponseReasoningTextDeltaEvent,
ResponseReasoningTextDoneEvent, ResponseReasoningTextDoneEvent,
response_text_delta_event) ResponseStatus, response_text_delta_event)
from openai.types.responses.response_output_text import (Logprob, from openai.types.responses.response_output_text import (Logprob,
LogprobTopLogprob) LogprobTopLogprob)
# yapf: enable # yapf: enable
...@@ -461,10 +461,22 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -461,10 +461,22 @@ class OpenAIServingResponses(OpenAIServing):
# TODO: Use a vllm-specific Validation Error # TODO: Use a vllm-specific Validation Error
return self.create_error_response(str(e)) return self.create_error_response(str(e))
# NOTE: Implementation of stauts is still WIP, but for now
# we guarantee that if the status is not "completed", it is accurate.
# "completed" is implemented as the "catch-all" for now.
status: ResponseStatus = "completed"
if self.use_harmony: if self.use_harmony:
assert isinstance(context, HarmonyContext) assert isinstance(context, HarmonyContext)
output = self._make_response_output_items_with_harmony(context) output = self._make_response_output_items_with_harmony(context)
num_tool_output_tokens = context.num_tool_output_tokens num_tool_output_tokens = context.num_tool_output_tokens
if len(output) > 0:
if context.finish_reason == "length":
status = "incomplete"
elif context.finish_reason == "abort":
status = "cancelled"
else:
status = "incomplete"
else: else:
assert isinstance(context, SimpleContext) assert isinstance(context, SimpleContext)
final_res = context.last_output final_res = context.last_output
...@@ -501,7 +513,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -501,7 +513,7 @@ class OpenAIServingResponses(OpenAIServing):
model_name=model_name, model_name=model_name,
created_time=created_time, created_time=created_time,
output=output, output=output,
status="completed", status=status,
usage=usage, usage=usage,
) )
...@@ -658,7 +670,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -658,7 +670,7 @@ class OpenAIServingResponses(OpenAIServing):
self, self,
context: HarmonyContext, context: HarmonyContext,
) -> list[ResponseOutputItem]: ) -> list[ResponseOutputItem]:
output_items = [] output_items: list[ResponseOutputItem] = []
num_init_messages = context.num_init_messages num_init_messages = context.num_init_messages
for msg in context.messages[num_init_messages:]: for msg in context.messages[num_init_messages:]:
output_items.extend(parse_output_message(msg)) output_items.extend(parse_output_message(msg))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment