Unverified Commit 25aba2b6 authored by Andrew Xia's avatar Andrew Xia Committed by GitHub
Browse files

[gpt-oss] Add IncompleteDetails to ResponsesRepsonse (#24561)


Signed-off-by: default avatarAndrew Xia <axia@meta.com>
parent 94b03f88
...@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str): ...@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
assert response.status == "completed" assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="What is the first paragraph of Moby Dick?",
reasoning={"effort": "low"},
max_output_tokens=30,
)
assert response is not None
assert response.status == "incomplete"
assert response.incomplete_details.reason == "max_output_tokens"
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat(client: OpenAI, model_name: str): async def test_chat(client: OpenAI, model_name: str):
......
...@@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext): ...@@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext):
available_tools: list[str], available_tools: list[str],
): ):
self._messages = messages self._messages = messages
self.finish_reason: Optional[str] = None
self.available_tools = available_tools self.available_tools = available_tools
self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {} self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
self.called_tools: set[str] = set() self.called_tools: set[str] = set()
...@@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext): ...@@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext):
if self.parser.current_channel in {"analysis", "commentary"}: if self.parser.current_channel in {"analysis", "commentary"}:
self.num_reasoning_tokens += 1 self.num_reasoning_tokens += 1
def append_output(self, output) -> None: def append_output(self, output: Union[RequestOutput,
list[Message]]) -> None:
if isinstance(output, RequestOutput): if isinstance(output, RequestOutput):
output_token_ids = output.outputs[0].token_ids output_token_ids = output.outputs[0].token_ids
self.parser = get_streamable_parser_for_assistant() self.parser = get_streamable_parser_for_assistant()
...@@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext): ...@@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext):
# Move current turn to previous turn for next turn's calculations # Move current turn to previous turn for next turn's calculations
self.previous_turn = self.current_turn.copy() self.previous_turn = self.current_turn.copy()
output_msgs = self.parser.messages output_msgs = self.parser.messages
# The responses finish reason is set in the last message
self.finish_reason = output.outputs[0].finish_reason
else: else:
# Tool output. # Tool output.
output_msgs = output output_msgs = output
...@@ -157,18 +161,18 @@ class HarmonyContext(ConversationContext): ...@@ -157,18 +161,18 @@ class HarmonyContext(ConversationContext):
def _update_prefill_token_usage(self, output: RequestOutput) -> None: def _update_prefill_token_usage(self, output: RequestOutput) -> None:
"""Update token usage statistics for the prefill phase of generation. """Update token usage statistics for the prefill phase of generation.
The prefill phase processes the input prompt tokens. This method: The prefill phase processes the input prompt tokens. This method:
1. Counts the prompt tokens for this turn 1. Counts the prompt tokens for this turn
2. Calculates tool output tokens for multi-turn conversations 2. Calculates tool output tokens for multi-turn conversations
3. Updates cached token counts 3. Updates cached token counts
4. Tracks state for next turn calculations 4. Tracks state for next turn calculations
Tool output tokens are calculated as: Tool output tokens are calculated as:
current_prompt_tokens - last_turn_prompt_tokens - current_prompt_tokens - last_turn_prompt_tokens -
last_turn_output_tokens last_turn_output_tokens
This represents tokens added between turns (typically tool responses). This represents tokens added between turns (typically tool responses).
Args: Args:
output: The RequestOutput containing prompt token information output: The RequestOutput containing prompt token information
""" """
...@@ -214,18 +218,18 @@ class HarmonyContext(ConversationContext): ...@@ -214,18 +218,18 @@ class HarmonyContext(ConversationContext):
def _update_decode_token_usage(self, output: RequestOutput) -> int: def _update_decode_token_usage(self, output: RequestOutput) -> int:
"""Update token usage statistics for the decode phase of generation. """Update token usage statistics for the decode phase of generation.
The decode phase processes the generated output tokens. This method: The decode phase processes the generated output tokens. This method:
1. Counts output tokens from all completion outputs 1. Counts output tokens from all completion outputs
2. Updates the total output token count 2. Updates the total output token count
3. Tracks tokens generated in the current turn 3. Tracks tokens generated in the current turn
In streaming mode, this is called for each token generated. In streaming mode, this is called for each token generated.
In non-streaming mode, this is called once with all output tokens. In non-streaming mode, this is called once with all output tokens.
Args: Args:
output: The RequestOutput containing generated token information output: The RequestOutput containing generated token information
Returns: Returns:
int: Number of output tokens processed in this call int: Number of output tokens processed in this call
""" """
...@@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext): ...@@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext):
def messages(self) -> list: def messages(self) -> list:
return self.parser.messages return self.parser.messages
def append_output(self, output) -> None: def append_output(self, output: Union[RequestOutput,
list[Message]]) -> None:
if isinstance(output, RequestOutput): if isinstance(output, RequestOutput):
# append_output is called for each output token in streaming case, # append_output is called for each output token in streaming case,
# so we only want to add the prompt tokens once for each message. # so we only want to add the prompt tokens once for each message.
......
...@@ -387,7 +387,9 @@ def parse_remaining_state( ...@@ -387,7 +387,9 @@ def parse_remaining_state(
id=f"msg_{random_uuid()}", id=f"msg_{random_uuid()}",
content=[output_text], content=[output_text],
role="assistant", role="assistant",
status="completed", # if the parser still has messages (ie if the generator got cut
# abruptly), this should be incomplete
status="incomplete",
type="message", type="message",
) )
return [text_item] return [text_item]
......
...@@ -30,7 +30,7 @@ except ImportError: # For newer openai versions (>= 1.100.0) ...@@ -30,7 +30,7 @@ except ImportError: # For newer openai versions (>= 1.100.0)
from openai.types.responses import (ResponseFormatTextConfig as from openai.types.responses import (ResponseFormatTextConfig as
ResponseTextConfig) ResponseTextConfig)
from openai.types.responses.response import ToolChoice from openai.types.responses.response import IncompleteDetails, ToolChoice
from openai.types.responses.tool import Tool from openai.types.responses.tool import Tool
from openai.types.shared import Metadata, Reasoning from openai.types.shared import Metadata, Reasoning
from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
...@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel): ...@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel):
id: str = Field(default_factory=lambda: f"resp_{random_uuid()}") id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
created_at: int = Field(default_factory=lambda: int(time.time())) created_at: int = Field(default_factory=lambda: int(time.time()))
# error: Optional[ResponseError] = None # error: Optional[ResponseError] = None
# incomplete_details: Optional[IncompleteDetails] = None incomplete_details: Optional[IncompleteDetails] = None
instructions: Optional[str] = None instructions: Optional[str] = None
metadata: Optional[Metadata] = None metadata: Optional[Metadata] = None
model: str model: str
...@@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel): ...@@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel):
status: ResponseStatus, status: ResponseStatus,
usage: Optional[ResponseUsage] = None, usage: Optional[ResponseUsage] = None,
) -> "ResponsesResponse": ) -> "ResponsesResponse":
incomplete_details: Optional[IncompleteDetails] = None
if status == 'incomplete':
incomplete_details = IncompleteDetails(reason='max_output_tokens')
# TODO: implement the other reason for incomplete_details,
# which is content_filter
# incomplete_details = IncompleteDetails(reason='content_filter')
return cls( return cls(
id=request.request_id, id=request.request_id,
created_at=created_time, created_at=created_time,
incomplete_details=incomplete_details,
instructions=request.instructions, instructions=request.instructions,
metadata=request.metadata, metadata=request.metadata,
model=model_name, model=model_name,
...@@ -2109,7 +2118,7 @@ class DetokenizeResponse(OpenAIBaseModel): ...@@ -2109,7 +2118,7 @@ class DetokenizeResponse(OpenAIBaseModel):
class TokenizerInfoResponse(OpenAIBaseModel): class TokenizerInfoResponse(OpenAIBaseModel):
""" """
Response containing tokenizer configuration Response containing tokenizer configuration
equivalent to tokenizer_config.json equivalent to tokenizer_config.json
""" """
...@@ -2199,7 +2208,7 @@ class TranscriptionRequest(OpenAIBaseModel): ...@@ -2199,7 +2208,7 @@ class TranscriptionRequest(OpenAIBaseModel):
to_language: Optional[str] = None to_language: Optional[str] = None
"""The language of the output audio we transcribe to. """The language of the output audio we transcribe to.
Please note that this is not currently used by supported models at this Please note that this is not currently used by supported models at this
time, but it is a placeholder for future use, matching translation api. time, but it is a placeholder for future use, matching translation api.
""" """
......
...@@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent, ...@@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent,
ResponseReasoningItem, ResponseReasoningItem,
ResponseReasoningTextDeltaEvent, ResponseReasoningTextDeltaEvent,
ResponseReasoningTextDoneEvent, ResponseReasoningTextDoneEvent,
response_text_delta_event) ResponseStatus, response_text_delta_event)
from openai.types.responses.response_output_text import (Logprob, from openai.types.responses.response_output_text import (Logprob,
LogprobTopLogprob) LogprobTopLogprob)
# yapf: enable # yapf: enable
...@@ -461,10 +461,22 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -461,10 +461,22 @@ class OpenAIServingResponses(OpenAIServing):
# TODO: Use a vllm-specific Validation Error # TODO: Use a vllm-specific Validation Error
return self.create_error_response(str(e)) return self.create_error_response(str(e))
# NOTE: Implementation of stauts is still WIP, but for now
# we guarantee that if the status is not "completed", it is accurate.
# "completed" is implemented as the "catch-all" for now.
status: ResponseStatus = "completed"
if self.use_harmony: if self.use_harmony:
assert isinstance(context, HarmonyContext) assert isinstance(context, HarmonyContext)
output = self._make_response_output_items_with_harmony(context) output = self._make_response_output_items_with_harmony(context)
num_tool_output_tokens = context.num_tool_output_tokens num_tool_output_tokens = context.num_tool_output_tokens
if len(output) > 0:
if context.finish_reason == "length":
status = "incomplete"
elif context.finish_reason == "abort":
status = "cancelled"
else:
status = "incomplete"
else: else:
assert isinstance(context, SimpleContext) assert isinstance(context, SimpleContext)
final_res = context.last_output final_res = context.last_output
...@@ -501,7 +513,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -501,7 +513,7 @@ class OpenAIServingResponses(OpenAIServing):
model_name=model_name, model_name=model_name,
created_time=created_time, created_time=created_time,
output=output, output=output,
status="completed", status=status,
usage=usage, usage=usage,
) )
...@@ -658,7 +670,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -658,7 +670,7 @@ class OpenAIServingResponses(OpenAIServing):
self, self,
context: HarmonyContext, context: HarmonyContext,
) -> list[ResponseOutputItem]: ) -> list[ResponseOutputItem]:
output_items = [] output_items: list[ResponseOutputItem] = []
num_init_messages = context.num_init_messages num_init_messages = context.num_init_messages
for msg in context.messages[num_init_messages:]: for msg in context.messages[num_init_messages:]:
output_items.extend(parse_output_message(msg)) output_items.extend(parse_output_message(msg))
......
...@@ -10,19 +10,19 @@ from vllm.v1.request import Request, RequestStatus ...@@ -10,19 +10,19 @@ from vllm.v1.request import Request, RequestStatus
def remove_all(lst: list, items_to_remove: set) -> list: def remove_all(lst: list, items_to_remove: set) -> list:
"""Remove all items from a list that are in the items_to_remove set. """Remove all items from a list that are in the items_to_remove set.
This method optimizes for the common case of removing a single item, This method optimizes for the common case of removing a single item,
falling back to list comprehension for multiple items. falling back to list comprehension for multiple items.
Args: Args:
lst: The list to remove items from lst: The list to remove items from
items_to_remove: Set of items to remove items_to_remove: Set of items to remove
Returns: Returns:
Either the modified original list (for single item removal) or Either the modified original list (for single item removal) or
a new list (for multiple item removal). Callers should use the a new list (for multiple item removal). Callers should use the
returned value. returned value.
Note: Note:
For single item removal, this modifies the original list in-place For single item removal, this modifies the original list in-place
and returns it. For multiple items, it creates and returns a new list. and returns it. For multiple items, it creates and returns a new list.
......
...@@ -373,17 +373,17 @@ class OutputProcessor: ...@@ -373,17 +373,17 @@ class OutputProcessor:
1) Compute stats for logging 1) Compute stats for logging
2) Detokenize 2) Detokenize
3) Create and handle RequestOutput objects: 3) Create and handle RequestOutput objects:
* If there is a queue (for usage with AsyncLLM), * If there is a queue (for usage with AsyncLLM),
put the RequestOutput objects into the queue for put the RequestOutput objects into the queue for
handling by the per-request generate() tasks. handling by the per-request generate() tasks.
* If there is no queue (for usage with LLMEngine), * If there is no queue (for usage with LLMEngine),
return a list of RequestOutput objects. return a list of RequestOutput objects.
NOTE FOR DEVELOPERS NOTE FOR DEVELOPERS
vLLM V1 minimizes the number of python loops over the full vLLM V1 minimizes the number of python loops over the full
batch to ensure system overheads are minimized. This is the batch to ensure system overheads are minimized. This is the
only function that should loop over EngineCoreOutputs. only function that should loop over EngineCoreOutputs.
If you need to touch every element of the batch, do it from If you need to touch every element of the batch, do it from
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment