Unverified commit 0d0c929f, authored by Andrew Xia and committed by GitHub
Browse files

[responsesAPI][8] input/output messages for ResponsesParser (#30158)


Signed-off-by: Andrew Xia <axia@fb.com>
Signed-off-by: Andrew Xia <axia@meta.com>
Co-authored-by: Andrew Xia <axia@fb.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
parent e94384bb
...@@ -165,6 +165,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str): ...@@ -165,6 +165,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
model=model_name, model=model_name,
input="What is 13 * 24? Use python to calculate the result.", input="What is 13 * 24? Use python to calculate the result.",
tools=[{"type": "code_interpreter", "container": {"type": "auto"}}], tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
extra_body={"enable_response_messages": True},
temperature=0.0, temperature=0.0,
) )
...@@ -178,3 +179,8 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str): ...@@ -178,3 +179,8 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
# make sure the correct math is in the final output # make sure the correct math is in the final output
assert response.output[3].type == "message" assert response.output[3].type == "message"
assert "312" in response.output[3].content[0].text assert "312" in response.output[3].content[0].text
# test raw input_messages / output_messages
assert len(response.input_messages) == 1
assert len(response.output_messages) == 3
assert "312" in response.output_messages[2]["message"]
...@@ -297,12 +297,40 @@ class ParsableContext(ConversationContext): ...@@ -297,12 +297,40 @@ class ParsableContext(ConversationContext):
self.chat_template = chat_template self.chat_template = chat_template
self.chat_template_content_format = chat_template_content_format self.chat_template_content_format = chat_template_content_format
self.input_messages: list[ResponseRawMessageAndToken] = []
self.output_messages: list[ResponseRawMessageAndToken] = []
def append_output(self, output: RequestOutput) -> None:
    """Fold one engine output into the context's counters and parser.

    Updates the prompt/cached/output token counts from ``output`` and
    hands the first completion to ``self.parser``. When the request sets
    ``enable_response_messages``, the raw prompt and the raw completion
    (text plus token ids) are recorded as well.

    Args:
        output: The engine result to record; only ``output.outputs[0]``
            is consumed.
    """
    self.num_prompt_tokens = len(output.prompt_token_ids or [])
    self.num_cached_tokens = output.num_cached_tokens or 0

    completion = output.outputs[0]
    self.num_output_tokens += len(completion.token_ids or [])
    self.parser.process(completion)

    # Raw messages are kept only on request, to save memory.
    if not self.request.enable_response_messages:
        return

    prompt_record = ResponseRawMessageAndToken(
        message=output.prompt or "",
        tokens=output.prompt_token_ids or [],
    )
    # The first prompt seen is the request input; any later prompt is
    # recorded in the output message list instead.
    prompt_target = (
        self.output_messages if self.input_messages else self.input_messages
    )
    prompt_target.append(prompt_record)

    # The generated completion is always recorded as an output message.
    self.output_messages.append(
        ResponseRawMessageAndToken(
            message=completion.text,
            tokens=completion.token_ids,
        )
    )
def append_tool_output(self, output: list[ResponseInputOutputItem]) -> None: def append_tool_output(self, output: list[ResponseInputOutputItem]) -> None:
self.parser.response_messages.extend(output) self.parser.response_messages.extend(output)
......
...@@ -3,7 +3,11 @@ ...@@ -3,7 +3,11 @@
import logging import logging
from collections.abc import Callable from collections.abc import Callable
from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
from openai.types.responses.response_output_item import McpCall
from openai.types.responses.response_output_message import ResponseOutputMessage from openai.types.responses.response_output_message import ResponseOutputMessage
from openai.types.responses.response_output_text import ResponseOutputText from openai.types.responses.response_output_text import ResponseOutputText
from openai.types.responses.response_reasoning_item import ( from openai.types.responses.response_reasoning_item import (
...@@ -11,6 +15,7 @@ from openai.types.responses.response_reasoning_item import ( ...@@ -11,6 +15,7 @@ from openai.types.responses.response_reasoning_item import (
ResponseReasoningItem, ResponseReasoningItem,
) )
from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest
from vllm.outputs import CompletionOutput from vllm.outputs import CompletionOutput
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
...@@ -111,6 +116,37 @@ class ResponsesParser: ...@@ -111,6 +116,37 @@ class ResponsesParser:
return self return self
def make_response_output_items_from_parsable_context(
    self,
) -> list[ResponseOutputItem]:
    """Build the response output items from the parser's messages.

    Only the messages after the first ``self.num_init_messages`` entries of
    ``self.response_messages`` are considered. Messages are copied through
    unchanged, except that a function tool call immediately followed by its
    tool call output is replaced by a single completed ``McpCall`` carrying
    that output.

    Returns:
        The output items, in message order.

    Raises:
        ValueError: If a tool call output appears before any other item.
    """
    items: list[ResponseOutputItem] = []
    for message in self.response_messages[self.num_init_messages :]:
        if not isinstance(message, ResponseFunctionToolCallOutputItem):
            items.append(message)
            continue

        if not items:
            raise ValueError(
                "Cannot have a FunctionToolCallOutput before FunctionToolCall."
            )

        previous = items[-1]
        if not isinstance(previous, ResponseFunctionToolCall):
            # NOTE(review): the tool output is dropped when the preceding
            # item is not a function tool call -- confirm this is intended.
            continue

        # Replace the call with an MCP call that also carries its output.
        items[-1] = McpCall(
            id=f"{MCP_PREFIX}{random_uuid()}",
            arguments=previous.arguments,
            name=previous.name,
            server_label=previous.name,  # TODO: store the server label
            type="mcp_call",
            status="completed",
            output=message.output,
            # TODO: support error output
        )
    return items
def get_responses_parser_for_simple_context( def get_responses_parser_for_simple_context(
*, *,
......
...@@ -104,7 +104,6 @@ from vllm.entrypoints.responses_utils import ( ...@@ -104,7 +104,6 @@ from vllm.entrypoints.responses_utils import (
construct_input_messages, construct_input_messages,
construct_tool_dicts, construct_tool_dicts,
extract_tool_types, extract_tool_types,
make_response_output_items_from_parsable_context,
) )
from vllm.entrypoints.tool_server import ToolServer from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
...@@ -658,17 +657,11 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -658,17 +657,11 @@ class OpenAIServingResponses(OpenAIServing):
else: else:
status = "incomplete" status = "incomplete"
elif isinstance(context, ParsableContext): elif isinstance(context, ParsableContext):
response_messages = context.parser.response_messages[ output = context.parser.make_response_output_items_from_parsable_context()
context.parser.num_init_messages :
]
output = make_response_output_items_from_parsable_context(response_messages)
# TODO: context for non-gptoss models doesn't use messages
# so we can't get them out yet
if request.enable_response_messages: if request.enable_response_messages:
raise NotImplementedError( input_messages = context.input_messages
"enable_response_messages is currently only supported for gpt-oss" output_messages = context.output_messages
)
# TODO: Calculate usage. # TODO: Calculate usage.
# assert final_res.prompt_token_ids is not None # assert final_res.prompt_token_ids is not None
......
...@@ -16,7 +16,6 @@ from openai.types.responses.response import ToolChoice ...@@ -16,7 +16,6 @@ from openai.types.responses.response import ToolChoice
from openai.types.responses.response_function_tool_call_output_item import ( from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem, ResponseFunctionToolCallOutputItem,
) )
from openai.types.responses.response_output_item import McpCall
from openai.types.responses.response_output_message import ResponseOutputMessage from openai.types.responses.response_output_message import ResponseOutputMessage
from openai.types.responses.response_reasoning_item import ResponseReasoningItem from openai.types.responses.response_reasoning_item import ResponseReasoningItem
from openai.types.responses.tool import Tool from openai.types.responses.tool import Tool
...@@ -27,38 +26,6 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -27,38 +26,6 @@ from vllm.entrypoints.openai.protocol import (
ChatCompletionMessageParam, ChatCompletionMessageParam,
ResponseInputOutputItem, ResponseInputOutputItem,
) )
from vllm.utils import random_uuid
def make_response_output_items_from_parsable_context(
    response_messages: list[ResponseInputOutputItem],
) -> list[ResponseOutputItem]:
    """Convert parsed response messages into response output items.

    Messages are copied through unchanged, except that a function tool call
    immediately followed by its tool call output is replaced by a single
    completed ``McpCall`` carrying that output.

    Args:
        response_messages: The messages to convert, in order.

    Returns:
        The output items, in message order.

    Raises:
        ValueError: If a tool call output appears before any other item.
    """
    collected: list[ResponseOutputItem] = []
    for item in response_messages:
        is_tool_output = isinstance(item, ResponseFunctionToolCallOutputItem)
        if is_tool_output and not collected:
            raise ValueError(
                "Cannot have a FunctionToolCallOutput before FunctionToolCall."
            )
        if not is_tool_output:
            collected.append(item)
        elif isinstance(collected[-1], ResponseFunctionToolCall):
            call = collected[-1]
            # Replace the call with an MCP call that also carries its output.
            collected[-1] = McpCall(
                id=f"{MCP_PREFIX}{random_uuid()}",
                arguments=call.arguments,
                name=call.name,
                server_label=call.name,  # TODO: store the server label
                type=f"{MCP_PREFIX}call",
                status="completed",
                output=item.output,
                # TODO: support error output
            )
    return collected
def construct_input_messages( def construct_input_messages(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment