[Frontend] refactor harmony utils output message parsing (#29820)

Signed-off-by: Daniel Salib <danielsalib@meta.com>

[Frontend] refactor harmony utils output message parsing (#29820)
Signed-off-by: Daniel Salib <danielsalib@meta.com>
404fc4bf · daniel-salib · GitHub · 82a64b3d · 404fc4bf
Unverified commit 404fc4bf, authored Dec 03, 2025 by daniel-salib; committed by GitHub on Dec 04, 2025
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 117 additions and 99 deletions

vllm/entrypoints/harmony_utils.py vllm/entrypoints/harmony_utils.py +117 -99

No files found.
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -328,32 +328,16 @@ def render_for_completion(messages: list[Message]) -> list[int]:
    return token_ids
-def parse_output_message(message: Message) -> list[ResponseOutputItem]:
+def _parse_browser_tool_call(message: Message, recipient: str) -> ResponseOutputItem:
-    """
+    """Parse browser tool calls (search, open, find) into web search items."""
-    Parse a Harmony message into a list of output response items.
-    """
-    if message.author.role != "assistant":
-        # This is a message from a tool to the assistant (e.g., search result).
-        # Don't include it in the final output for now. This aligns with
-        # OpenAI's behavior on models like o4-mini.
-        return []
-    output_items: list[ResponseOutputItem] = []
-    recipient = message.recipient
-    if recipient is not None and recipient.startswith("browser."):
    if len(message.content) != 1:
        raise ValueError("Invalid number of contents in browser message")
    content = message.content[0]
-        # We do not need to check the VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY
-        # env variable since if it is not set, we are certain the json is valid
+    # Parse JSON args (with retry detection)
-        # The use of Actions for web search will be removed entirely in
-        # the future, so this is only necessary temporarily
    try:
        browser_call = json.loads(content.text)
    except json.JSONDecodeError:
-            # If the content is not valid JSON, then it was
-            # caught and retried by vLLM, which means we
-            # need to make note of that so the user is aware
        json_retry_output_message = (
            f"Invalid JSON args, caught and retried: {content.text}"
        )
@@ -362,7 +346,8 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
            "url": json_retry_output_message,
            "pattern": json_retry_output_message,
        }
-        # TODO: translate to url properly!
+    # Create appropriate action based on recipient
    if recipient == "browser.search":
        action = ActionSearch(
            query=f"cursor:{browser_call.get('query', '')}", type="search"
@@ -373,36 +358,25 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
        )
    elif recipient == "browser.find":
        action = ActionFind(
-                pattern=browser_call["pattern"],
+            pattern=browser_call.get("pattern", ""),
            url=f"cursor:{browser_call.get('url', '')}",
            type="find",
        )
    else:
        raise ValueError(f"Unknown browser action: {recipient}")
-        web_search_item = ResponseFunctionWebSearch(
+    return ResponseFunctionWebSearch(
        id=f"ws_{random_uuid()}",
        action=action,
        status="completed",
        type="web_search_call",
    )
-        output_items.append(web_search_item)
-    elif message.channel == "analysis":
-        for content in message.content:
+def _parse_function_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
-            reasoning_item = ResponseReasoningItem(
+    """Parse function calls into function tool call items."""
-                id=f"rs_{random_uuid()}",
-                summary=[],
-                type="reasoning",
-                content=[
-                    ResponseReasoningTextContent(
-                        text=content.text, type="reasoning_text"
-                    )
-                ],
-                status=None,
-            )
-            output_items.append(reasoning_item)
-    elif message.channel == "commentary":
-        if recipient is not None and recipient.startswith("functions."):
    function_name = recipient.split(".")[-1]
+    output_items = []
    for content in message.content:
        random_id = random_uuid()
        response_item = ResponseFunctionToolCall(
@@ -413,27 +387,28 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
            id=f"fc_{random_id}",
        )
        output_items.append(response_item)
-        elif recipient is not None and (
+    return output_items
-            recipient.startswith("python")
-            or recipient.startswith("browser")
-            or recipient.startswith("container")
+def _parse_reasoning_content(message: Message) -> list[ResponseOutputItem]:
-        ):
+    """Parse reasoning/analysis content into reasoning items."""
+    output_items = []
    for content in message.content:
        reasoning_item = ResponseReasoningItem(
            id=f"rs_{random_uuid()}",
            summary=[],
            type="reasoning",
            content=[
-                        ResponseReasoningTextContent(
+                ResponseReasoningTextContent(text=content.text, type="reasoning_text")
-                            text=content.text, type="reasoning_text"
-                        )
            ],
            status=None,
        )
        output_items.append(reasoning_item)
-        else:
+    return output_items
-            raise ValueError(f"Unknown recipient: {recipient}")
-    elif message.channel == "final":
+def _parse_final_message(message: Message) -> ResponseOutputItem:
+    """Parse final channel messages into output message items."""
    contents = []
    for content in message.content:
        output_text = ResponseOutputText(
@@ -443,16 +418,59 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
            logprobs=None,  # TODO
        )
        contents.append(output_text)
-        text_item = ResponseOutputMessage(
+    return ResponseOutputMessage(
        id=f"msg_{random_uuid()}",
        content=contents,
        role=message.author.role,
        status="completed",
        type="message",
    )
-        output_items.append(text_item)
+def parse_output_message(message: Message) -> list[ResponseOutputItem]:
+    """
+    Parse a Harmony message into a list of output response items.
+    """
+    if message.author.role != "assistant":
+        # This is a message from a tool to the assistant (e.g., search result).
+        # Don't include it in the final output for now. This aligns with
+        # OpenAI's behavior on models like o4-mini.
+        return []
+    output_items: list[ResponseOutputItem] = []
+    recipient = message.recipient
+    # Browser tool calls
+    if recipient is not None and recipient.startswith("browser."):
+        output_items.append(_parse_browser_tool_call(message, recipient))
+    # Analysis channel (reasoning/chain-of-thought)
+    elif message.channel == "analysis":
+        output_items.extend(_parse_reasoning_content(message))
+    # Commentary channel
+    elif message.channel == "commentary":
+        # Function calls
+        if recipient is not None and recipient.startswith("functions."):
+            output_items.extend(_parse_function_call(message, recipient))
+        # Built-in tools on commentary channel are treated as reasoning for now
+        elif recipient is not None and (
+            recipient.startswith("python")
+            or recipient.startswith("browser")
+            or recipient.startswith("container")
+        ):
+            output_items.extend(_parse_reasoning_content(message))
+        else:
+            raise ValueError(f"Unknown recipient: {recipient}")
+    # Final output message
+    elif message.channel == "final":
+        output_items.append(_parse_final_message(message))
    else:
        raise ValueError(f"Unknown channel: {message.channel}")
    return output_items