chore: tool calling -- sglang e2e tests (#3650)

43329cd6 · Elyas Mehtabuddin · GitHub · 6ace46ce · 43329cd6 · 43329cd6
Unverified Commit 43329cd6 authored Oct 20, 2025 by Elyas Mehtabuddin Committed by GitHub Oct 20, 2025
10 changed files
--- a/lib/llm/tests/data/sglang/gpt-oss-20b/chat_completion_stream_19c97899-tool.json
+++ b/lib/llm/tests/data/sglang/gpt-oss-20b/chat_completion_stream_19c97899-tool.json
--- a/lib/llm/tests/data/sglang/gpt-oss-20b/chat_completion_stream_675195a8-no-tool.json
+++ b/lib/llm/tests/data/sglang/gpt-oss-20b/chat_completion_stream_675195a8-no-tool.json
--- a/lib/llm/tests/data/sglang/qwen3-0.6B/chat_completion_stream_c42ba578-tool.json
+++ b/lib/llm/tests/data/sglang/qwen3-0.6B/chat_completion_stream_c42ba578-tool.json
--- a/lib/llm/tests/data/sglang/qwen3-0.6B/chat_completion_stream_f121d1ca-no-tool.json
+++ b/lib/llm/tests/data/sglang/qwen3-0.6B/chat_completion_stream_f121d1ca-no-tool.json
--- a/lib/llm/tests/data/vllm/gpt-oss-20b/chat_completion_stream_49f581c1-no-tool.json
+++ b/lib/llm/tests/data/vllm/gpt-oss-20b/chat_completion_stream_49f581c1-no-tool.json
--- a/lib/llm/tests/data/vllm/gpt-oss-20b/chat_completion_stream_f0c86d72-tool.json
+++ b/lib/llm/tests/data/vllm/gpt-oss-20b/chat_completion_stream_f0c86d72-tool.json
 {
  "request_id": "f0c86d72-cdc7-4d76-b59f-79af124643c1",
-  "normal_content": "",
-  "reasoning_content": "User asks for weather in San Francisco in Celsius. Use function.",
-  "tool_calls": [{"index": 0, "id": "call-1", "type": "function", "function": {"name": "get_weather", "arguments": "{\"location\":\"San Francisco, CA\",\"unit\":\"celsius\"}"}}],
-  "data": [
+  "expected_output": {
+      "normal_content": "",
+      "reasoning_content": "User asks for weather in San Francisco in Celsius. Use function.",
+      "tool_calls": [
+        {
+          "index": 0,
+          "id": "call-1",
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "arguments": "{\"location\":\"San Francisco, CA\",\"unit\":\"celsius\"}"
+          }
+        }
+      ]
+    },
+  "input_stream": [
    {"data":{"id":"chatcmpl-f0c86d72-cdc7-4d76-b59f-79af124643c1","choices":[{"index":0,"delta":{"content":"<|channel|>","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
    {"data":{"id":"chatcmpl-f0c86d72-cdc7-4d76-b59f-79af124643c1","choices":[{"index":0,"delta":{"content":"analysis","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
    {"data":{"id":"chatcmpl-f0c86d72-cdc7-4d76-b59f-79af124643c1","choices":[{"index":0,"delta":{"content":"<|message|>","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},

--- a/lib/llm/tests/data/vllm/nemotron-49b/chat_completion_stream_3d40f925-tool.json
+++ b/lib/llm/tests/data/vllm/nemotron-49b/chat_completion_stream_3d40f925-tool.json
 {
  "request_id": "3d40f925-f7fa-4cf3-b413-f70bb895390d",
-  "normal_content": "\n\n",
-  "reasoning_content": "\nOkay, the user is asking for the weather in Tokyo in Celsius. Let me check the tools available. There's a function called get_weather that takes location and unit as parameters. The location should be a string like \"Tokyo, JP\" maybe, but the user just said Tokyo. The unit needs to be \"celsius\" as per the enum. I need to make sure to format the location correctly. Since the user didn't specify the country, but Tokyo is a major city, the function might handle it with just \"Tokyo\". But sometimes weather APIs require the country code. However, the example given in the tool's parameters was \"San Francisco, CA\", which includes the state. But for Tokyo, maybe it's better to include the country code, like \"Tokyo, Japan\" or \"Tokyo, JP\". But the user didn't specify, so maybe just \"Tokyo\" is sufficient. The function might handle it. Alternatively, if the function requires more specific location, but since the user didn't provide more details, I should proceed with \"Tokyo\". The unit is definitely \"celsius\" as per the user's request. So the tool call should be get_weather with location \"Tokyo\" and unit \"celsius\".\n",
-  "tool_calls": [{"index": 0, "id": "call-db202d65-5799-45cb-9f43-7dc43a084a1c", "type": "function", "function": {"name": "get_weather", "arguments": "{\"unit\":\"celsius\",\"location\":\"Tokyo\"}"}}],
-  "data": [
+  "expected_output": {
+      "normal_content": "\n\n",
+      "reasoning_content": "\nOkay, the user is asking for the weather in Tokyo in Celsius. Let me check the tools available. There's a function called get_weather that takes location and unit as parameters. The location should be a string like \"Tokyo, JP\" maybe, but the user just said Tokyo. The unit needs to be \"celsius\" as per the enum. I need to make sure to format the location correctly. Since the user didn't specify the country, but Tokyo is a major city, the function might handle it with just \"Tokyo\". But sometimes weather APIs require the country code. However, the example given in the tool's parameters was \"San Francisco, CA\", which includes the state. But for Tokyo, maybe it's better to include the country code, like \"Tokyo, Japan\" or \"Tokyo, JP\". But the user didn't specify, so maybe just \"Tokyo\" is sufficient. The function might handle it. Alternatively, if the function requires more specific location, but since the user didn't provide more details, I should proceed with \"Tokyo\". The unit is definitely \"celsius\" as per the user's request. So the tool call should be get_weather with location \"Tokyo\" and unit \"celsius\".\n",
+      "tool_calls": [
+        {
+          "index": 0,
+          "id": "call-db202d65-5799-45cb-9f43-7dc43a084a1c",
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "arguments": "{\"unit\":\"celsius\",\"location\":\"Tokyo\"}"
+          }
+        }
+      ]
+    },
+  "input_stream": [
    {"data":{"id":"chatcmpl-3d40f925-f7fa-4cf3-b413-f70bb895390d","choices":[{"index":0,"delta":{"content":"<th","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
    {"data":{"id":"chatcmpl-3d40f925-f7fa-4cf3-b413-f70bb895390d","choices":[{"index":0,"delta":{"content":"ink","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
    {"data":{"id":"chatcmpl-3d40f925-f7fa-4cf3-b413-f70bb895390d","choices":[{"index":0,"delta":{"content":">\n","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},

--- a/lib/llm/tests/data/vllm/qwen3-0.6B/chat_completion_stream_5627a4c6-no-tool.json
+++ b/lib/llm/tests/data/vllm/qwen3-0.6B/chat_completion_stream_5627a4c6-no-tool.json
 {
  "request_id": "5627a4c6-39a4-4d9c-8f6a-0f2d853dc50e",
-  "normal_content": "\n\nI'm here to help! If you have any questions about NYC, weather, or anything else, feel free to ask! \ud83d\ude0a",
-  "reasoning_content": "\nOkay, the user is asking, \"How's your day going in NYC?\" Let me think about how to respond.\n\nFirst, I need to check if there's a function available to get the weather in NYC. The provided tool is get_weather, which requires location and unit. The user didn't mention any weather details, so maybe they just want a general response about NYC.\n\nSince there's no function to retrieve personal information or current weather, I can't use the tool here. I should acknowledge their question and offer to help with something else. I need to make sure my response is helpful and guides them appropriately without making up data.\n",
-  "tool_calls": [],
-  "data": [
+  "expected_output": {
+      "normal_content": "\n\nI'm here to help! If you have any questions about NYC, weather, or anything else, feel free to ask! 😊",
+      "reasoning_content": "\nOkay, the user is asking, \"How's your day going in NYC?\" Let me think about how to respond.\n\nFirst, I need to check if there's a function available to get the weather in NYC. The provided tool is get_weather, which requires location and unit. The user didn't mention any weather details, so maybe they just want a general response about NYC.\n\nSince there's no function to retrieve personal information or current weather, I can't use the tool here. I should acknowledge their question and offer to help with something else. I need to make sure my response is helpful and guides them appropriately without making up data.\n",
+      "tool_calls": []
+    },
+  "input_stream": [
    {"data":{"id":"chatcmpl-5627a4c6-39a4-4d9c-8f6a-0f2d853dc50e","choices":[{"index":0,"delta":{"content":"<think>","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
    {"data":{"id":"chatcmpl-5627a4c6-39a4-4d9c-8f6a-0f2d853dc50e","choices":[{"index":0,"delta":{"content":"\n","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
    {"data":{"id":"chatcmpl-5627a4c6-39a4-4d9c-8f6a-0f2d853dc50e","choices":[{"index":0,"delta":{"content":"Okay","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
@@ -162,7 +164,7 @@
    {"data":{"id":"chatcmpl-5627a4c6-39a4-4d9c-8f6a-0f2d853dc50e","choices":[{"index":0,"delta":{"content":" ask","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
    {"data":{"id":"chatcmpl-5627a4c6-39a4-4d9c-8f6a-0f2d853dc50e","choices":[{"index":0,"delta":{"content":"!","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
    {"data":{"id":"chatcmpl-5627a4c6-39a4-4d9c-8f6a-0f2d853dc50e","choices":[{"index":0,"delta":{"content":null,"function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
-    {"data":{"id":"chatcmpl-5627a4c6-39a4-4d9c-8f6a-0f2d853dc50e","choices":[{"index":0,"delta":{"content":" \ud83d\ude0a","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
+    {"data":{"id":"chatcmpl-5627a4c6-39a4-4d9c-8f6a-0f2d853dc50e","choices":[{"index":0,"delta":{"content":" 😊","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
    {"data":{"id":"chatcmpl-5627a4c6-39a4-4d9c-8f6a-0f2d853dc50e","choices":[{"index":0,"delta":{"content":null,"function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null},"finish_reason":"stop"}]}}
  ]
 }
--- a/lib/llm/tests/data/vllm/qwen3-0.6B/chat_completion_stream_8f33c28b-tool.json
+++ b/lib/llm/tests/data/vllm/qwen3-0.6B/chat_completion_stream_8f33c28b-tool.json
 {
  "request_id": "8f33c28b-cb52-4272-9ac5-0cb9f80386d3",
-  "normal_content": "\n\n",
-  "reasoning_content": "\nOkay, the user is asking for the weather in San Francisco in Celsius. Let me check the tools provided. There's a function called get_weather that requires location and unit. The location needs to be a city and state, which San Francisco, CA is. The unit they specified is Celsius. So I should call get_weather with location \"San Francisco, CA\" and unit \"celsius\". That should retrieve the current weather data in the requested format.\n",
-  "tool_calls": [{"index": 0, "id": "call-61672e29-0cbf-45a0-b028-72389d0b2972", "type": "function", "function": {"name": "get_weather", "arguments": "{\"location\":\"San Francisco, CA\",\"unit\":\"celsius\"}"}}],
-  "data": [
+  "expected_output": {
+      "normal_content": "\n\n",
+      "reasoning_content": "\nOkay, the user is asking for the weather in San Francisco in Celsius. Let me check the tools provided. There's a function called get_weather that requires location and unit. The location needs to be a city and state, which San Francisco, CA is. The unit they specified is Celsius. So I should call get_weather with location \"San Francisco, CA\" and unit \"celsius\". That should retrieve the current weather data in the requested format.\n",
+      "tool_calls": [
+        {
+          "index": 0,
+          "id": "call-61672e29-0cbf-45a0-b028-72389d0b2972",
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "arguments": "{\"location\":\"San Francisco, CA\",\"unit\":\"celsius\"}"
+          }
+        }
+      ]
+    },
+  "input_stream": [
    {"data":{"id":"chatcmpl-8f33c28b-cb52-4272-9ac5-0cb9f80386d3","choices":[{"index":0,"delta":{"content":"<think>","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
    {"data":{"id":"chatcmpl-8f33c28b-cb52-4272-9ac5-0cb9f80386d3","choices":[{"index":0,"delta":{"content":"\n","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},
    {"data":{"id":"chatcmpl-8f33c28b-cb52-4272-9ac5-0cb9f80386d3","choices":[{"index":0,"delta":{"content":"Okay","function_call":null,"tool_calls":null,"role":"assistant","refusal":null,"reasoning_content":null}}]}},

--- a/lib/llm/tests/test_parsers_e2e.rs
+++ b/lib/llm/tests/test_parsers_e2e.rs
 // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

+/*
+
+- Primary reason these tests were added is because we wanted to iterate quickly
+  with concrete examples (rather than speculative fixtures), these tests catch
+  regressions caused by backend chunk boundaries or minor field differences even
+  when the overall protocol is the same.
+
+- The "vllm" / "sglang" labels are not parser-specific logic. They only indicate
+  the recorded source of the streaming chunks under tests/data. Different serving
+  frameworks can vary chunk granularity and some envelope details (e.g., TRT-LLM
+  often emits bigger deltas). Our parsing must be robust to these variations, so
+  we validate against multiple real-world backends.
+
+- These tests run through our full streaming parsing pipeline. We feed captured,
+  production-like chunks into tool call parsing, then assert the aggregated
+  reasoning content, final content, and tool-calls. This provides broader
+  coverage than narrowly scoped unit tests of helpers and gives quick confidence
+  when we tweak parsers (Harmony/Hermes/Qwen/Nemotron, etc.).
+
+- To add another backend (e.g., trt-llm), record its streams under
+tests/data/<backend>/... and mirror one of the existing tests so invariants hold
+across backends.
+
+*/
+
 use dynamo_async_openai::types::ChatChoiceStream;
 use dynamo_llm::preprocessor::OpenAIPreprocessor;
 use dynamo_llm::protocols::openai::chat_completions::NvCreateChatCompletionStreamResponse;
@@ -26,30 +51,34 @@ fn load_test_data(file_path: &str) -> TestData {
    // Parse the file as JSON
    let parsed_json: serde_json::Value = serde_json::from_str(&data).unwrap();

-    // Extract expected values
-    let expected_normal_content = parsed_json
+    // Extract expected values (supports both new and legacy formats)
+    let expected = parsed_json
+        .get("expected_output")
+        .expect("No 'expected_output' object found in JSON");
+
+    let expected_normal_content = expected
        .get("normal_content")
        .and_then(|v| v.as_str())
        .unwrap_or("")
        .to_string();

-    let expected_reasoning_content = parsed_json
+    let expected_reasoning_content = expected
        .get("reasoning_content")
        .and_then(|v| v.as_str())
        .unwrap_or("")
        .to_string();

-    let expected_tool_calls = parsed_json
+    let expected_tool_calls = expected
        .get("tool_calls")
        .and_then(|v| v.as_array())
        .cloned()
        .unwrap_or_default();

-    // Extract the data chunks with choices
+    // Extract the data chunks with choices from new `input_stream`
    let data_chunks = parsed_json
-        .get("data")
+        .get("input_stream")
        .and_then(|v| v.as_array())
-        .expect("No 'data' array found in JSON");
+        .expect("No 'input_stream' array found in JSON");

    let stream_chunks = data_chunks
        .iter()
@@ -428,6 +457,225 @@ mod tests {
        assert_tool_calls(&aggregated.tool_calls, &test_data.expected_tool_calls);
    }

+    #[tokio::test]
+    async fn test_gpt_oss_e2e_with_no_tool_calls_sglang() {
+        // SGLang Parsing test for GPT-OSS without tool calls.
+
+        let file_path = format!(
+            "{}/sglang/gpt-oss-20b/chat_completion_stream_675195a8-no-tool.json",
+            DATA_ROOT_PATH
+        );
+        let test_data = load_test_data(&file_path);
+
+        // Create a stream from the mock chunks
+        let input_stream = stream::iter(test_data.stream_chunks);
+
+        // Parse the response stream with reasoning and tool parsing enabled
+        let output_chunks = parse_response_stream(
+            input_stream,
+            true,
+            true,
+            Some("harmony".to_string()),
+            Some("gpt_oss".to_string()),
+        )
+        .await;
+
+        // Verify we got output chunks
+        assert!(!output_chunks.is_empty(), "Should have output chunks");
+
+        // Aggregate content from all chunks
+        let aggregated = aggregate_content_from_chunks(&output_chunks);
+
+        // Expect content and reasoning present, no tool calls
+        assert!(
+            !aggregated.normal_content.is_empty(),
+            "Should have normal content for no-tool case"
+        );
+        assert!(
+            !aggregated.reasoning_content.is_empty(),
+            "Should have reasoning content parsed from analysis channel"
+        );
+        assert!(
+            !aggregated.has_tool_calls,
+            "Should not have tool calls in no-tool case"
+        );
+
+        assert_eq!(
+            aggregated.reasoning_content, test_data.expected_reasoning_content,
+            "Reasoning content should match expected value.",
+        );
+
+        assert_eq!(
+            aggregated.normal_content, test_data.expected_normal_content,
+            "Normal content should match expected value.",
+        );
+
+        assert_tool_calls(&aggregated.tool_calls, &test_data.expected_tool_calls);
+    }
+
+    #[tokio::test]
+    async fn test_gpt_oss_e2e_with_tool_calls_sglang() {
+        // SGLang Parsing test for GPT-OSS with tool calls.
+
+        let file_path = format!(
+            "{}/sglang/gpt-oss-20b/chat_completion_stream_19c97899-tool.json",
+            DATA_ROOT_PATH
+        );
+        let test_data = load_test_data(&file_path);
+
+        // Create a stream from the mock chunks
+        let input_stream = stream::iter(test_data.stream_chunks);
+
+        // Parse the response stream with reasoning and tool parsing enabled
+        let output_chunks = parse_response_stream(
+            input_stream,
+            true,
+            true,
+            Some("harmony".to_string()),
+            Some("gpt_oss".to_string()),
+        )
+        .await;
+
+        // Verify we got output chunks
+        assert!(!output_chunks.is_empty(), "Should have output chunks");
+
+        // Aggregate content from all chunks
+        let aggregated = aggregate_content_from_chunks(&output_chunks);
+
+        // Expect reasoning parsed, no normal content, and tool calls present
+        assert!(
+            !aggregated.reasoning_content.is_empty(),
+            "Should have extracted reasoning content from analysis channel. Got: '{}'",
+            aggregated.reasoning_content
+        );
+
+        assert_eq!(
+            aggregated.normal_content, test_data.expected_normal_content,
+            "Normal content should match expected value.",
+        );
+
+        assert_eq!(
+            aggregated.reasoning_content, test_data.expected_reasoning_content,
+            "Reasoning content should match expected value.",
+        );
+
+        // Verify tool calls presence and values
+        let expected_has_tool_calls = !test_data.expected_tool_calls.is_empty();
+        assert_eq!(
+            aggregated.has_tool_calls, expected_has_tool_calls,
+            "Tool calls presence should match expected value"
+        );
+
+        assert_tool_calls(&aggregated.tool_calls, &test_data.expected_tool_calls);
+    }
+
+    #[tokio::test]
+    async fn test_qwen_e2e_with_no_tools_sglang() {
+        // SGLang Parsing test for Qwen with no tools.
+
+        let file_path = format!(
+            "{}/sglang/qwen3-0.6B/chat_completion_stream_f121d1ca-no-tool.json",
+            DATA_ROOT_PATH
+        );
+        let test_data = load_test_data(&file_path);
+
+        // Create a stream from the mock chunks
+        let input_stream = stream::iter(test_data.stream_chunks);
+
+        // Parse the response stream with reasoning and tool parsing enabled
+        let output_chunks = parse_response_stream(
+            input_stream,
+            true,
+            true,
+            Some("hermes".to_string()),
+            Some("qwen".to_string()),
+        )
+        .await;
+
+        // Verify we got output chunks
+        assert!(!output_chunks.is_empty(), "Should have output chunks");
+
+        // Aggregate content from output chunks
+        let aggregated = aggregate_content_from_chunks(&output_chunks);
+
+        // Expect both reasoning and normal content (final answer) present, and no tool calls
+        assert!(
+            !aggregated.reasoning_content.is_empty(),
+            "Should have extracted reasoning content."
+        );
+        assert!(
+            !aggregated.normal_content.is_empty(),
+            "Should have final normal content."
+        );
+        assert!(!aggregated.has_tool_calls, "Tool calls should be absent");
+
+        assert_eq!(
+            aggregated.reasoning_content, test_data.expected_reasoning_content,
+            "Reasoning content should match expected value.",
+        );
+
+        assert_eq!(
+            aggregated.normal_content, test_data.expected_normal_content,
+            "Normal content should match expected value.",
+        );
+
+        assert_tool_calls(&aggregated.tool_calls, &test_data.expected_tool_calls);
+    }
+
+    #[tokio::test]
+    async fn test_qwen_e2e_with_tools_sglang() {
+        // SGLang Parsing test for Qwen with tools.
+
+        let file_path = format!(
+            "{}/sglang/qwen3-0.6B/chat_completion_stream_c42ba578-tool.json",
+            DATA_ROOT_PATH
+        );
+        let test_data = load_test_data(&file_path);
+
+        // Create a stream from the mock chunks
+        let input_stream = stream::iter(test_data.stream_chunks);
+
+        // Parse the response stream with reasoning and tool parsing enabled
+        let output_chunks = parse_response_stream(
+            input_stream,
+            true,
+            true,
+            Some("hermes".to_string()),
+            Some("qwen".to_string()),
+        )
+        .await;
+
+        // Verify we got output chunks
+        assert!(!output_chunks.is_empty(), "Should have output chunks");
+
+        // Aggregate content from output chunks
+        let aggregated = aggregate_content_from_chunks(&output_chunks);
+
+        // Expect reasoning parsed, no normal content, and tool calls present
+        assert!(
+            !aggregated.reasoning_content.is_empty(),
+            "Should have extracted reasoning content."
+        );
+
+        assert_eq!(
+            aggregated.reasoning_content, test_data.expected_reasoning_content,
+            "Reasoning content should match expected value.",
+        );
+
+        assert_eq!(
+            aggregated.normal_content, test_data.expected_normal_content,
+            "Normal content should match expected value.",
+        );
+
+        // Verify tool calls presence and values
+        let expected_has_tool_calls = !test_data.expected_tool_calls.is_empty();
+        assert_eq!(
+            aggregated.has_tool_calls, expected_has_tool_calls,
+            "Tool calls presence should match expected value"
+        );
+        assert_tool_calls(&aggregated.tool_calls, &test_data.expected_tool_calls);
+    }
+
    #[tokio::test]
    async fn test_nemotron_e2e_with_tools_vllm() {
        // E2E Parsing test for Nemotron with tools.