feat(v4): cherry-pick #8665 onto release/deepseekv4 (#8709)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com> Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>

feat(v4): cherry-pick #8665 onto release/deepseekv4 (#8709)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com> Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
35fa7129 · Keiven C · GitHub · 01002df7 · 35fa7129 · 35fa7129
Unverified Commit 35fa7129 authored Apr 24, 2026 by Keiven C Committed by GitHub Apr 24, 2026
15 changed files
--- a/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_fragmented_tokens.json
+++ b/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_fragmented_tokens.json
--- a/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_mixed_param_types.json
+++ b/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_mixed_param_types.json
+{
+  "request_id": "deepseek-v4-mixed-param-types-test",
+  "expected_output": {"normal_content": "", "reasoning_content": "The user asked me to send a high-priority overdue-billing notification. I'll call send_notification with the appropriate parameters.", "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "send_notification", "arguments": "{\"recipient\": \"user@example.com\", \"priority\": 3, \"urgent\": true, \"tags\": [\"billing\", \"overdue\"], \"metadata\": {\"ticket\": \"T-42\", \"retries\": 2}}"}}]},
+  "input_stream": [
+    {"data":{"id":"chatcmpl-deepseek-v4-mixed","choices":[{"index":0,"delta":{"content":"<think>The user asked me to send a high-priority overdue-billing notification. I'll call send_notification with the appropriate parameters.</think>","role":"assistant","reasoning_content":"The user asked me to send a high-priority overdue-billing notification. I'll call send_notification with the appropriate parameters."}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-mixed","choices":[{"index":0,"delta":{"content":"<｜DSML｜tool_calls>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-mixed","choices":[{"index":0,"delta":{"content":"<｜DSML｜invoke name=\"send_notification\">\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-mixed","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"recipient\" string=\"true\">user@example.com</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-mixed","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"priority\" string=\"false\">3</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-mixed","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"urgent\" string=\"false\">true</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-mixed","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"tags\" string=\"false\">[\"billing\", \"overdue\"]</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-mixed","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"metadata\" string=\"false\">{\"ticket\": \"T-42\", \"retries\": 2}</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-mixed","choices":[{"index":0,"delta":{"content":"</｜DSML｜invoke>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-mixed","choices":[{"index":0,"delta":{"content":"</｜DSML｜tool_calls>","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-mixed","choices":[{"index":0,"delta":{"content":null,"role":"assistant"},"finish_reason":"tool_calls"}]}}
+  ]
+}
--- a/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_multi_tool.json
+++ b/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_multi_tool.json
+{
+  "request_id": "deepseek-v4-multi-tool-test",
+  "expected_output": {"normal_content": "", "reasoning_content": "The user wants to check the weather in Beijing and Shanghai, I need to call the get_current_weather tool to get this information.", "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Beijing\", \"format\": \"celsius\"}"}}, {"id": "call_2", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Shanghai\", \"format\": \"celsius\"}"}}]},
+  "input_stream": [
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":"<think>The user wants to check the weather in Beijing and Shanghai, I need to call the get_current_weather tool to get this information.</think>","role":"assistant","reasoning_content":"The user wants to check the weather in Beijing and Shanghai, I need to call the get_current_weather tool to get this information."}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜tool_calls>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜invoke name=\"get_current_weather\">\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"location\" string=\"true\">Beijing</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"format\" string=\"true\">celsius</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":"</｜DSML｜invoke>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜invoke name=\"get_current_weather\">\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"location\" string=\"true\">Shanghai</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"format\" string=\"true\">celsius</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":"</｜DSML｜invoke>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":"</｜DSML｜tool_calls>","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-multi-tool","choices":[{"index":0,"delta":{"content":null,"role":"assistant"},"finish_reason":"tool_calls"}]}}
+  ]
+}
--- a/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_no_tool.json
+++ b/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_no_tool.json
+{
+  "request_id": "deepseek-v4-no-tool-test",
+  "expected_output": {"normal_content": "Hi! I'm here to help — what would you like to work on today?", "reasoning_content": "User greeted me politely. A short friendly reply is appropriate; no tools needed.", "tool_calls": []},
+  "input_stream": [
+    {"data":{"id":"chatcmpl-deepseek-v4-no-tool","choices":[{"index":0,"delta":{"content":"<think>User greeted me politely. A short friendly reply is appropriate; no tools needed.</think>","role":"assistant","reasoning_content":"User greeted me politely. A short friendly reply is appropriate; no tools needed."}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-no-tool","choices":[{"index":0,"delta":{"content":"Hi! I'm here to help — ","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-no-tool","choices":[{"index":0,"delta":{"content":"what would you like to work on today?","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-no-tool","choices":[{"index":0,"delta":{"content":null,"role":"assistant"},"finish_reason":"stop"}]}}
+  ]
+}
--- a/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_special_chars.json
+++ b/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_special_chars.json
+{
+  "request_id": "deepseek-v4-special-chars-test",
+  "expected_output": {"normal_content": "", "reasoning_content": "The user wants me to save a multiline note with special characters, quotes, unicode, and emoji. I'll call save_note.", "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "save_note", "arguments": "{\"note\": \"He said \\\"hello\\\".\\n\\t'world' `backtick` — 中文测试 — 🚀✨ <not_a_sentinel> & </not_a_sentinel>.\"}"}}]},
+  "input_stream": [
+    {"data":{"id":"chatcmpl-deepseek-v4-special","choices":[{"index":0,"delta":{"content":"<think>The user wants me to save a multiline note with special characters, quotes, unicode, and emoji. I'll call save_note.</think>","role":"assistant","reasoning_content":"The user wants me to save a multiline note with special characters, quotes, unicode, and emoji. I'll call save_note."}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-special","choices":[{"index":0,"delta":{"content":"<｜DSML｜tool_calls>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-special","choices":[{"index":0,"delta":{"content":"<｜DSML｜invoke name=\"save_note\">\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-special","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"note\" string=\"true\">","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-special","choices":[{"index":0,"delta":{"content":"He said \"hello\".\n\t'world' `backtick` ","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-special","choices":[{"index":0,"delta":{"content":"— 中文测试 — 🚀✨ ","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-special","choices":[{"index":0,"delta":{"content":"<not_a_sentinel> & </not_a_sentinel>.</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-special","choices":[{"index":0,"delta":{"content":"</｜DSML｜invoke>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-special","choices":[{"index":0,"delta":{"content":"</｜DSML｜tool_calls>","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-special","choices":[{"index":0,"delta":{"content":null,"role":"assistant"},"finish_reason":"tool_calls"}]}}
+  ]
+}
--- a/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_tool.json
+++ b/lib/llm/tests/data/vllm/deepseek-v4/chat_completion_stream_tool.json
+{
+  "request_id": "deepseek-v4-tool-call-test",
+  "expected_output": {"normal_content": "", "reasoning_content": "User wants the current weather in Beijing. I'll call get_current_weather with celsius units.", "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Beijing\", \"format\": \"celsius\"}"}}]},
+  "input_stream": [
+    {"data":{"id":"chatcmpl-deepseek-v4-tool","choices":[{"index":0,"delta":{"content":"<think>User wants the current weather in Beijing. I'll call get_current_weather with celsius units.</think>","role":"assistant","reasoning_content":"User wants the current weather in Beijing. I'll call get_current_weather with celsius units."}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜tool_calls>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜invoke name=\"get_current_weather\">\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"location\" string=\"true\">Beijing</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-tool","choices":[{"index":0,"delta":{"content":"<｜DSML｜parameter name=\"format\" string=\"true\">celsius</｜DSML｜parameter>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-tool","choices":[{"index":0,"delta":{"content":"</｜DSML｜invoke>\n","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-tool","choices":[{"index":0,"delta":{"content":"</｜DSML｜tool_calls>","role":"assistant"}}]}},
+    {"data":{"id":"chatcmpl-deepseek-v4-tool","choices":[{"index":0,"delta":{"content":null,"role":"assistant"},"finish_reason":"tool_calls"}]}}
+  ]
+}
--- a/lib/llm/tests/deepseek_v4_encoding.rs
+++ b/lib/llm/tests/deepseek_v4_encoding.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Tests for DeepSeek V4 encoding against official test data
+//!
+//! These tests use the official test files from:
+//! https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/tree/main/encoding
+
+use dynamo_llm::preprocessor::prompt::deepseek_v4::{ThinkingMode, encode_messages};
+use serde_json::Value as JsonValue;
+use std::fs;
+use std::path::PathBuf;
+
+fn get_test_data_path() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/data/deepseek-v4")
+}
+
+/// Load an input fixture. V4 fixtures come in two shapes:
+///   1. `{"tools": [...], "messages": [...]}` — tools injected on first (system) message
+///   2. bare `[...]` — just the messages array
+fn load_messages(path: &PathBuf) -> Vec<JsonValue> {
+    let raw: JsonValue = serde_json::from_str(
+        &fs::read_to_string(path).unwrap_or_else(|_| panic!("Failed to read {:?}", path)),
+    )
+    .unwrap_or_else(|_| panic!("Failed to parse {:?}", path));
+
+    if let Some(messages) = raw.get("messages").and_then(|m| m.as_array()) {
+        let mut messages = messages.clone();
+        if let Some(tools) = raw.get("tools")
+            && let Some(first) = messages.get_mut(0)
+            && let Some(obj) = first.as_object_mut()
+        {
+            obj.insert("tools".to_string(), tools.clone());
+        }
+        messages
+    } else if let Some(arr) = raw.as_array() {
+        arr.clone()
+    } else {
+        panic!("Unexpected input shape in {:?}", path);
+    }
+}
+
+fn run_official_test(input_file: &str, output_file: &str, thinking_mode: ThinkingMode) {
+    let test_dir = get_test_data_path();
+    let messages = load_messages(&test_dir.join(input_file));
+    let expected = fs::read_to_string(test_dir.join(output_file))
+        .unwrap_or_else(|_| panic!("Failed to read {}", output_file));
+
+    let actual = encode_messages(&messages, thinking_mode, true)
+        .unwrap_or_else(|e| panic!("encode_messages failed for {}: {:?}", input_file, e));
+
+    let exp = expected.trim_end();
+    let act = actual.trim_end();
+
+    if exp != act {
+        println!("=== Test: {} ===", input_file);
+        let exp_lines: Vec<&str> = exp.lines().collect();
+        let act_lines: Vec<&str> = act.lines().collect();
+        for (i, (el, al)) in exp_lines.iter().zip(act_lines.iter()).enumerate() {
+            if el != al {
+                println!("Line {} differs:", i + 1);
+                println!("  Expected: {:?}", el);
+                println!("  Actual:   {:?}", al);
+                break;
+            }
+        }
+        if exp_lines.len() != act_lines.len() {
+            println!(
+                "\nLine count mismatch: expected {} lines, got {} lines",
+                exp_lines.len(),
+                act_lines.len()
+            );
+        }
+        panic!("Output does not match expected for {}", input_file);
+    }
+}
+
+/// Case 1 — thinking mode, single tool, tool result round-trip.
+#[test]
+fn test_official_thinking_with_tools() {
+    run_official_test(
+        "test_input_1.json",
+        "test_output_1.txt",
+        ThinkingMode::Thinking,
+    );
+}
+
+/// Case 2 — thinking mode, no tools, multi-turn (drop_thinking strips earlier reasoning).
+#[test]
+fn test_official_thinking_no_tools_multiturn() {
+    run_official_test(
+        "test_input_2.json",
+        "test_output_2.txt",
+        ThinkingMode::Thinking,
+    );
+}
+
+/// Case 3 — thinking mode, developer role with tools + latest_reminder + tool result.
+#[test]
+fn test_official_developer_with_tools_and_reminder() {
+    run_official_test(
+        "test_input_3.json",
+        "test_output_3.txt",
+        ThinkingMode::Thinking,
+    );
+}
+
+/// Case 4 — chat mode, latest_reminder + task="action" + mask preservation.
+#[test]
+fn test_official_chat_mode_action_task() {
+    run_official_test("test_input_4.json", "test_output_4.txt", ThinkingMode::Chat);
+}
--- a/lib/llm/tests/test_streaming_tool_parsers.rs
+++ b/lib/llm/tests/test_streaming_tool_parsers.rs
@@ -1106,6 +1106,167 @@ mod tests {
        );
    }

+    // ---- DeepSeek V4 (DSML format) streaming parser tests ----
+    //
+    // V4 emits tool calls inside a DSML block:
+    //   <｜DSML｜tool_calls>
+    //   <｜DSML｜invoke name="fn">
+    //   <｜DSML｜parameter name="k" string="true|false">v</｜DSML｜parameter>
+    //   </｜DSML｜invoke>
+    //   </｜DSML｜tool_calls>
+    // Fixtures live under tests/data/vllm/deepseek-v4/.
+
+    /// Shared harness for DeepSeek V4 e2e fixtures that end in a tool call.
+    async fn run_deepseek_v4_tool_call_fixture(file_path: &str) {
+        let test_data = load_test_data(file_path);
+        let input_stream = stream::iter(test_data.stream_chunks);
+
+        let output_chunks = parse_response_stream(
+            input_stream,
+            true,
+            true,
+            Some("deepseek_v4".to_string()),
+            Some("deepseek_v4".to_string()),
+        )
+        .await;
+
+        assert!(!output_chunks.is_empty(), "Should have output chunks");
+
+        let aggregated = aggregate_content_from_chunks(&output_chunks);
+
+        assert_eq!(
+            aggregated.reasoning_content, test_data.expected_reasoning_content,
+            "Should have extracted reasoning content.",
+        );
+        assert_eq!(
+            aggregated.normal_content, test_data.expected_normal_content,
+            "Normal content should match expected value.",
+        );
+
+        let expected_has_tool_calls = !test_data.expected_tool_calls.is_empty();
+        assert_eq!(
+            aggregated.has_tool_calls, expected_has_tool_calls,
+            "Tool calls presence should match expected value"
+        );
+        assert_tool_calls(&aggregated.tool_calls, &test_data.expected_tool_calls);
+
+        assert!(
+            validate_finish_reason(&output_chunks, FinishReason::ToolCalls),
+            "finish_reason validation failed for tool call case"
+        );
+    }
+
+    /// Single tool call, thinking mode (direct V4 analog of the V3 tool fixture).
+    /// `CASE.1` + `CASE.8` + `CASE.9` — single tool call, streaming assembly, paired with reasoning. Also validates `CASE.12` finish_reason=tool_calls.
+    #[tokio::test]
+    async fn test_deepseek_v4_e2e_with_tools_vllm() {
+        let file_path = format!(
+            "{}/vllm/deepseek-v4/chat_completion_stream_tool.json",
+            DATA_ROOT_PATH
+        );
+        run_deepseek_v4_tool_call_fixture(&file_path).await;
+    }
+
+    /// No tool call — thinking + plain body; finish_reason=stop.
+    /// `CASE.3` + `CASE.10` — no tool call + reasoning only. Also validates `CASE.12` finish_reason=stop.
+    #[tokio::test]
+    async fn test_deepseek_v4_e2e_with_no_tools_vllm() {
+        let file_path = format!(
+            "{}/vllm/deepseek-v4/chat_completion_stream_no_tool.json",
+            DATA_ROOT_PATH
+        );
+        let test_data = load_test_data(&file_path);
+        let input_stream = stream::iter(test_data.stream_chunks);
+
+        let output_chunks = parse_response_stream(
+            input_stream,
+            true,
+            true,
+            Some("deepseek_v4".to_string()),
+            Some("deepseek_v4".to_string()),
+        )
+        .await;
+
+        assert!(!output_chunks.is_empty(), "Should have output chunks");
+
+        let aggregated = aggregate_content_from_chunks(&output_chunks);
+
+        assert_eq!(
+            aggregated.reasoning_content, test_data.expected_reasoning_content,
+            "Should have extracted reasoning content.",
+        );
+        assert_eq!(
+            aggregated.normal_content, test_data.expected_normal_content,
+            "Normal content should match expected value.",
+        );
+        assert!(!aggregated.has_tool_calls, "Should not have any tool calls");
+
+        assert!(
+            validate_finish_reason(&output_chunks, FinishReason::Stop),
+            "finish_reason validation failed for non-tool call case"
+        );
+    }
+
+    /// Two parallel tool calls inside one DSML block.
+    /// `CASE.2` — parallel tool calls in one DSML block.
+    #[tokio::test]
+    async fn test_deepseek_v4_e2e_multi_tool_vllm() {
+        let file_path = format!(
+            "{}/vllm/deepseek-v4/chat_completion_stream_multi_tool.json",
+            DATA_ROOT_PATH
+        );
+        run_deepseek_v4_tool_call_fixture(&file_path).await;
+    }
+
+    /// string="true" vs string="false" — numbers, booleans, arrays, objects must
+    /// round-trip as their proper JSON types inside arguments.
+    /// `CASE.7` — complex args (mixed string="true|false" → strings / numbers / bools / arrays / objects round-trip).
+    #[tokio::test]
+    async fn test_deepseek_v4_e2e_mixed_param_types_vllm() {
+        let file_path = format!(
+            "{}/vllm/deepseek-v4/chat_completion_stream_mixed_param_types.json",
+            DATA_ROOT_PATH
+        );
+        run_deepseek_v4_tool_call_fixture(&file_path).await;
+    }
+
+    /// Body text emitted before the DSML block — parser must populate both
+    /// normal_content and tool_calls.
+    /// `CASE.13` — normal text interleaved before the DSML block.
+    #[tokio::test]
+    async fn test_deepseek_v4_e2e_content_before_tool_vllm() {
+        let file_path = format!(
+            "{}/vllm/deepseek-v4/chat_completion_stream_content_before_tool.json",
+            DATA_ROOT_PATH
+        );
+        run_deepseek_v4_tool_call_fixture(&file_path).await;
+    }
+
+    /// Parameter value containing unicode, emoji, embedded quotes/newlines/tabs,
+    /// and fragments that look like sentinels but aren't — must not confuse the
+    /// parser, which anchors only on the exact </｜DSML｜parameter> token.
+    /// `CASE.7` — Unicode / special characters inside argument values. (`CASE.xml.entities` is N/A for DSML — no entity decoding.)
+    #[tokio::test]
+    async fn test_deepseek_v4_e2e_special_chars_vllm() {
+        let file_path = format!(
+            "{}/vllm/deepseek-v4/chat_completion_stream_special_chars.json",
+            DATA_ROOT_PATH
+        );
+        run_deepseek_v4_tool_call_fixture(&file_path).await;
+    }
+
+    /// Adversarial streaming: every DSML character is its own delta (~200 chunks).
+    /// Exercises buffer accumulation across chunk boundaries.
+    /// `CASE.8` — streaming chunk-boundary splits (tokens straddle chunks).
+    #[tokio::test]
+    async fn test_deepseek_v4_e2e_fragmented_tokens_vllm() {
+        let file_path = format!(
+            "{}/vllm/deepseek-v4/chat_completion_stream_fragmented_tokens.json",
+            DATA_ROOT_PATH
+        );
+        run_deepseek_v4_tool_call_fixture(&file_path).await;
+    }
+
    // ---- Kimi K2 streaming jail reproduction tests ----
    //
    // These reproduce the customer-reported issue (DIS-1765): Kimi K2 agentic

--- a/lib/parsers/README.md
+++ b/lib/parsers/README.md
+# dynamo-parsers
+
+Rust crate for parsing **tool calls** and **reasoning content** out of raw LLM
+output. Wire-format-aware, streaming-first, model-family-aware.
+
+This is the post-model side of Dynamo's chat-completions pipeline: given a
+token stream from vLLM or SGLang, extract structured `Vec<ToolCall>` +
+`reasoning_content` for the client. The pre-model side (prompt formatting)
+lives in `lib/llm/src/preprocessor/prompt/`.
+
+## What's in the crate
+
+Two top-level modules, each with its own parser registry:
+
+```
+lib/parsers/
+├── src/
+│   ├── tool_calling/        ← tool-call extraction (17 registered parsers)
+│   │   ├── parsers.rs         — registry + dispatch (detect_and_parse_tool_call)
+│   │   ├── config.rs          — per-parser ToolCallConfig
+│   │   ├── response.rs        — ToolCallResponse shape (wire type)
+│   │   ├── dsml/              — DeepSeek V3.2 / V4 DSML grammar
+│   │   ├── xml/               — hermes, glm47, kimi_k2, minimax_m2, qwen3_coder
+│   │   ├── json/              — deepseek_v3, deepseek_v3_1, nemotron_deci/nano, jamba, mistral, phi4, llama3_json
+│   │   ├── harmony/           — OpenAI gpt-oss (Harmony token stream, uses openai_harmony crate)
+│   │   └── pythonic/          — Python function-call syntax (some Llama variants)
+│   └── reasoning/           ← reasoning-content extraction (14 registered parsers)
+│       ├── mod.rs             — registry + dispatch
+│       ├── base_parser.rs     — BasicReasoningParser (<think> ... </think>)
+│       ├── gpt_oss_parser.rs  — Harmony channel parsing
+│       ├── granite_parser.rs  — Granite-style
+│       └── minimax_append_think_parser.rs  — MiniMax inline-reasoning
+```
+
+## How a request flows through the crate
+
+```
+  token stream from engine
+            │
+            ▼
+  ┌─────────────────────────────────┐
+  │ reasoning parser                │  — registered by name via
+  │   (basic / gpt_oss / ...)       │    reasoning::mod.rs get_reasoning_parser_map()
+  │                                 │    returns: (reasoning_content, non_reasoning_tail)
+  └─────────────────────────────────┘
+            │
+            ▼ (non-reasoning tail)
+  ┌─────────────────────────────────┐
+  │ tool-call parser                │  — registered by name via
+  │   dispatched on parser name     │    tool_calling::parsers::get_tool_parser_map()
+  │   which picks a ParserConfig:   │
+  │     - Dsml(DsmlParserConfig)    │  → try_tool_call_parse_dsml
+  │     - Json(JsonParserConfig)    │  → try_tool_call_parse_json
+  │     - Xml(XmlParserConfig)      │  → try_tool_call_parse_xml
+  │     - KimiK2(KimiK2ParserConfig)│  → try_tool_call_parse_kimi_k2
+  │     - Pythonic / Harmony        │
+  └─────────────────────────────────┘
+            │
+            ▼
+  Vec<ToolCallResponse> + normal_text
+```
+
+Main public entry points in `tool_calling/parsers.rs`:
+
+- `detect_and_parse_tool_call(input, parser_name, schema) -> (calls, normal_text)`
+- `try_tool_call_parse(input, config) -> (calls, normal_text)` (lower-level, bypasses the registry)
+- `detect_tool_call_start(chunk, parser_name)` — streaming: "is this chunk starting a tool-call block?"
+- `find_tool_call_end_position(chunk, parser_name)` — streaming: "where does the block end in this chunk?"
+
+## Parser-family cheat sheet
+
+When adding a new model, the right parser family is usually one of:
+
+| Family | Grammar | Shared engine | Examples |
+| -- | -- | -- | -- |
+| **DSML** | `<｜DSML｜tool_calls>...` with typed `string="true|false"` parameters | `dsml/parser.rs` | DeepSeek V3.2, V4 |
+| **XML** | `<tool_call>...</tool_call>` with nested `<parameter>` or `<function>` | `xml/parser.rs` (generic) or own file for variants | hermes, qwen3_coder, minimax_m2, glm47 (own), kimi_k2 (own, special-token XML) |
+| **JSON** | Start sentinel + bare JSON array of `{name, arguments}` | `json/base_json_parser.rs` | deepseek_v3, deepseek_v3_1, nemotron_deci/nano |
+| **Harmony** | OpenAI Harmony token stream with `<\|channel\|>`, `<\|message\|>`, `<\|call\|>` | `harmony/harmony_parser.rs` (wraps external `openai_harmony` crate) | gpt-oss-20B / 120B |
+| **Pythonic** | `[func_name(arg=value, ...)]` Python function-call syntax | `pythonic/pythonic_parser.rs` | some Llama variants |
+
+Reasoning parsers:
+
+| Family | Grammar | Shared engine | Examples |
+| -- | -- | -- | -- |
+| **Basic (think-tag)** | `<think>...</think>` | `reasoning/base_parser.rs` (BasicReasoningParser) | Qwen3, Nemotron, Kimi K2.5, DeepSeek R1 / V4, GLM-4.5+ |
+| **Append-think** | `<think>...</think>` left inline as text, with `<think>` prefix on first chunk | `reasoning/minimax_append_think_parser.rs` | MiniMax M2 |
+| **Harmony channel** | Hidden `analysis` channel | `reasoning/gpt_oss_parser.rs` (wraps external `openai_harmony`) | gpt-oss-20B / 120B |
+| **Granite** | Custom start/end tokens | `reasoning/granite_parser.rs` | IBM Granite |
+
+## Adding a new parser
+
+1. **Pick the family** from the cheat sheet above. If an existing config-driven
+   family fits, add a `ToolCallConfig::<your_model>()` constructor in
+   `tool_calling/config.rs`, register it in `tool_calling/parsers.rs`. Done —
+   you inherit all the shared parser and tests.
+
+2. **If the grammar is genuinely new**, add a module under `tool_calling/` and
+   add a `ParserConfig` variant in `config.rs`. Follow the existing parser
+   modules for layout.
+
+3. **For reasoning**, prefer aliasing to `BasicReasoningParser` unless the
+   grammar truly diverges (append-think, Harmony channels). Most new models
+   use plain `<think>...</think>` and can share.
+
+4. **Write tests.** Minimum viable set is in [`TESTING.md`](./TESTING.md) (T1–T20
+   taxonomy). At minimum: T1/T2/T3 for correctness, T5 for truncation
+   behavior, T8/T9 for streaming, T14 for interleaved text. `N/A` categories
+   should be explicitly called out in a comment rather than silently skipped.
+
+## Related docs
+
+- [`TESTING.md`](./TESTING.md) — corner-case taxonomy (T1–T20). What every
+  parser should be tested against, what's N/A per family, what's a universal
+  gap today.
+- `lib/llm/tests/data/` — captured streaming fixtures per (engine × model)
+  that feed `test_streaming_tool_parsers.rs`. The replay side of the testing
+  story.
+
+## Integration with the rest of Dynamo
+
+- `lib/llm/src/preprocessor/prompt/` — pre-model side. Writes the prompts
+  that (eventually) come back and get parsed here.
+- `lib/llm/src/preprocessor.rs` — top-level request/response pipeline.
+  Decides whether to run the reasoning parser based on
+  `is_reasoning_disabled_by_request`, then hands the reasoning-stripped
+  tail to the tool-call parser.
+- `components/src/dynamo/frontend/` — Python frontend that surfaces parsed
+  output as OpenAI-compatible SSE chunks to the client.
--- a/lib/parsers/TESTING.md
+++ b/lib/parsers/TESTING.md
+# Tool-Call / Reasoning Parser Corner Cases
+
+Reference taxonomy for unit testing tool-call and reasoning parsers. Each parser
+added under `src/tool_calling/` or `src/reasoning/` should cover the generic
+`CASE.<n>` categories; family-specific parsers also cover their respective
+`CASE.xml<n>` / `CASE.harmony<n>` categories. `N/A` should be called out
+explicitly in the test file rather than silently omitted.
+
+Category layout:
+- **`CASE.1`–`CASE.16`** — **Generic**. Apply to every parser regardless of grammar.
+- **`CASE.xml1`–`CASE.xml2`** — XML-family only (hermes, glm47, qwen3_coder, minimax_m2, kimi_k2).
+- **`CASE.harmony1`** — Harmony only (gpt-oss).
+
+Per-model gap tracking lives elsewhere (not in this repo).
+
+## Quick reference
+
+### Generic (all parsers)
+
+- **`CASE.1`** Single tool call — happy path (one complete, well-formed call). Ex: `xml::test_parse_simple_tool_call`.
+- **`CASE.2`** Multiple tool calls — sequential or parallel (2+ in one response). Ex: `tool_choice::test_streaming_required_tool_parallel`.
+- **`CASE.3`** No tool call (response is text only). Ex: `xml::test_parse_no_tool_calls`.
+- **`CASE.4`** Malformed / partial JSON args (truncated, missing close brace, invalid syntax). Ex: `deepseek_v3::test_parse_..._with_invalid_json`, `xml::test_parse_missing_..._closing_tag`.
+- **`CASE.5`** Missing end-token recovery (recover calls when `section_end` is absent due to max_tokens / EOS). Ex: `kimi_k2::test_parse_malformed_no_section_end`.
+- **`CASE.6`** Empty args (`arguments={}` / no-arg call). Ex: `kimi_k2::test_parse_no_arg_call`.
+- **`CASE.7`** Complex arg types (nested objects, arrays, bool, number, Unicode / newlines in values). Ex: `kimi_k2::test_parse_complex_json_arguments`.
+- **`CASE.8`** Streaming — token-by-token assembly + chunk-boundary splits. Ex: `basic::test_buffer_state_persistence_across_calls`, `basic::test_partial_token_matching_closing_tag`, `basic::test_kimi_k2_one_shot_split`.
+- **`CASE.9`** Paired reasoning + tool in same response. Ex: `test_reasoning_parser::test_nemotron_with_reasoning_and_tool_calls`.
+- **`CASE.10`** Reasoning only (think tags, no tool call). Ex: `basic::test_detect_and_parse_reasoning_reasoning`.
+- **`CASE.11`** `tool_choice` = auto / required / named / none. Ex: `tool_choice::test_named_tool_choice_parses_json` (**hermes only today**).
+- **`CASE.12`** `finish_reason` semantics (`stop` / `tool_calls` / `length` mapping). Ex: `tool_choice_finish_reasons::*` (hermes only); `test_streaming_tool_parsers::test_qwen_finish_reason_length_vllm`.
+- **`CASE.13`** Normal text interleaved with tool calls. Ex: `xml::test_parse_with_normal_text`.
+- **`CASE.14`** Empty content / empty `tool_calls` array / null response. Ex: `parallel_tool_call_integration::test_empty_tool_calls`.
+- **`CASE.15`** Duplicate tool calls (same name twice). No test anywhere in the repo; universal gap.
+- **`CASE.16`** Regression for a specific customer bug (ticket ID referenced in test name or body). Ex: `kimi_k2::test_parse_malformed_no_section_end`.
+
+### XML-family (`CASE.xml*`)
+
+- **`CASE.xml1`** XML entity / HTML unescape handling (`&lt;`, `&amp;`, `&quot;` in parameter values). Ex: `xml::test_html_unescape`, `glm47::test_xml_entity_decoding`.
+- **`CASE.xml2`** Schema-aware type coercion (string → number/bool/array based on declared parameter schema). Ex: `xml::test_schema_aware_type_conversion`, `glm47::test_type_coercion_*`.
+
+### Harmony (`CASE.harmony*`)
+
+- **`CASE.harmony1`** Channel / recipient parsing (analysis / commentary / final channels). Ex: `harmony_parser::test_parse_tool_calls_harmony_*`.
+
+### Universal gaps (no test anywhere, not promoted to numbered categories)
+
+- Unicode in function names (non-ASCII tool names, emoji).
+- Numeric overflow in args (very large int / float outside JSON spec range).
+- Empty function name (`"name": ""`).
+- Concurrent parallel requests (process-level contention during parse).
+- Guided-decoding ↔ tool-call interaction (constrained generation emits malformed args).
+- Extremely long output (≥10 KB tool-call JSON in a single call).
+- Mid-stream error injection / interruption (worker kill, network drop mid-parse).
+- Schema arg-count mismatch (model emits extra or missing args vs declared schema).
+
+---
+
+## `CASE.1` — Single tool call, happy path
+
+One complete, well-formed call in the response.
+
+- Applies to every tool-call parser.
+- Baseline correctness check. If `CASE.1` fails, nothing else below matters.
+- Example: `dsml/parser.rs::test_parse_single_tool_call_string_param`.
+
+## `CASE.2` — Multiple tool calls (sequential or parallel)
+
+Two or more calls in one response, in the same block or back-to-back.
+
+- Applies to every tool-call parser.
+- Some grammars emit parallel calls in one block (DSML, XML); others emit
+  sequential top-level sentinels (JSON dialects). Either way, extract all.
+- Example: `dsml/parser.rs::test_parse_multiple_tool_calls`,
+  `tool_choice::test_streaming_required_tool_parallel`.
+
+## `CASE.3` — No tool call
+
+Response is plain text, no tool-call grammar present.
+
+- Applies to every tool-call parser.
+- Must return empty `Vec<ToolCall>` and the input as `normal_text`. Zero false
+  positives.
+- Example: `dsml/parser.rs::test_parse_no_tool_calls`.
+
+## `CASE.4` — Malformed / partial JSON args
+
+Truncated JSON, missing close brace, invalid syntax inside the arguments
+payload.
+
+- Applies to every tool-call parser. For parsers whose grammar never embeds
+  JSON (none today — all top-N families embed JSON somewhere), mark explicit
+  `N/A`.
+- Behavior must be documented: either graceful fallback to string (DSML's
+  current behavior via `serde_json::from_str(...).unwrap_or_else(|_| String(...))`)
+  or explicit error. Silent drop is the failure mode.
+- Example: `dsml/parser.rs::test_parse_deepseek_v4_malformed_json_value_falls_back_to_string`,
+  `deepseek_v3_parser.rs::test_parse_tool_calls_deepseek_v3_with_invalid_json`.
+
+## `CASE.5` — Missing end-token recovery
+
+The model's response is truncated before the closing fence arrives
+(`<|tool_calls_section_end|>` for Kimi, `</｜DSML｜tool_calls>` for DeepSeek
+DSML, etc.) — typically because the engine hit `max_tokens` or the model
+emitted EOS mid-generation.
+
+- Applies to every tool-call parser with paired start/end fences.
+- Customer-facing bug class: silent drop of the in-flight call looks like a
+  successful HTTP 200 with no tool_calls and no error.
+- Two acceptable resolutions: (a) recover completed invokes even without the
+  outer close fence (Kimi K2 does this post-fix), or (b) return an explicit
+  error. Either way, pin the behavior with a test so a future change is
+  intentional.
+- Example (post-recovery): `kimi_k2_parser.rs::test_parse_malformed_no_section_end`.
+- Example (behavior-pinning, pre-recovery): `dsml/parser.rs::test_parse_deepseek_v4_missing_end_token`.
+
+## `CASE.6` — Empty args
+
+Tool call with `arguments={}`, or a no-parameter invoke.
+
+- Applies to every tool-call parser.
+- Must still return the call — empty args is a valid call, not a missing one.
+- Example: `kimi_k2_parser.rs::test_parse_no_arg_call`,
+  `dsml/parser.rs::test_parse_deepseek_v4_no_parameters`.
+
+## `CASE.7` — Complex argument types
+
+Nested objects, arrays, booleans, numbers, mixed types, Unicode values, and
+newlines inside argument values.
+
+- Applies to every tool-call parser.
+- For grammars that carry type hints (DSML's `string="true|false"`), verify
+  JSON round-tripping. For XML grammars without hints, the type-coercion
+  half of the test is covered under `CASE.xml2` instead — here just verify
+  that complex values make it through without truncation or escape bugs.
+- Example: `dsml/parser.rs::test_parse_mixed_types_realistic`,
+  `kimi_k2_parser.rs::test_parse_complex_json_arguments`.
+
+## `CASE.8` — Streaming
+
+Chunked input arriving over SSE. Covers two concerns that tend to fail
+together:
+
+1. **Token-by-token assembly** — the parser incrementally reconstructs the
+   tool-call structure across many small chunks.
+2. **Chunk-boundary splits** — start fence, end fence, or parameter name /
+   value straddles a chunk boundary. Partial-token matching must return
+   `true` (keep buffering, don't flush as plain text) and complete the
+   match on the next chunk.
+
+- Applies to every tool-call parser. Dominant production path.
+- Example: `basic::test_buffer_state_persistence_across_calls`,
+  `basic::test_partial_token_matching_closing_tag`,
+  `basic::test_kimi_k2_one_shot_split`,
+  `test_streaming_tool_parsers::test_deepseek_v4_e2e_fragmented_tokens_vllm`.
+
+## `CASE.9` — Paired reasoning + tool in same response
+
+Model emits `<think>...</think>` (or analog) followed by a tool call. Both
+must be extracted: `reasoning_content` populated AND `tool_calls` populated.
+
+- Applies to every (tool, reasoning) parser pair.
+- Watch for the "unclosed think-tag swallows tool call" bug — if the reasoning
+  parser is greedy it may eat the tool-call content that follows.
+- Example: `test_reasoning_parser::test_nemotron_with_reasoning_and_tool_calls`,
+  `test_reasoning_parser::test_kimi_k25_with_reasoning_and_tool_calls`.
+
+## `CASE.10` — Reasoning only
+
+`<think>...</think>` or analog present, no tool call. Parser must populate
+`reasoning_content` and leave `tool_calls` empty.
+
+- Applies to every reasoning parser.
+- Example: `reasoning/base_parser.rs::test_detect_and_parse_reasoning_reasoning`,
+  `reasoning/mod.rs::test_deepseek_v4_detect_and_parse`.
+
+## `CASE.11` — `tool_choice` = auto / required / named / none
+
+Each of the four OpenAI `tool_choice` modes exercised per parser.
+
+- Applies to every tool-call parser.
+- Cross-parser suites at `lib/llm/tests/tool_choice.rs` /
+  `parallel_tool_call_integration.rs` / `tool_choice_finish_reasons.rs`
+  run `hermes` only today. Adding a new parser requires parametrizing those
+  suites or adding a per-parser equivalent.
+- Universal gap across most parsers in the repo as of 2026-04.
+- Example: `tool_choice::test_named_tool_choice_parses_json`,
+  `tool_choice::test_required_tool_choice_parses_json_array`.
+
+## `CASE.12` — `finish_reason` semantics
+
+`stop` vs `tool_calls` vs `length` mapping, in both streaming and
+non-streaming paths.
+
+- Applies to every tool-call parser.
+- When a tool call lands, `finish_reason` must become `tool_calls`. When
+  `max_tokens` truncates mid-stream, `length` must propagate — this is
+  often the signal that should trigger `CASE.5` recovery on the parser side.
+- Example: `tool_choice_finish_reasons::test_named_tool_choice_normal_stop_becomes_tool_calls`,
+  `test_streaming_tool_parsers::test_qwen_finish_reason_length_vllm`.
+
+## `CASE.13` — Normal text interleaved with tool calls
+
+Model emits narration text before / after / between tool-call blocks. Parser
+must split content correctly: text → `normal_content`, calls → `tool_calls`.
+
+- Applies to every tool-call parser.
+- Example: `dsml/parser.rs::test_parse_with_normal_text`,
+  `test_streaming_tool_parsers.rs::test_deepseek_v4_e2e_content_before_tool_vllm`.
+
+## `CASE.14` — Empty content / empty `tool_calls` array / null response
+
+Engine emits a chunk with `delta.content = ""`, or a final response with
+`tool_calls: []`, or `null` values inside arguments.
+
+- Applies to every tool-call parser.
+- Null-value handling inside parameters is parser-level (`parse_parameters`
+  in DSML handles it via `serde_json::Value::Null`). Empty-choices /
+  empty-stream handling is typically at the e2e integration layer.
+- Example: `dsml/parser.rs::test_parse_null_parameter`,
+  `parallel_tool_call_integration::test_empty_tool_calls`.
+
+## `CASE.15` — Duplicate tool calls (same name twice)
+
+Two calls to the same function name in one response, possibly with the same
+arguments.
+
+- Applies to every tool-call parser.
+- **Zero coverage across the entire repo as of 2026-04.** Universal gap.
+- Expected behavior: both calls must appear in `tool_calls` with distinct
+  IDs. (The runtime / client is responsible for deciding whether duplicate
+  invocation is intended.)
+
+## `CASE.16` — Regression for a specific customer bug
+
+Test named after (or containing) a ticket reference, pinning the fix for
+a customer-reported failure.
+
+- Applies per-incident. Not a category every parser needs to cover in
+  advance; populated as bugs are reported and fixed.
+- Existing example: `kimi_k2_parser.rs::test_parse_malformed_no_section_end`.
+
+---
+
+## `CASE.xml1` — XML entity / HTML unescape handling
+
+Parameter values contain XML-encoded entities (`&lt;`, `&amp;`, `&quot;`,
+`&apos;`, numeric entities like `&#38;`) that must be decoded before the
+value is surfaced to the client.
+
+- Applies only to XML-family tool-call parsers: `hermes`, `glm47`,
+  `qwen3_coder`, `minimax_m2`, `kimi_k2` (despite its special-token outer
+  fence, the inner parameter payload is XML-ish).
+- **N/A for DSML** — the `string="true|false"` attribute tells the parser
+  whether to JSON-decode or pass through verbatim; no entity decoding pass.
+- **N/A for JSON-family and Harmony** — JSON has its own escape semantics
+  handled by `serde_json`.
+- Example: `xml/parser.rs::test_html_unescape`,
+  `glm47_parser.rs::test_xml_entity_decoding`.
+
+## `CASE.xml2` — Schema-aware type coercion
+
+Parser uses the declared tool schema to coerce string args to
+number / bool / array based on the declared parameter type.
+
+- Applies only to XML-family parsers without explicit type annotations in
+  the wire format. `xml/parser.rs`, `glm47_parser.rs` do this.
+- **N/A for DSML** — the `string="true|false"` attribute carries the type
+  intent per parameter, so no schema lookup is needed.
+- **N/A for JSON-family** — JSON has native types.
+- **N/A for Harmony** — payload is JSON inside the channel envelope.
+- Example: `xml/parser.rs::test_schema_aware_type_conversion`,
+  `glm47_parser.rs::test_type_coercion_array_comma_separated`.
+
+---
+
+## `CASE.harmony1` — Channel / recipient parsing
+
+OpenAI Harmony's token stream carries channel metadata
+(`<|channel|>analysis|commentary|final<|message|>`) and recipient targets
+(`to=functions.foo`). Parser must route the `commentary` channel content
+into tool-call extraction while surfacing `analysis` as reasoning and
+`final` as the user-visible output.
+
+- **Harmony only.** N/A for every other family.
+- Example: `harmony/harmony_parser.rs::test_parse_tool_calls_harmony_with_multi_args`,
+  `harmony/harmony_parser.rs::test_parse_tool_calls_harmony_with_normal_text`.
+
+---
+
+## Applicability summary
+
+| Category block | Parsers | Notes |
+| -- | -- | -- |
+| `CASE.1`–`CASE.16` (generic) | All | Required contract for every parser |
+| `CASE.xml1`–`CASE.xml2` | XML-family only | Entity decoding + schema-aware coercion |
+| `CASE.harmony1` | Harmony only | Channel routing |
+
+## Adding a new parser: what you must include
+
+Minimum viable set for a new tool-call parser:
+
+1. `CASE.1`, `CASE.2`, `CASE.3` — baseline correctness.
+2. `CASE.4` or explicit N/A justification — handle or refuse malformed input.
+3. `CASE.5` — pin behavior when the outer fence is missing. Silent drop is a
+   regression waiting to happen.
+4. `CASE.6`, `CASE.7` — empty and complex args.
+5. `CASE.8` — streaming. Essentially non-negotiable for any parser that sits
+   behind a streaming frontend.
+6. `CASE.13` — interleaved text.
+7. `CASE.15` — document whether duplicate calls are supported. Flat gap
+   today; landing a test with the parser establishes the contract.
+8. Family-specific categories where applicable: `CASE.xml1` / `CASE.xml2`
+   for XML grammars, `CASE.harmony1` for Harmony.
+
+For reasoning parsers, replace `CASE.4` / `CASE.5` / `CASE.8`-assembly with
+`CASE.8`-partial-close-tag and `CASE.10` (reasoning-only).
--- a/lib/parsers/src/reasoning/mod.rs
+++ b/lib/parsers/src/reasoning/mod.rs
@@ -29,6 +29,22 @@ fn get_reasoning_parser_map() -> &'static HashMap<&'static str, ReasoningParserT
        map.insert("basic", ReasoningParserType::Basic);
        map.insert("gpt_oss", ReasoningParserType::GptOss);
        map.insert("qwen3", ReasoningParserType::Qwen);
+        // DeepSeek-V4 uses the same `<think>` / `</think>` delimiters as Qwen
+        // (confirmed against deepseek-ai/DeepSeek-V4-Pro's encoding_dsv4.py)
+        // so it delegates to the same `BasicReasoningParser` config today. We
+        // still route through a dedicated `DeepSeekV4` variant rather than
+        // hard-aliasing to `Qwen` so future divergence (different special
+        // tokens, max-thinking mode, etc.) has a place to land without rippling
+        // through Qwen's own config.
+        //
+        // The three name aliases exist because callers set this via
+        // `--dyn-reasoning-parser` / `--reasoning-parser` with whatever string
+        // the HF model / vLLM recipe / chat-template author picked. We accept
+        // all three separator conventions (snake / kebab / concat) rather than
+        // force a single canonical form on users.
+        map.insert("deepseek_v4", ReasoningParserType::DeepSeekV4);
+        map.insert("deepseek-v4", ReasoningParserType::DeepSeekV4);
+        map.insert("deepseekv4", ReasoningParserType::DeepSeekV4);
        map.insert("nemotron_deci", ReasoningParserType::NemotronDeci);
        map.insert("kimi", ReasoningParserType::Kimi);
        map.insert("kimi_k25", ReasoningParserType::KimiK25);
@@ -110,6 +126,14 @@ pub enum ReasoningParserType {
    Basic,
    GptOss,
    Qwen,
+    /// DeepSeek-V4-Pro / V4-Flash. Currently uses the same `<think>` /
+    /// `</think>` `BasicReasoningParser` config as Qwen (V4 never appends
+    /// `<think>` in the completion — the chat template always pre-injects it,
+    /// so the parser starts via `set_in_reasoning(true)` rather than
+    /// `force_reasoning`). A dedicated variant keeps future V4-specific
+    /// divergence (different delimiters, thinking-effort modes) from leaking
+    /// into Qwen's behavior.
+    DeepSeekV4,
    NemotronDeci,
    Kimi,
    KimiK25,
@@ -161,6 +185,12 @@ impl ReasoningParserType {
            ReasoningParserType::Qwen => ReasoningParserWrapper {
                parser: Box::new(basic_parser),
            },
+            // Same `<think>` / `</think>` config as Qwen today; kept as a
+            // distinct variant so V4-specific divergence has somewhere to land.
+            // See `ReasoningParserType::DeepSeekV4` docstring for rationale.
+            ReasoningParserType::DeepSeekV4 => ReasoningParserWrapper {
+                parser: Box::new(basic_parser),
+            },
            ReasoningParserType::NemotronDeci => ReasoningParserWrapper {
                parser: Box::new(basic_parser),
            },
@@ -246,6 +276,9 @@ mod tests {
            "basic",
            "gpt_oss",
            "qwen3",
+            "deepseek_v4",
+            "deepseek-v4",
+            "deepseekv4",
            "nemotron_deci",
            "kimi",
            "kimi_k25",
@@ -261,6 +294,45 @@ mod tests {
            assert!(parsers.contains(&parser));
        }
    }
+    /// `CASE.10` — reasoning-only (V4 `<think>`/`</think>`).
+
+    #[test]
+    fn test_deepseek_v4_detect_and_parse() {
+        for parser_name in ["deepseek_v4", "deepseek-v4", "deepseekv4"] {
+            let mut parser = ReasoningParserType::get_reasoning_parser_from_name(parser_name);
+            let result = parser.detect_and_parse_reasoning("<think>thinking</think>answer", &[]);
+            assert_eq!(result.reasoning_text, "thinking");
+            assert_eq!(result.normal_text, "answer");
+        }
+    }
+    /// `CASE.3` / `CASE.10` — no reasoning tags ⇒ no `reasoning_content`.
+
+    #[test]
+    fn test_deepseek_v4_no_forced_reasoning_without_tags() {
+        let mut parser = ReasoningParserType::get_reasoning_parser_from_name("deepseek_v4");
+        let result = parser.detect_and_parse_reasoning("answer only", &[]);
+        assert_eq!(result.reasoning_text, "");
+        assert_eq!(result.normal_text, "answer only");
+    }
+    /// `CASE.8` — streaming reasoning parse (chunked).
+
+    #[test]
+    fn test_deepseek_v4_streaming() {
+        let mut parser = ReasoningParserType::get_reasoning_parser_from_name("deepseek_v4");
+
+        let chunks = ["<think>rea", "son</think>answer"];
+        let mut reasoning = String::new();
+        let mut normal = String::new();
+
+        for chunk in chunks {
+            let result = parser.parse_reasoning_streaming_incremental(chunk, &[]);
+            reasoning.push_str(&result.reasoning_text);
+            normal.push_str(&result.normal_text);
+        }
+
+        assert_eq!(reasoning, "reason");
+        assert_eq!(normal, "answer");
+    }

    #[test]
    fn test_kimi_k25_detect_and_parse() {

--- a/lib/parsers/src/tool_calling/config.rs
+++ b/lib/parsers/src/tool_calling/config.rs
@@ -391,6 +391,22 @@ impl ToolCallConfig {
        }
    }

+    pub fn deepseek_v4() -> Self {
+        // DeepSeek V4 format (DSML):
+        // <｜DSML｜tool_calls>
+        // <｜DSML｜invoke name="function_name">
+        // <｜DSML｜parameter name="param_name" string="true|false">value</｜DSML｜parameter>
+        // </｜DSML｜invoke>
+        // </｜DSML｜tool_calls>
+        Self {
+            parser_config: ParserConfig::Dsml(DsmlParserConfig {
+                function_calls_start: "<｜DSML｜tool_calls>".to_string(),
+                function_calls_end: "</｜DSML｜tool_calls>".to_string(),
+                ..Default::default()
+            }),
+        }
+    }
+
    pub fn minimax_m2() -> Self {
        // MiniMax-M2.1 format:
        // <minimax:tool_call>

--- a/lib/parsers/src/tool_calling/dsml/parser.rs
+++ b/lib/parsers/src/tool_calling/dsml/parser.rs
--- a/lib/parsers/src/tool_calling/parsers.rs
+++ b/lib/parsers/src/tool_calling/parsers.rs
@@ -43,6 +43,9 @@ pub fn get_tool_parser_map() -> &'static HashMap<&'static str, ToolCallConfig> {
        map.insert("deepseek_v3", ToolCallConfig::deepseek_v3());
        map.insert("deepseek_v3_1", ToolCallConfig::deepseek_v3_1());
        map.insert("deepseek_v3_2", ToolCallConfig::deepseek_v3_2());
+        map.insert("deepseek_v4", ToolCallConfig::deepseek_v4());
+        map.insert("deepseek-v4", ToolCallConfig::deepseek_v4());
+        map.insert("deepseekv4", ToolCallConfig::deepseek_v4());
        map.insert("qwen3_coder", ToolCallConfig::qwen3_coder());
        map.insert("jamba", ToolCallConfig::jamba());
        map.insert("minimax_m2", ToolCallConfig::minimax_m2());
@@ -241,6 +244,9 @@ mod tests {
            "deepseek_v3",
            "deepseek_v3_1",
            "deepseek_v3_2",
+            "deepseek_v4",
+            "deepseek-v4",
+            "deepseekv4",
            "qwen3_coder",
            "jamba",
            "nemotron_nano",
@@ -1701,6 +1707,56 @@ Remember, San Francisco weather can be quite unpredictable, particularly with it
        assert_eq!(args["topn"], 10); // Should be number, not string
        assert_eq!(args["source"], "web");
    }
+    /// `CASE.1` — single-call happy path (V4).
+
+    #[tokio::test]
+    async fn test_deepseek_v4_single_tool_call() {
+        let input = r#"<｜DSML｜tool_calls>
+<｜DSML｜invoke name="get_datetime">
+<｜DSML｜parameter name="timezone" string="true">Asia/Shanghai</｜DSML｜parameter>
+</｜DSML｜invoke>
+</｜DSML｜tool_calls>"#;
+
+        let (tool_calls, normal_text) =
+            detect_and_parse_tool_call(input, Some("deepseek_v4"), None)
+                .await
+                .expect("Failed to parse");
+
+        assert_eq!(tool_calls.len(), 1);
+        assert_eq!(tool_calls[0].function.name, "get_datetime");
+        assert_eq!(normal_text, Some("".to_string()));
+
+        let args: serde_json::Value =
+            serde_json::from_str(&tool_calls[0].function.arguments).unwrap();
+        assert_eq!(args["timezone"], "Asia/Shanghai");
+    }
+    /// Alias registration: verifies `deepseek-v4` and `deepseekv4` route to the same parser as `deepseek_v4`. Not a CASE.*; covers registry plumbing.
+
+    #[tokio::test]
+    async fn test_deepseek_v4_compatibility_aliases() {
+        let input = r#"<｜DSML｜tool_calls>
+<｜DSML｜invoke name="search">
+<｜DSML｜parameter name="query" string="true">search agent benchmark 2024</｜DSML｜parameter>
+<｜DSML｜parameter name="topn" string="false">10</｜DSML｜parameter>
+<｜DSML｜parameter name="source" string="true">web</｜DSML｜parameter>
+</｜DSML｜invoke>
+</｜DSML｜tool_calls>"#;
+
+        for parser_name in ["deepseek_v4", "deepseek-v4", "deepseekv4"] {
+            let (tool_calls, _) = detect_and_parse_tool_call(input, Some(parser_name), None)
+                .await
+                .expect("Failed to parse");
+
+            assert_eq!(tool_calls.len(), 1);
+            assert_eq!(tool_calls[0].function.name, "search");
+
+            let args: serde_json::Value =
+                serde_json::from_str(&tool_calls[0].function.arguments).unwrap();
+            assert_eq!(args["query"], "search agent benchmark 2024");
+            assert_eq!(args["topn"], 10);
+            assert_eq!(args["source"], "web");
+        }
+    }

    #[tokio::test]
    async fn test_hermes_parser_without_new_line() {

--- a/tests/frontend/test_tool_calling_sglang.py
+++ b/tests/frontend/test_tool_calling_sglang.py
@@ -794,24 +794,6 @@ class TestToolCallingProtocol:
            names.add(tc["function"]["name"])
        assert len(names) >= 2, f"expected at least 2 distinct tools, got {names}"

-    def test_tool_call_ids_unique_in_single_response(self, client: OpenAI, model: str):
-        result = stream_chat(
-            client,
-            model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": "Get weather for New York, London, and Tokyo.",
-                }
-            ],
-            tools=TOOLS_WEATHER,
-            tool_choice="required",
-            parallel_tool_calls=True,
-        )
-        assert_finish_reason(result, {"tool_calls"})
-        ids = [tc["id"] for tc in result.tool_calls]
-        assert len(ids) == len(set(ids)), f"duplicate tool ids: {ids}"
-
    def test_array_argument_schema_valid(self, client: OpenAI, model: str):
        tools = [
            {