feat: interleaved thinking support in reasoning parser (#6422)

Signed-off-by: Matej Kosec <mkosec@nvidia.com>

feat: interleaved thinking support in reasoning parser (#6422)
Signed-off-by: Matej Kosec <mkosec@nvidia.com>
d82b0050 · MatejKosec · GitHub · 7409bd3a · d82b0050 · d82b0050
Unverified Commit d82b0050 authored Feb 20, 2026 by MatejKosec Committed by GitHub Feb 20, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 662 additions and 105 deletions

lib/parsers/src/reasoning/base_parser.rs lib/parsers/src/reasoning/base_parser.rs +660 -105

lib/parsers/src/reasoning/mod.rs lib/parsers/src/reasoning/mod.rs +2 -0

No files found.
--- a/lib/parsers/src/reasoning/base_parser.rs
+++ b/lib/parsers/src/reasoning/base_parser.rs
 // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

+//! # Reasoning and Tool Call Interplay
+//!
+//! Models like GLM-4.5/4.7 and Qwen3 interleave reasoning blocks with tool calls:
+//!
+//! ```text
+//! <think>reasoning about what tool to call</think>
+//! <tool_call>get_weather<arg_key>city</arg_key><arg_value>Beijing</arg_value></tool_call>
+//! <think>reasoning about the result</think>
+//! <tool_call>summarize<arg_key>text</arg_key><arg_value>...</arg_value></tool_call>
+//! ```
+//!
+//! The reasoning parser and the tool call parser are **independent, sequential** stages:
+//!
+//! 1. **Reasoning parser** (`BasicReasoningParser`) splits the stream into:
+//!    - `reasoning_content`: everything inside `<think>...</think>` blocks
+//!    - `normal_text`: everything outside (including tool call tags)
+//! 2. **Tool call parser** (`glm47` / others) then processes `normal_text` to extract
+//!    `<tool_call>...</tool_call>` blocks.
+//!
+//! This means tool calls **must** appear outside `<think>` blocks to be detected.
+//! If a model erroneously emits a tool call inside a `<think>` block (observed in
+//! GLM-4.7 under very long contexts), the tool call parser will not see it.
+//!
+//! ## `force_reasoning` and tokenizer behavior
+//!
+//! Some models (e.g. GLM-5-FP8 served via ZAI) consume `<think>` as a special
+//! tokenizer token and never emit it as literal text. In that case use
+//! `force_reasoning=true` (`deepseek_r1` parser), which treats all output as
+//! reasoning until `</think>` is seen. Models that do emit `<think>` as text
+//! (standard serving, Qwen3, GLM-4.5) should use `force_reasoning=false`
+//! (`glm45`, `nemotron_deci`, `qwen3` parsers).
+
 use crate::{ParserResult, ReasoningParser};

+/// Returns the length of the longest suffix of `s` that is also a prefix of `delim`.
+///
+/// Ported from ollama's `thinking/parser.go::overlap()`. Used to detect partial
+/// tags split across streaming chunk boundaries (e.g., `"Hello world <th"` where
+/// `<th` is a prefix of `<think>`).
+fn overlap(s: &str, delim: &str) -> usize {
+    let max = delim.len().min(s.len());
+    for i in (1..=max).rev() {
+        if !delim.is_char_boundary(i) {
+            continue; // Skip mid-codepoint positions (e.g., multi-byte `◁` in Kimi tags)
+        }
+        if s.ends_with(&delim[..i]) {
+            return i;
+        }
+    }
+    0
+}
+
 #[derive(Default, Debug, Clone)]
 pub struct BasicReasoningParser {
    think_start_token: String,
@@ -33,7 +83,8 @@ impl BasicReasoningParser {

 impl ReasoningParser for BasicReasoningParser {
    fn detect_and_parse_reasoning(&mut self, text: &str, _token_ids: &[u32]) -> ParserResult {
-        let in_reasoning = self._in_reasoning || text.contains(&self.think_start_token);
+        let has_think_tag = text.contains(&self.think_start_token);
+        let in_reasoning = self._in_reasoning || has_think_tag;
        if !in_reasoning {
            return ParserResult {
                normal_text: text.to_string(),
@@ -41,24 +92,53 @@ impl ReasoningParser for BasicReasoningParser {
            };
        }

-        // The text is considered to be in a reasoning block.
-        let processed_text = text.replace(&self.think_start_token, "").trim().to_string();
-
-        if !processed_text.contains(&self.think_end_token) {
-            // Assume reasoning was truncated before `think_end_token`
+        // If force_reasoning and no start tag, treat entire text as reasoning
+        if self._in_reasoning && !has_think_tag && !text.contains(&self.think_end_token) {
            return ParserResult {
                normal_text: String::new(),
-                reasoning_text: processed_text,
+                reasoning_text: text.to_string(),
            };
        }

-        // Extract reasoning content
-        let splits: Vec<&str> = processed_text.splitn(2, &self.think_end_token).collect();
-        let reasoning_text = splits.first().unwrap_or(&"").to_string();
-        let normal_text = splits
-            .get(1)
-            .map(|s| s.trim().to_string())
-            .unwrap_or_default();
+        // Extract all <think>...</think> pairs using cursor-based iteration
+        let mut reasoning_parts = Vec::new();
+        let mut normal_parts = Vec::new();
+        let mut cursor = 0;
+        let mut currently_reasoning = self._in_reasoning;
+
+        while cursor < text.len() {
+            if currently_reasoning {
+                // We're inside a reasoning block — look for end token
+                if let Some(end_offset) = text[cursor..].find(&self.think_end_token) {
+                    reasoning_parts.push(&text[cursor..cursor + end_offset]);
+                    cursor += end_offset + self.think_end_token.len();
+                    currently_reasoning = false;
+                } else {
+                    // No end token — rest is reasoning (truncated)
+                    reasoning_parts.push(&text[cursor..]);
+                    cursor = text.len();
+                }
+            } else {
+                // We're in normal text — look for start token
+                if let Some(start_offset) = text[cursor..].find(&self.think_start_token) {
+                    normal_parts.push(&text[cursor..cursor + start_offset]);
+                    cursor += start_offset + self.think_start_token.len();
+                    currently_reasoning = true;
+                } else {
+                    // No more think blocks — rest is normal text
+                    normal_parts.push(&text[cursor..]);
+                    cursor = text.len();
+                }
+            }
+        }
+
+        let reasoning_text = reasoning_parts.join("").trim().to_string();
+        let normal_text = normal_parts.join("").trim().to_string();
+
+        // Note: self._in_reasoning is intentionally NOT updated here. This method is
+        // documented to "reset or ignore internal streaming state" (see trait doc). Callers
+        // should not mix detect_and_parse_reasoning with parse_reasoning_streaming_incremental
+        // on the same parser instance.

        ParserResult {
            normal_text,
@@ -71,86 +151,93 @@ impl ReasoningParser for BasicReasoningParser {
        text: &str,
        _token_ids: &[u32],
    ) -> ParserResult {
-        // Incrementally parse the streaming text
        self._buffer.push_str(text);
-        let mut current_text = self._buffer.to_string();
-        // If the current text is a prefix of the think token, keep buffering.
-        // Only buffer for start token if we haven't found it yet.
-        // Only buffer for end token if we're currently inside a reasoning block.
-        // After reasoning ends, all content passes through as normal text.
-
-        if !self.stripped_think_start
-            && self.think_start_token.starts_with(&current_text)
-            && self.think_start_token.as_str() != current_text.as_str()
-        {
-            return ParserResult {
-                normal_text: String::new(),
-                reasoning_text: String::new(),
-            };
-        }
-        if self._in_reasoning
-            && self.think_end_token.starts_with(&current_text)
-            && self.think_end_token.as_str() != current_text.as_str()
-        {
-            return ParserResult {
-                normal_text: String::new(),
-                reasoning_text: String::new(),
-            };
-        }

-        // Strip `<think>` token if present
-        if !self.stripped_think_start && current_text.contains(&self.think_start_token) {
-            current_text = current_text.replace(&self.think_start_token, "");
-            self._buffer = current_text.to_string();
-            self.stripped_think_start = true;
-            self._in_reasoning = true;
-        }
-        // Handle end of reasoning block
-        let mut think_end_idx = current_text.len();
-        if self._in_reasoning {
-            think_end_idx = current_text
-                .find(&self.think_end_token)
-                .unwrap_or(current_text.len());
-        }
-        if self._in_reasoning && think_end_idx < current_text.len() {
-            let reasoning_text = &current_text[..think_end_idx];
-            self._buffer.clear();
-            self._in_reasoning = false;
-            let start_idx = think_end_idx + self.think_end_token.len();
-            let normal_text = if start_idx < current_text.len() {
-                &current_text[start_idx..]
-            } else {
-                ""
-            };
-            return ParserResult {
-                normal_text: normal_text.to_string(),
-                reasoning_text: reasoning_text.to_string(),
-            };
-        }
-        // Continue with reasoning content
-        if self._in_reasoning && self.stream_reasoning {
-            // Stream the content immediately
-            let reasoning_text = current_text;
-            self._buffer.clear();
-            ParserResult {
-                normal_text: String::new(),
-                reasoning_text,
-            }
-        } else if !self._in_reasoning {
-            // If we're not in a reasoning block return as normal text
-            let normal_text = current_text;
-            self._buffer.clear();
-            ParserResult {
-                normal_text,
-                reasoning_text: String::new(),
+        let mut accumulated_normal = String::new();
+        let mut accumulated_reasoning = String::new();
+
+        // Loop to exhaust all state transitions within a single chunk. Without this,
+        // a chunk containing two complete <think>...</think> blocks would process only
+        // the first transition and buffer the rest, risking content loss at end-of-stream.
+        loop {
+            let current_text = self._buffer.clone();
+
+            // Strip leading <think> tag if not yet stripped. Handles two cases:
+            // 1. force_reasoning=true where the model also emits <think> as text
+            // 2. First call where <think> arrives at buffer position 0
+            // Mid-text <think> (position > 0) falls through to the find() branch below.
+            if !self.stripped_think_start
+                && current_text.starts_with(self.think_start_token.as_str())
+            {
+                self._buffer = current_text[self.think_start_token.len()..].to_string();
+                self.stripped_think_start = true;
+                self._in_reasoning = true;
+                continue;
            }
-        } else {
-            // If we are in a reasoning block but no end token is found, return the current buffer
-            ParserResult {
-                normal_text: String::new(),
-                reasoning_text: String::new(),
+
+            if self._in_reasoning {
+                if let Some(end_idx) = current_text.find(self.think_end_token.as_str()) {
+                    // End of reasoning block: accumulate content and transition out.
+                    accumulated_reasoning.push_str(&current_text[..end_idx]);
+                    let after_end = end_idx + self.think_end_token.len();
+                    self._buffer = current_text[after_end..].to_string();
+                    self._in_reasoning = false;
+                    self.stripped_think_start = false; // Allow detecting next <think> block
+                    continue; // Process remainder — may contain further blocks
+                } else {
+                    // No complete end token — check for partial at end of buffer
+                    // (e.g., "reasoning content</th" where "</th" is a prefix of "</think>").
+                    if self.stream_reasoning {
+                        let ol = overlap(&current_text, &self.think_end_token);
+                        if ol >= 2 {
+                            let safe_end = current_text.len() - ol;
+                            if safe_end > 0 {
+                                accumulated_reasoning.push_str(&current_text[..safe_end]);
+                            }
+                            self._buffer = current_text[safe_end..].to_string();
+                        } else {
+                            accumulated_reasoning.push_str(&current_text);
+                            self._buffer.clear();
+                        }
+                    }
+                    // When stream_reasoning=false, buffer retains all content until
+                    // </think> arrives — no overlap check needed.
+                    break;
+                }
+            } else {
+                // Not in reasoning — look for the next <think> block.
+                if let Some(think_pos) = current_text.find(self.think_start_token.as_str()) {
+                    accumulated_normal.push_str(&current_text[..think_pos]);
+                    let after_start = think_pos + self.think_start_token.len();
+                    self._buffer = current_text[after_start..].to_string();
+                    self._in_reasoning = true;
+                    self.stripped_think_start = true;
+                    continue; // Process reasoning content
+                } else {
+                    // No complete start token — check for partial at end of buffer
+                    // (e.g., "Hello world <th" where "<th" is a prefix of "<think>").
+                    // Require overlap >= 2 so a lone `<` passes through for tool call
+                    // XML tags like `<invoke>` or `<minimax:tool_call>`.
+                    let ol = overlap(&current_text, &self.think_start_token);
+                    if ol >= 2 {
+                        let safe_end = current_text.len() - ol;
+                        if safe_end > 0 {
+                            accumulated_normal.push_str(&current_text[..safe_end]);
+                        }
+                        self._buffer = current_text[safe_end..].to_string();
+                    } else {
+                        accumulated_normal.push_str(&current_text);
+                        self._buffer.clear();
+                    }
+                    break;
+                }
            }
        }
+
+        ParserResult {
+            normal_text: accumulated_normal,
+            reasoning_text: accumulated_reasoning,
+        }
    }
 }

@@ -222,9 +309,8 @@ mod tests {
            "<think>first reasoning</think> middle <think>second reasoning</think> end",
            &[],
        );
-        // The current implementation only handles the first occurrence properly
-        assert_eq!(result.normal_text, "middle second reasoning</think> end");
-        assert_eq!(result.reasoning_text, "first reasoning");
+        assert_eq!(result.normal_text, "middle  end");
+        assert_eq!(result.reasoning_text, "first reasoningsecond reasoning");
    }

    #[test]
@@ -236,11 +322,11 @@ mod tests {
        assert_eq!(result1.normal_text, " middle");
        assert_eq!(result1.reasoning_text, "first reasoning");

-        // Basic parser assumes only one reasoning block at a time
+        // Second reasoning block: space before <think> is normal prefix, reasoning extracted
        let result2 = parser
            .parse_reasoning_streaming_incremental(" <think>second reasoning</think> end", &[]);
-        assert_eq!(result2.normal_text, " <think>second reasoning</think> end");
-        assert_eq!(result2.reasoning_text, "");
+        assert_eq!(result2.reasoning_text, "second reasoning");
+        assert_eq!(result2.normal_text, "  end"); // " " prefix + " end" suffix
    }

    #[test]
@@ -334,10 +420,11 @@ mod tests {
            "<think>outer <think>inner</think> reasoning</think> normal",
            &[],
        );
-        // Current implementation should handle this by finding the first closing tag
+        // Cursor-based parsing: first <think> starts reasoning, first </think> ends it.
+        // "outer <think>inner" is reasoning (inner <think> is just text within reasoning).
+        // " reasoning</think> normal" is normal text (stray </think> passes through).
+        assert_eq!(result.reasoning_text, "outer <think>inner");
        assert_eq!(result.normal_text, "reasoning</think> normal");
-        // All <think> tags are stripped, so <think>inner is not included
-        assert_eq!(result.reasoning_text, "outer inner");
    }

    #[test]
@@ -364,9 +451,10 @@ mod tests {
            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
        let result = parser
            .detect_and_parse_reasoning("<think>first <think>second reasoning</think> normal", &[]);
-        // Should handle by replacing all opening tags and using first closing tag
+        // Cursor-based: first <think> opens reasoning, finds first </think>.
+        // Inner <think> is just text within the reasoning block.
+        assert_eq!(result.reasoning_text, "first <think>second reasoning");
        assert_eq!(result.normal_text, "normal");
-        assert_eq!(result.reasoning_text, "first second reasoning");
    }

    #[test]
@@ -399,7 +487,7 @@ mod tests {
    #[test]
    fn test_streaming_reset_state_after_complete_block() {
        let mut parser =
-            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, false);
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);

        // Process complete reasoning block
        let result1 =
@@ -412,12 +500,28 @@ mod tests {
        assert_eq!(result2.normal_text, " more normal text");
        assert_eq!(result2.reasoning_text, "");

-        // Basic parser does not expect more than one reasoning block at a time
-        // So this should not affect the state
+        // Subsequent reasoning blocks should now be parsed (interleaved thinking)
+        // The leading " " before <think> is normal-text prefix; " final" is suffix.
        let result3 = parser
            .parse_reasoning_streaming_incremental(" <think>new reasoning</think> final", &[]);
-        assert_eq!(result3.normal_text, " <think>new reasoning</think> final");
-        assert_eq!(result3.reasoning_text, "");
+        assert_eq!(result3.reasoning_text, "new reasoning");
+        assert_eq!(result3.normal_text, "  final"); // " " prefix + " final" suffix
+
+        // Same test with separate chunks for clarity
+        let mut parser2 =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser2.parse_reasoning_streaming_incremental("<think>first</think> normal", &[]);
+        assert_eq!(r1.reasoning_text, "first");
+        assert_eq!(r1.normal_text, " normal");
+
+        let r2 = parser2.parse_reasoning_streaming_incremental(" between", &[]);
+        assert_eq!(r2.normal_text, " between");
+        assert_eq!(r2.reasoning_text, "");
+
+        let r3 = parser2.parse_reasoning_streaming_incremental("<think>second</think> final", &[]);
+        assert_eq!(r3.reasoning_text, "second");
+        assert_eq!(r3.normal_text, " final");
    }

    #[test]
@@ -474,4 +578,455 @@ mod tests {
        let r6 = parser.parse_reasoning_streaming_incremental("invoke name=\"get_weather\">", &[]);
        assert_eq!(r6.normal_text, "invoke name=\"get_weather\">");
    }
+
+    #[test]
+    fn test_interleaved_streaming_across_chunks() {
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental("<think>thought 1</think>", &[]);
+        assert_eq!(r1.reasoning_text, "thought 1");
+        assert_eq!(r1.normal_text, "");
+
+        let r2 = parser.parse_reasoning_streaming_incremental(" answer 1 ", &[]);
+        assert_eq!(r2.normal_text, " answer 1 ");
+        assert_eq!(r2.reasoning_text, "");
+
+        let r3 = parser.parse_reasoning_streaming_incremental("<think>thought 2</think>", &[]);
+        assert_eq!(r3.reasoning_text, "thought 2");
+        assert_eq!(r3.normal_text, "");
+
+        let r4 = parser.parse_reasoning_streaming_incremental(" answer 2", &[]);
+        assert_eq!(r4.normal_text, " answer 2");
+        assert_eq!(r4.reasoning_text, "");
+
+        let r5 = parser.parse_reasoning_streaming_incremental("<think>thought 3</think>", &[]);
+        assert_eq!(r5.reasoning_text, "thought 3");
+        assert_eq!(r5.normal_text, "");
+
+        let r6 = parser.parse_reasoning_streaming_incremental(" final answer", &[]);
+        assert_eq!(r6.normal_text, " final answer");
+        assert_eq!(r6.reasoning_text, "");
+    }
+
+    #[test]
+    fn test_three_reasoning_blocks_non_streaming() {
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+        let result = parser.detect_and_parse_reasoning(
+            "<think>A</think> one <think>B</think> two <think>C</think> three",
+            &[],
+        );
+        assert_eq!(result.reasoning_text, "ABC");
+        assert_eq!(result.normal_text, "one  two  three");
+    }
+
+    #[test]
+    fn test_streaming_transition_chunk() {
+        // </think> and <think> arrive in the same chunk.
+        // With loop-based processing, the second block's opening content is emitted
+        // immediately (stream_reasoning=true) rather than buffered until the next call.
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental("<think>first", &[]);
+        assert_eq!(r1.reasoning_text, "first");
+
+        // Mid-chunk transition: </think> then normal text then <think> with more content.
+        // The loop transitions out of reasoning, emits " middle " as normal text, enters
+        // the next reasoning block, and streams "second" immediately.
+        let r2 = parser.parse_reasoning_streaming_incremental("</think> middle <think>second", &[]);
+        assert_eq!(r2.reasoning_text, "second");
+        assert_eq!(r2.normal_text, " middle ");
+
+        // Continuation of second reasoning block
+        let r3 = parser.parse_reasoning_streaming_incremental(" more</think> end", &[]);
+        assert_eq!(r3.reasoning_text, " more");
+        assert_eq!(r3.normal_text, " end");
+    }
+
+    #[test]
+    fn test_interleaved_with_force_reasoning() {
+        // deepseek_r1 mode: force_reasoning=true, first tokens are reasoning without <think>
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), true, true);
+
+        // No <think> tag — treated as reasoning because force_reasoning=true
+        let r1 = parser.parse_reasoning_streaming_incremental("initial reasoning", &[]);
+        assert_eq!(r1.reasoning_text, "initial reasoning");
+        assert_eq!(r1.normal_text, "");
+
+        // End of forced reasoning block
+        let r2 = parser.parse_reasoning_streaming_incremental("</think> answer", &[]);
+        assert_eq!(r2.reasoning_text, "");
+        assert_eq!(r2.normal_text, " answer");
+
+        // Second reasoning block with explicit <think>
+        let r3 =
+            parser.parse_reasoning_streaming_incremental("<think>second thought</think> done", &[]);
+        assert_eq!(r3.reasoning_text, "second thought");
+        assert_eq!(r3.normal_text, " done");
+    }
+
+    #[test]
+    fn test_interleaved_partial_think_tag_between_blocks() {
+        // After first reasoning block, partial <think> tag arrives across chunks
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental("<think>first</think> normal", &[]);
+        assert_eq!(r1.reasoning_text, "first");
+        assert_eq!(r1.normal_text, " normal");
+
+        // Partial <think> prefix: "<th" (2 chars, meets threshold)
+        let r2 = parser.parse_reasoning_streaming_incremental("<th", &[]);
+        assert_eq!(r2.normal_text, "");
+        assert_eq!(r2.reasoning_text, "");
+
+        // Complete the tag
+        let r3 = parser.parse_reasoning_streaming_incremental("ink>second</think> end", &[]);
+        assert_eq!(r3.reasoning_text, "second");
+        assert_eq!(r3.normal_text, " end");
+    }
+
+    #[test]
+    fn test_lone_angle_bracket_between_reasoning_blocks() {
+        // A lone `<` between reasoning blocks should pass through (not buffer)
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental("<think>thought</think>", &[]);
+        assert_eq!(r1.reasoning_text, "thought");
+
+        // Lone `<` must not be buffered — could be a tool call
+        let r2 = parser.parse_reasoning_streaming_incremental("<", &[]);
+        assert_eq!(r2.normal_text, "<");
+        assert_eq!(r2.reasoning_text, "");
+
+        let r3 = parser.parse_reasoning_streaming_incremental("tool_call>", &[]);
+        assert_eq!(r3.normal_text, "tool_call>");
+        assert_eq!(r3.reasoning_text, "");
+
+        // But a real <think> should still work after
+        let r4 =
+            parser.parse_reasoning_streaming_incremental("<think>more thought</think> done", &[]);
+        assert_eq!(r4.reasoning_text, "more thought");
+        assert_eq!(r4.normal_text, " done");
+    }
+
+    #[test]
+    fn test_force_reasoning_stream_false_buffers_until_end_token() {
+        // force_reasoning=true, stream_reasoning=false: content is buffered until </think>
+        // arrives, then returned as a single chunk. This is the expected behavior.
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), true, false);
+
+        // No <think> — forced into reasoning, stream_reasoning=false means buffer silently
+        let r1 = parser.parse_reasoning_streaming_incremental("chunk one", &[]);
+        assert_eq!(r1.reasoning_text, "");
+        assert_eq!(r1.normal_text, "");
+
+        let r2 = parser.parse_reasoning_streaming_incremental(" chunk two", &[]);
+        assert_eq!(r2.reasoning_text, "");
+        assert_eq!(r2.normal_text, "");
+
+        // </think> arrives — entire buffered reasoning is flushed
+        let r3 = parser.parse_reasoning_streaming_incremental("</think> answer", &[]);
+        assert_eq!(r3.reasoning_text, "chunk one chunk two");
+        assert_eq!(r3.normal_text, " answer");
+    }
+
+    #[test]
+    fn test_multiple_full_blocks_in_single_streaming_chunk() {
+        // Two complete <think>...</think> blocks arrive in one chunk.
+        // The loop exhausts all transitions in a single call — both blocks are fully
+        // processed and no follow-up call is needed to flush buffered content.
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental(
+            "<think>A</think> mid <think>B</think> end",
+            &[],
+        );
+        assert_eq!(r1.reasoning_text, "AB");
+        assert_eq!(r1.normal_text, " mid  end");
+
+        // Buffer is fully drained; empty follow-up returns nothing
+        let r2 = parser.parse_reasoning_streaming_incremental("", &[]);
+        assert_eq!(r2.reasoning_text, "");
+        assert_eq!(r2.normal_text, "");
+    }
+
+    #[test]
+    fn test_partial_end_token_stream_reasoning_true() {
+        // Partial </think> split across chunks with stream_reasoning=true.
+        // The partial-end-token buffer check only fires when the parser is ALREADY in
+        // reasoning mode from a prior call. If <think> and </th arrive in the same chunk,
+        // stream_reasoning=true emits the reasoning content immediately (including </th).
+        // So <think> must arrive as its own chunk first.
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental("<think>reasoning", &[]);
+        assert_eq!(r1.reasoning_text, "reasoning");
+        assert_eq!(r1.normal_text, "");
+
+        // Partial end token while already in reasoning — buffered, nothing emitted
+        let r2 = parser.parse_reasoning_streaming_incremental("</th", &[]);
+        assert_eq!(r2.reasoning_text, "");
+        assert_eq!(r2.normal_text, "");
+
+        // Complete the end token
+        let r3 = parser.parse_reasoning_streaming_incremental("ink> normal", &[]);
+        assert_eq!(r3.reasoning_text, "");
+        assert_eq!(r3.normal_text, " normal");
+    }
+
+    #[test]
+    fn test_empty_string_input_various_states() {
+        // Empty string input should always return empty results without changing state
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        // State: idle
+        let r1 = parser.parse_reasoning_streaming_incremental("", &[]);
+        assert_eq!(r1.reasoning_text, "");
+        assert_eq!(r1.normal_text, "");
+
+        // Enter reasoning
+        parser.parse_reasoning_streaming_incremental("<think>content", &[]);
+
+        // State: in reasoning
+        let r2 = parser.parse_reasoning_streaming_incremental("", &[]);
+        assert_eq!(r2.reasoning_text, "");
+        assert_eq!(r2.normal_text, "");
+
+        // Complete and exit reasoning
+        parser.parse_reasoning_streaming_incremental("</think>", &[]);
+
+        // State: post-reasoning (normal text)
+        let r3 = parser.parse_reasoning_streaming_incremental("", &[]);
+        assert_eq!(r3.reasoning_text, "");
+        assert_eq!(r3.normal_text, "");
+    }
+
+    #[test]
+    fn test_force_reasoning_stream_false_multiple_blocks() {
+        // force_reasoning=true (deepseek_r1 mode), stream_reasoning=false.
+        // First block uses forced-reasoning (no explicit <think>); subsequent blocks
+        // use explicit tags.
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), true, false);
+
+        // Forced reasoning without open tag, flushed on </think>
+        let r1 =
+            parser.parse_reasoning_streaming_incremental("initial reasoning</think> normal1 ", &[]);
+        assert_eq!(r1.reasoning_text, "initial reasoning");
+        assert_eq!(r1.normal_text, " normal1 ");
+
+        // Subsequent explicit <think> block works correctly
+        let r2 = parser
+            .parse_reasoning_streaming_incremental("<think>second block</think> normal2", &[]);
+        assert_eq!(r2.reasoning_text, "second block");
+        assert_eq!(r2.normal_text, " normal2");
+    }
+
+    #[test]
+    fn test_glm5_pattern_a_burst_single_chunk() {
+        // GLM-5 Pattern A: the entire completion arrives in one SSE event.
+        // Format: <think>T1</think><tool_call>A</tool_call><think>T2</think><tool_call>B</tool_call>
+        //
+        // Both reasoning blocks must be extracted into reasoning_text; both tool calls
+        // must land in normal_text for the downstream tool call parser. No follow-up
+        // call should be needed — the loop fully drains the buffer in a single call.
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental(
+            "<think>T1</think><tool_call>A</tool_call><think>T2</think><tool_call>B</tool_call>",
+            &[],
+        );
+        assert_eq!(r1.reasoning_text, "T1T2");
+        assert_eq!(
+            r1.normal_text,
+            "<tool_call>A</tool_call><tool_call>B</tool_call>"
+        );
+
+        // Buffer is fully drained; stream can end here with no content loss
+        let r2 = parser.parse_reasoning_streaming_incremental("", &[]);
+        assert_eq!(r2.reasoning_text, "");
+        assert_eq!(r2.normal_text, "");
+    }
+
+    #[test]
+    fn test_tool_call_xml_between_reasoning_blocks_streaming() {
+        // GLM-5 Pattern A chunk-by-chunk: verifies that tool call XML between reasoning
+        // blocks lands in normal_text, not reasoning_text, across separate SSE events.
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental("<think>T1</think>", &[]);
+        assert_eq!(r1.reasoning_text, "T1");
+        assert_eq!(r1.normal_text, "");
+
+        let r2 = parser.parse_reasoning_streaming_incremental("<tool_call>A</tool_call>", &[]);
+        assert_eq!(r2.normal_text, "<tool_call>A</tool_call>");
+        assert_eq!(r2.reasoning_text, "");
+
+        let r3 = parser.parse_reasoning_streaming_incremental("<think>T2</think>", &[]);
+        assert_eq!(r3.reasoning_text, "T2");
+        assert_eq!(r3.normal_text, "");
+
+        let r4 = parser.parse_reasoning_streaming_incremental("<tool_call>B</tool_call>", &[]);
+        assert_eq!(r4.normal_text, "<tool_call>B</tool_call>");
+        assert_eq!(r4.reasoning_text, "");
+    }
+
+    // =========================================================================
+    // Mid-string partial tag tests (overlap-based buffering)
+    //
+    // These test scenarios where a <think> or </think> tag is split mid-string
+    // (not at the start of the buffer). Backends that batch multiple forward-pass
+    // tokens into a single chunked response can produce these patterns.
+    //
+    // Ported from PR #6448 (ryanolson) with additional fakeout tests.
+    // =========================================================================
+
+    #[test]
+    fn test_mid_string_partial_opening_tag_batched() {
+        // Backend batches tokens: "Hello world <th" arrives as one chunk
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental("Hello world <th", &[]);
+        // "Hello world " emitted as normal, "<th" held in buffer
+        assert_eq!(r1.normal_text, "Hello world ");
+        assert_eq!(r1.reasoning_text, "");
+
+        let r2 = parser
+            .parse_reasoning_streaming_incremental("ink>reasoning content</think> answer", &[]);
+        assert_eq!(r2.reasoning_text, "reasoning content");
+        assert_eq!(r2.normal_text, " answer");
+    }
+
+    #[test]
+    fn test_batched_tag_boundary_split() {
+        // Aggressive batching: <think> tag split with normal text prefix
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental("The answer is <thi", &[]);
+        assert_eq!(r1.normal_text, "The answer is ");
+        assert_eq!(r1.reasoning_text, "");
+
+        let r2 = parser.parse_reasoning_streaming_incremental("nk>let me think</think>42", &[]);
+        assert_eq!(r2.reasoning_text, "let me think");
+        assert_eq!(r2.normal_text, "42");
+    }
+
+    #[test]
+    fn test_mid_string_partial_closing_tag_stream_reasoning_false() {
+        // With stream_reasoning=false, content stays buffered until </think>.
+        // Partial </think> split mid-string while in reasoning mode.
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, false);
+
+        let r1 =
+            parser.parse_reasoning_streaming_incremental("<think>reasoning content and </th", &[]);
+        assert_eq!(r1.normal_text, "");
+        assert_eq!(r1.reasoning_text, "");
+
+        let r2 = parser.parse_reasoning_streaming_incremental("ink> normal text", &[]);
+        assert_eq!(r2.reasoning_text, "reasoning content and ");
+        assert_eq!(r2.normal_text, " normal text");
+    }
+
+    #[test]
+    fn test_mid_string_partial_closing_tag_stream_reasoning_true() {
+        // With stream_reasoning=true, reasoning content is emitted incrementally.
+        // The partial "</th" at the end must NOT be emitted as reasoning text.
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 =
+            parser.parse_reasoning_streaming_incremental("<think>reasoning content and </th", &[]);
+        // "reasoning content and " emitted as reasoning, "</th" held
+        assert_eq!(r1.reasoning_text, "reasoning content and ");
+        assert_eq!(r1.normal_text, "");
+
+        let r2 = parser.parse_reasoning_streaming_incremental("ink> normal text", &[]);
+        assert_eq!(r2.reasoning_text, "");
+        assert_eq!(r2.normal_text, " normal text");
+    }
+
+    #[test]
+    fn test_batched_interleaved_with_mid_string_partial() {
+        // First block complete in chunk 1, second block's <think> split at boundary
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 =
+            parser.parse_reasoning_streaming_incremental("<think>thought1</think>answer1<thi", &[]);
+        assert_eq!(r1.reasoning_text, "thought1");
+        assert_eq!(r1.normal_text, "answer1");
+
+        let r2 = parser.parse_reasoning_streaming_incremental("nk>thought2</think>answer2", &[]);
+        assert_eq!(r2.reasoning_text, "thought2");
+        assert_eq!(r2.normal_text, "answer2");
+    }
+
+    #[test]
+    fn test_partial_tag_false_positive() {
+        // "<th" looks like partial <think> but "thesis" is not <think>
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental("value <thesis on", &[]);
+        // No suffix of "value <thesis on" is a prefix of "<think>" — all emitted
+        let r2 = parser.parse_reasoning_streaming_incremental(" AI> is great", &[]);
+
+        let combined_normal = format!("{}{}", r1.normal_text, r2.normal_text);
+        assert_eq!(combined_normal, "value <thesis on AI> is great");
+        assert_eq!(r1.reasoning_text, "");
+        assert_eq!(r2.reasoning_text, "");
+    }
+
+    #[test]
+    fn test_partial_closing_tag_fakeout() {
+        // Ollama-style fakeout: "</th" buffered, but "ing>" completes "</thing>" not "</think>"
+        let mut parser =
+            BasicReasoningParser::new("<think>".to_string(), "</think>".to_string(), false, true);
+
+        let r1 = parser.parse_reasoning_streaming_incremental("<think>abc</th", &[]);
+        assert_eq!(r1.reasoning_text, "abc");
+        assert_eq!(r1.normal_text, "");
+
+        // "ing>def" completes the partial as "</thing>def" — not a closing tag
+        let r2 = parser.parse_reasoning_streaming_incremental("ing>def", &[]);
+        assert_eq!(r2.reasoning_text, "</thing>def");
+        assert_eq!(r2.normal_text, "");
+
+        // Real closing tag arrives
+        let r3 = parser.parse_reasoning_streaming_incremental("</think>done", &[]);
+        assert_eq!(r3.reasoning_text, "");
+        assert_eq!(r3.normal_text, "done");
+    }
+
+    #[test]
+    fn test_overlap_helper_function() {
+        // Direct tests for the overlap utility
+        assert_eq!(overlap("abc</th", "</think>"), 4);
+        assert_eq!(overlap("abc</thing>def", "</think>"), 0);
+        assert_eq!(overlap("<", "<think>"), 1);
+        assert_eq!(overlap("<th", "<think>"), 3);
+        assert_eq!(overlap("<think>", "<think>"), 7); // full match
+        assert_eq!(overlap("no match", "<think>"), 0);
+        assert_eq!(overlap("", "<think>"), 0);
+        assert_eq!(overlap("Hello world <thi", "<think>"), 4);
+        // Multi-byte delimiters (Kimi parser uses ◁think▷ / ◁/think▷)
+        assert_eq!(overlap("text◁", "◁think▷"), 3); // ◁ is 3 bytes
+        assert_eq!(overlap("text◁th", "◁think▷"), 5);
+        assert_eq!(overlap("text◁/thi", "◁/think▷"), 7);
+        assert_eq!(overlap("no match", "◁think▷"), 0);
+    }
 }
--- a/lib/parsers/src/reasoning/mod.rs
+++ b/lib/parsers/src/reasoning/mod.rs
@@ -30,6 +30,7 @@ fn get_reasoning_parser_map() -> &'static HashMap<&'static str, ReasoningParserT
        map.insert("mistral", ReasoningParserType::Mistral);
        map.insert("granite", ReasoningParserType::Granite);
        map.insert("nemotron_nano", ReasoningParserType::NemotronDeci); // nemotron nano is <think>...</think>
+        map.insert("glm45", ReasoningParserType::NemotronDeci); // GLM-4.5/5 is <think>...</think>, no force_reasoning
        map.insert(
            "minimax_append_think",
            ReasoningParserType::MiniMaxAppendThink,
@@ -225,6 +226,7 @@ mod tests {
            "mistral",
            "granite",
            "nemotron_nano",
+            "glm45",
            "minimax_append_think",
        ];
        for parser in available_parsers {