fix(api): preserve interleaved reasoning order for KV cache correctness (#6442)

Signed-off-by: Matej Kosec <mkosec@nvidia.com>

fix(api): preserve interleaved reasoning order for KV cache correctness (#6442)
Signed-off-by: Matej Kosec <mkosec@nvidia.com>
a9e06960 · MatejKosec · GitHub · 5277fb9b · a9e06960 · a9e06960
Unverified commit a9e06960, authored Feb 23, 2026 by MatejKosec; committed by GitHub on Feb 24, 2026.
5 changed files
--- a/lib/async-openai/src/types/chat.rs
+++ b/lib/async-openai/src/types/chat.rs
@@ -466,6 +466,50 @@ pub struct ChatCompletionRequestAssistantMessageAudio {
    pub id: String,
 }

+/// Reasoning content from a previous assistant turn.
+///
+/// This is an untagged enum that deserializes from either:
+/// - A plain string: `"reasoning_content": "thinking..."` -> `Text("thinking...")`
+/// - An array of strings: `"reasoning_content": ["seg1", "seg2"]` -> `Segments(["seg1", "seg2"])`
+///
+/// The `Segments` variant preserves interleaved reasoning order needed for KV cache–correct
+/// context reconstruction. `segments[i]` is the reasoning that preceded `tool_calls[i]`;
+/// `segments[tool_calls.len()]` is any trailing reasoning after the last tool call.
+/// Expected: `segments.len() == tool_calls.len() + 1`; this type itself does not enforce it.
+#[derive(ToSchema, Serialize, Deserialize, Clone, Debug, PartialEq)]
+#[serde(untagged)]
+pub enum ReasoningContent {
+    /// Flat string — single reasoning block or legacy backward-compat form.
+    Text(String),
+    /// Interleaved segments. segments[i] precedes tool_calls[i];
+    /// segments[N] is trailing reasoning after the last tool call.
+    /// Expected (not enforced by this type): segments.len() == tool_calls.len() + 1.
+    Segments(Vec<String>),
+}
+
+impl ReasoningContent {
+    /// Join all segments (or return text as-is) into a single flat string.
+    pub fn to_flat_string(&self) -> String {
+        match self {
+            ReasoningContent::Text(s) => s.clone(),
+            ReasoningContent::Segments(segs) => segs
+                .iter()
+                .filter(|s| !s.is_empty())
+                .cloned()
+                .collect::<Vec<_>>()
+                .join("\n"),
+        }
+    }
+
+    /// Returns the segments if this is the `Segments` variant, `None` for `Text`.
+    pub fn segments(&self) -> Option<&[String]> {
+        match self {
+            ReasoningContent::Segments(segs) => Some(segs),
+            ReasoningContent::Text(_) => None,
+        }
+    }
+}
+
 #[derive(ToSchema, Debug, Serialize, Deserialize, Default, Clone, Builder, PartialEq)]
 #[builder(name = "ChatCompletionRequestAssistantMessageArgs")]
 #[builder(pattern = "mutable")]
@@ -476,10 +520,13 @@ pub struct ChatCompletionRequestAssistantMessage {
    /// The contents of the assistant message. Required unless `tool_calls` or `function_call` is specified.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<ChatCompletionRequestAssistantMessageContent>,
-    /// Optional internal reasoning content from a previous assistant turn.
-    /// Used by reasoning-capable models that consume prior chain-of-thought-like context.
+    /// Reasoning content from a previous assistant turn.
+    ///
+    /// When serialized as a plain string, represents a flat reasoning block (backward-compatible
+    /// with Jinja chat templates). When serialized as an array of strings, represents
+    /// interleaved reasoning segments preserving per-position order for KV cache correctness.
    #[serde(skip_serializing_if = "Option::is_none")]
-    pub reasoning_content: Option<String>,
+    pub reasoning_content: Option<ReasoningContent>,
    /// The refusal message by the assistant.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub refusal: Option<String>,
@@ -1280,7 +1327,7 @@ mod tests {
    }

    #[test]
-    fn test_assistant_request_reasoning_content_roundtrip() {
+    fn test_assistant_request_reasoning_content_text_roundtrip() {
        let json = r#"{
            "model": "deepseek-v3.2",
            "messages": [
@@ -1306,7 +1353,26 @@ mod tests {
            _ => panic!("expected assistant message"),
        };

-        assert_eq!(assistant.reasoning_content.as_deref(), Some("thinking..."));
+        assert_eq!(
+            assistant.reasoning_content,
+            Some(ReasoningContent::Text("thinking...".into()))
+        );
+        assert_eq!(
+            assistant
+                .reasoning_content
+                .as_ref()
+                .unwrap()
+                .to_flat_string(),
+            "thinking..."
+        );
+        assert!(
+            assistant
+                .reasoning_content
+                .as_ref()
+                .unwrap()
+                .segments()
+                .is_none()
+        );

        let serialized = serde_json::to_value(&request).unwrap();
        assert_eq!(
@@ -1314,4 +1380,63 @@ mod tests {
            serde_json::Value::String("thinking...".to_string())
        );
    }
+
+    #[test]
+    fn test_assistant_request_reasoning_content_segments_roundtrip() {
+        let json = r#"{
+            "model": "deepseek-v3.2",
+            "messages": [
+                {"role": "user", "content": "test"},
+                {
+                    "role": "assistant",
+                    "reasoning_content": ["seg1", "seg2", ""],
+                    "tool_calls": [{
+                        "id": "call_1",
+                        "type": "function",
+                        "function": {"name": "f1", "arguments": "{}"}
+                    }, {
+                        "id": "call_2",
+                        "type": "function",
+                        "function": {"name": "f2", "arguments": "{}"}
+                    }]
+                }
+            ]
+        }"#;
+
+        let request: CreateChatCompletionRequest = serde_json::from_str(json).unwrap();
+        let assistant = match &request.messages[1] {
+            ChatCompletionRequestMessage::Assistant(msg) => msg,
+            _ => panic!("expected assistant message"),
+        };
+
+        assert_eq!(
+            assistant.reasoning_content,
+            Some(ReasoningContent::Segments(vec![
+                "seg1".into(),
+                "seg2".into(),
+                "".into()
+            ]))
+        );
+        assert_eq!(
+            assistant
+                .reasoning_content
+                .as_ref()
+                .unwrap()
+                .to_flat_string(),
+            "seg1\nseg2"
+        );
+        let segs = assistant
+            .reasoning_content
+            .as_ref()
+            .unwrap()
+            .segments()
+            .expect("should be Segments");
+        assert_eq!(segs.len(), 3);
+
+        let serialized = serde_json::to_value(&request).unwrap();
+        assert_eq!(
+            serialized["messages"][1]["reasoning_content"],
+            serde_json::json!(["seg1", "seg2", ""])
+        );
+    }
 }
--- a/lib/llm/src/entrypoint/input/text.rs
+++ b/lib/llm/src/entrypoint/input/text.rs
@@ -187,7 +187,9 @@ async fn main_loop(
        let assistant_message = dynamo_async_openai::types::ChatCompletionRequestMessage::Assistant(
            dynamo_async_openai::types::ChatCompletionRequestAssistantMessage {
                content: Some(assistant_content),
-                reasoning_content: (!assistant_reasoning.is_empty()).then_some(assistant_reasoning),
+                reasoning_content: (!assistant_reasoning.is_empty()).then_some(
+                    dynamo_async_openai::types::ReasoningContent::Text(assistant_reasoning),
+                ),
                ..Default::default()
            },
        );

--- a/lib/llm/src/preprocessor/prompt/deepseek_v32.rs
+++ b/lib/llm/src/preprocessor/prompt/deepseek_v32.rs
@@ -275,13 +275,42 @@ fn render_message(
            // Handle reasoning content
            // NOTE: If this assistant comes after last user message, the opening <think>
            // was already added in the user message. We only need to add content and closing tag.
+            //
+            // reasoning_content may be a plain string or an array of segments.
+            // DeepSeek V3.2 always places its <think> block before all tool calls, so
+            // joining the non-empty segments with "\n" produces the correct flat form here.
            if thinking_mode == ThinkingMode::Thinking
                && last_user_idx.is_some_and(|idx| index > idx)
-                && let Some(reasoning) = msg.get("reasoning_content").and_then(|r| r.as_str())
            {
-                // DON'T add THINKING_START - it was already added in user message
-                prompt.push_str(reasoning);
-                prompt.push_str(tokens::THINKING_END);
+                let reasoning = msg.get("reasoning_content").and_then(|v| match v {
+                    serde_json::Value::String(s) => {
+                        if s.is_empty() {
+                            None
+                        } else {
+                            Some(s.clone())
+                        }
+                    }
+                    serde_json::Value::Array(arr) => {
+                        let joined = arr
+                            .iter()
+                            .filter_map(|v| v.as_str())
+                            .filter(|s| !s.is_empty())
+                            .collect::<Vec<_>>()
+                            .join("\n");
+                        if joined.is_empty() {
+                            None
+                        } else {
+                            Some(joined)
+                        }
+                    }
+                    _ => None,
+                });
+
+                if let Some(reasoning) = reasoning {
+                    // DON'T add THINKING_START - it was already added in user message
+                    prompt.push_str(&reasoning);
+                    prompt.push_str(tokens::THINKING_END);
+                }
            }

            // Handle content

--- a/lib/llm/src/protocols/anthropic/types.rs
+++ b/lib/llm/src/protocols/anthropic/types.rs
@@ -13,7 +13,7 @@ use dynamo_async_openai::types::{
    ChatCompletionRequestSystemMessageContent, ChatCompletionRequestToolMessage,
    ChatCompletionRequestToolMessageContent, ChatCompletionRequestUserMessage,
    ChatCompletionRequestUserMessageContent, ChatCompletionTool, ChatCompletionToolChoiceOption,
-    ChatCompletionToolType, FunctionName, FunctionObject,
+    ChatCompletionToolType, FunctionName, FunctionObject, ReasoningContent,
 };
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;
@@ -557,6 +557,7 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
                // Assistant with plain text
                (AnthropicRole::Assistant, AnthropicMessageContent::Text { content }) => {
                    messages.push(ChatCompletionRequestMessage::Assistant(
+                        #[allow(deprecated)]
                        ChatCompletionRequestAssistantMessage {
                            content: Some(ChatCompletionRequestAssistantMessageContent::Text(
                                content.clone(),
@@ -566,7 +567,6 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
                            name: None,
                            audio: None,
                            tool_calls: None,
-                            #[allow(deprecated)]
                            function_call: None,
                        },
                    ));
@@ -685,15 +685,33 @@ fn convert_user_blocks(
 }

 /// Convert assistant-role content blocks into chat completion messages.
-/// Text blocks become an assistant message; tool_use blocks become tool_calls on an assistant message.
-/// Thinking blocks are passed through as `reasoning_content`.
+///
+/// Text blocks become an assistant message; tool_use blocks become tool_calls on an assistant
+/// message. Thinking blocks are preserved via `reasoning_content: Option<ReasoningContent>`:
+///
+/// - `ReasoningContent::Text(s)`: flat reasoning string (no tool calls present).
+/// - `ReasoningContent::Segments(segs)`: one entry **per position** in the interleaved sequence,
+///   enabling chat templates to reconstruct the exact token order:
+///   `<think>segments[0]</think><call>tc[0]</call><think>segments[1]</think><call>tc[1]</call>…<think>segments[N]</think>`
+///   - `segments[i]` is the thinking that immediately preceded `tool_calls[i]`
+///   - `segments[tool_calls.len()]` is any trailing thinking after the last tool call
+///   - `segments.len() == tool_calls.len() + 1` always
+///   - Individual entries may be empty strings (no reasoning at that position)
+/// - `None` when there is no reasoning content at all.
+///
+/// Preserving the original interleaved order is required for KV cache correctness: a prompt
+/// reconstructed from a flattened `reasoning_content` will differ token-by-token from the
+/// original assistant turn, causing a cache miss on every multi-tool exchange.
 fn convert_assistant_blocks(
    blocks: &[AnthropicContentBlock],
    messages: &mut Vec<ChatCompletionRequestMessage>,
 ) {
    let mut text_content = String::new();
-    let mut thinking_content = String::new();
    let mut tool_calls = Vec::new();
+    // segments[i] precedes tool_calls[i]; a final trailing segment is pushed after the loop.
+    let mut segments: Vec<String> = Vec::new();
+    // Accumulates thinking text until the next tool_use block (or end of blocks).
+    let mut pending_reasoning = String::new();

    for block in blocks {
        match block {
@@ -701,12 +719,14 @@ fn convert_assistant_blocks(
                text_content.push_str(text);
            }
            AnthropicContentBlock::Thinking { thinking, .. } => {
-                if !thinking_content.is_empty() {
-                    thinking_content.push('\n');
+                if !pending_reasoning.is_empty() {
+                    pending_reasoning.push('\n');
                }
-                thinking_content.push_str(thinking);
+                pending_reasoning.push_str(thinking);
            }
            AnthropicContentBlock::ToolUse { id, name, input } => {
+                // Snapshot the reasoning that preceded this tool call.
+                segments.push(std::mem::take(&mut pending_reasoning));
                tool_calls.push(ChatCompletionMessageToolCall {
                    id: id.clone(),
                    r#type: ChatCompletionToolType::Function,
@@ -720,6 +740,11 @@ fn convert_assistant_blocks(
        }
    }

+    // Append any trailing reasoning (after the last tool call) as the final segment.
+    // This makes segments.len() == tool_calls.len() + 1, preserving the full interleaved
+    // order including reasoning that follows the last tool call.
+    segments.push(std::mem::take(&mut pending_reasoning));
+
    let content = if text_content.is_empty() {
        None
    } else {
@@ -728,10 +753,25 @@ fn convert_assistant_blocks(
        ))
    };

-    let reasoning = if thinking_content.is_empty() {
-        None
+    // Produce a single ReasoningContent value:
+    // - Segments variant when there are tool calls and at least one segment is non-empty
+    //   (positions relative to the tool calls are kept, even without interleaving).
+    // - Text variant when there's reasoning but no tool calls (flat form).
+    // - None when there's no reasoning at all.
+    let reasoning_content = if !tool_calls.is_empty() && segments.iter().any(|s| !s.is_empty()) {
+        Some(ReasoningContent::Segments(segments))
    } else {
-        Some(thinking_content)
+        let flat: String = segments
+            .iter()
+            .filter(|s| !s.is_empty())
+            .cloned()
+            .collect::<Vec<_>>()
+            .join("\n");
+        if flat.is_empty() {
+            None
+        } else {
+            Some(ReasoningContent::Text(flat))
+        }
    };

    let tc = if tool_calls.is_empty() {
@@ -743,7 +783,7 @@ fn convert_assistant_blocks(
    messages.push(ChatCompletionRequestMessage::Assistant(
        ChatCompletionRequestAssistantMessage {
            content,
-            reasoning_content: reasoning,
+            reasoning_content,
            refusal: None,
            name: None,
            audio: None,
@@ -1339,7 +1379,10 @@ mod tests {
        let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
        match &chat_req.inner.messages[0] {
            ChatCompletionRequestMessage::Assistant(a) => {
-                assert_eq!(a.reasoning_content.as_deref(), Some("I should think..."));
+                assert_eq!(
+                    a.reasoning_content,
+                    Some(ReasoningContent::Text("I should think...".into()))
+                );
                match &a.content {
                    Some(ChatCompletionRequestAssistantMessageContent::Text(t)) => {
                        assert_eq!(t, "Answer");
@@ -1476,4 +1519,261 @@ mod tests {
        // "Hello, world! This is a test message." (37) + "You are helpful." (16) + role (4) = 57 / 3 = 19
        assert_eq!(tokens, 19);
    }
+
+    // --- ReasoningContent enum tests ---
+
+    fn make_req(blocks: Vec<AnthropicContentBlock>) -> ChatCompletionRequestAssistantMessage {
+        let req = AnthropicCreateMessageRequest {
+            model: "test-model".into(),
+            max_tokens: 100,
+            messages: vec![AnthropicMessage {
+                role: AnthropicRole::Assistant,
+                content: AnthropicMessageContent::Blocks { content: blocks },
+            }],
+            system: None,
+            temperature: None,
+            top_p: None,
+            top_k: None,
+            stop_sequences: None,
+            stream: false,
+            metadata: None,
+            tools: None,
+            tool_choice: None,
+        };
+        let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
+        match chat_req.inner.messages.into_iter().next().unwrap() {
+            ChatCompletionRequestMessage::Assistant(a) => a,
+            other => panic!("expected assistant, got {other:?}"),
+        }
+    }
+
+    fn tool_use(id: &str) -> AnthropicContentBlock {
+        AnthropicContentBlock::ToolUse {
+            id: id.into(),
+            name: "fn".into(),
+            input: serde_json::json!({}),
+        }
+    }
+
+    fn thinking(text: &str) -> AnthropicContentBlock {
+        AnthropicContentBlock::Thinking {
+            thinking: text.into(),
+            signature: "sig".into(),
+        }
+    }
+
+    #[test]
+    fn test_interleaved_thinking_and_tool_calls() {
+        // [Thinking("A"), ToolUse("t1"), Thinking("B"), ToolUse("t2")]
+        // segments = ["A", "B", ""] (trailing empty), tool_calls = [t1, t2]
+        let msg = make_req(vec![
+            thinking("A"),
+            tool_use("t1"),
+            thinking("B"),
+            tool_use("t2"),
+        ]);
+
+        let segs = msg
+            .reasoning_content
+            .as_ref()
+            .expect("reasoning_content should be set")
+            .segments()
+            .expect("should be Segments variant");
+        assert_eq!(segs.len(), 3); // tool_calls.len() + 1
+        assert_eq!(segs[0], "A");
+        assert_eq!(segs[1], "B");
+        assert_eq!(segs[2], ""); // no trailing reasoning
+
+        assert_eq!(
+            msg.reasoning_content.as_ref().unwrap().to_flat_string(),
+            "A\nB"
+        );
+
+        let tcs = msg.tool_calls.as_ref().expect("tool_calls should be set");
+        assert_eq!(tcs.len(), 2);
+        assert_eq!(tcs[0].id, "t1");
+        assert_eq!(tcs[1].id, "t2");
+    }
+
+    #[test]
+    fn test_trailing_reasoning_preserved_in_segments() {
+        // [Thinking("A"), ToolUse("t1"), Thinking("B")]
+        // segments = ["A", "B"], trailing reasoning "B" must appear in segments[1]
+        let msg = make_req(vec![thinking("A"), tool_use("t1"), thinking("B")]);
+
+        let segs = msg
+            .reasoning_content
+            .as_ref()
+            .expect("reasoning_content should be set")
+            .segments()
+            .expect("should be Segments variant");
+        assert_eq!(segs.len(), 2); // 1 tool call + 1 trailing
+        assert_eq!(segs[0], "A");
+        assert_eq!(segs[1], "B"); // trailing reasoning preserved
+
+        assert_eq!(
+            msg.reasoning_content.as_ref().unwrap().to_flat_string(),
+            "A\nB"
+        );
+    }
+
+    #[test]
+    fn test_tool_use_before_thinking() {
+        // [ToolUse("t1"), Thinking("A"), ToolUse("t2")]
+        // segments = ["", "A", ""] — empty first segment, reasoning before t2
+        let msg = make_req(vec![tool_use("t1"), thinking("A"), tool_use("t2")]);
+
+        let segs = msg
+            .reasoning_content
+            .as_ref()
+            .expect("reasoning_content should be set")
+            .segments()
+            .expect("should be Segments variant");
+        assert_eq!(segs.len(), 3);
+        assert_eq!(segs[0], ""); // no reasoning before t1
+        assert_eq!(segs[1], "A");
+        assert_eq!(segs[2], ""); // no trailing
+
+        assert_eq!(
+            msg.reasoning_content.as_ref().unwrap().to_flat_string(),
+            "A"
+        );
+    }
+
+    #[test]
+    fn test_all_thinking_then_all_tools() {
+        // [Thinking("A"), Thinking("B"), ToolUse("t1"), ToolUse("t2")]
+        // segments = ["A\nB", "", ""] — all reasoning before first tool
+        let msg = make_req(vec![
+            thinking("A"),
+            thinking("B"),
+            tool_use("t1"),
+            tool_use("t2"),
+        ]);
+
+        let segs = msg
+            .reasoning_content
+            .as_ref()
+            .expect("reasoning_content should be set")
+            .segments()
+            .expect("should be Segments variant");
+        assert_eq!(segs.len(), 3);
+        assert_eq!(segs[0], "A\nB");
+        assert_eq!(segs[1], "");
+        assert_eq!(segs[2], "");
+
+        assert_eq!(
+            msg.reasoning_content.as_ref().unwrap().to_flat_string(),
+            "A\nB"
+        );
+    }
+
+    #[test]
+    fn test_tool_calls_no_thinking_produces_no_segments() {
+        // [ToolUse("t1"), ToolUse("t2")] — all empty segments → reasoning_content = None
+        let msg = make_req(vec![tool_use("t1"), tool_use("t2")]);
+
+        assert!(
+            msg.reasoning_content.is_none(),
+            "no reasoning means no reasoning_content"
+        );
+    }
+
+    #[test]
+    fn test_thinking_only_no_tools_produces_text_variant() {
+        // [Thinking("A"), Text("answer")] — no tool calls → ReasoningContent::Text
+        let msg = make_req(vec![
+            thinking("A"),
+            AnthropicContentBlock::Text {
+                text: "answer".into(),
+            },
+        ]);
+
+        assert_eq!(
+            msg.reasoning_content,
+            Some(ReasoningContent::Text("A".into()))
+        );
+        assert!(msg.reasoning_content.as_ref().unwrap().segments().is_none());
+        assert!(matches!(
+            msg.content,
+            Some(ChatCompletionRequestAssistantMessageContent::Text(ref t)) if t == "answer"
+        ));
+    }
+
+    #[test]
+    fn test_single_thinking_then_single_tool() {
+        // [Thinking("reason"), ToolUse("t1")] → Segments(["reason", ""])
+        let msg = make_req(vec![thinking("reason"), tool_use("t1")]);
+
+        let segs = msg
+            .reasoning_content
+            .as_ref()
+            .expect("reasoning_content should be set")
+            .segments()
+            .expect("should be Segments variant");
+        assert_eq!(segs.len(), 2);
+        assert_eq!(segs[0], "reason");
+        assert_eq!(segs[1], "");
+
+        assert_eq!(
+            msg.reasoning_content.as_ref().unwrap().to_flat_string(),
+            "reason"
+        );
+    }
+
+    // Regression test for the KV-cache flattening bug.
+    //
+    // OLD CODE: `convert_assistant_blocks` concatenated all thinking blocks into a
+    // single flat string — `reasoning_content = Text("A\nB")`.  A chat template
+    // given only that string can only reconstruct:
+    //
+    //     <think>A\nB</think> <call>t1</call> <call>t2</call>
+    //
+    // That token sequence diverges from what the model originally generated at the
+    // very first `</think>`, so the KV cache misses on every multi-tool exchange.
+    //
+    // NEW CODE: `convert_assistant_blocks` produces `Segments(["A", "B", ""])` so a
+    // template that understands segments can reconstruct byte-for-byte:
+    //
+    //     <think>A</think> <call>t1</call> <think>B</think> <call>t2</call>
+    //
+    // This test fails on the old code because the old code returns `Text("A\nB")` and
+    // `.segments()` returns `None`, causing the `expect` below to panic.
+    #[test]
+    fn test_interleaved_reasoning_not_flattened_regression() {
+        let msg = make_req(vec![
+            thinking("A"),
+            tool_use("t1"),
+            thinking("B"),
+            tool_use("t2"),
+        ]);
+
+        // Must be Segments, not Text.  Text("A\nB") is the old (broken) behaviour:
+        // it loses which reasoning block preceded which tool call.
+        assert!(
+            !matches!(msg.reasoning_content, Some(ReasoningContent::Text(_))),
+            "reasoning_content must NOT be flat Text when tool calls are interleaved; \
+             Text loses positional info and forces a KV cache miss on every multi-tool turn"
+        );
+
+        let segs = msg
+            .reasoning_content
+            .as_ref()
+            .expect("reasoning_content should be set")
+            .segments()
+            .expect(
+                "must be Segments so a chat template can reconstruct \
+                 <think>A</think><call>t1</call><think>B</think><call>t2</call> \
+                 rather than front-loading all reasoning before all calls",
+            );
+
+        // segs[i] precedes tool_calls[i] — the invariant a template relies on
+        assert_eq!(segs[0], "A", "reasoning before t1");
+        assert_eq!(segs[1], "B", "reasoning before t2");
+        assert_eq!(segs[2], "", "no trailing reasoning");
+
+        let tools = msg.tool_calls.as_ref().unwrap();
+        assert_eq!(tools[0].id, "t1");
+        assert_eq!(tools[1].id, "t2");
+    }
 }
--- a/lib/llm/tests/deepseek_v32_encoding.rs
+++ b/lib/llm/tests/deepseek_v32_encoding.rs
@@ -321,6 +321,76 @@ fn test_reasoning_content_survives_chat_request_parsing_and_rendering() {
    assert!(rendered.contains("</think>"));
 }

+// Regression test for the KV-cache flattening bug.
+//
+// Models like GLM-5 and Qwen3 (Pattern A) emit interleaved thinking:
+//
+//   <think>A</think> <call>t1</call> <think>B</think> <call>t2</call>
+//
+// `convert_assistant_blocks` now serialises this as a JSON *array*:
+//
+//   "reasoning_content": ["A", "B", ""]
+//
+// OLD CODE stored `reasoning_content: Option<String>` — a JSON array would fail
+// to deserialise into that type, so this test panics at `.unwrap()` on old code.
+// NEW CODE stores `Option<ReasoningContent>` which accepts both string and array,
+// and round-trips the array form faithfully.
+#[test]
+fn test_reasoning_segments_roundtrip_through_parse_and_render() {
+    // Simulate what convert_assistant_blocks produces for an interleaved GLM-5 turn:
+    //   [Think("A"), Tool(t1), Think("B"), Tool(t2)]  →  segments = ["A", "B", ""]
+    let json = r#"{
+        "model": "glm-5",
+        "messages": [
+            {"role": "user", "content": "call two tools"},
+            {
+                "role": "assistant",
+                "reasoning_content": ["A", "B", ""],
+                "tool_calls": [
+                    {"id": "t1", "type": "function", "function": {"name": "f1", "arguments": "{}"}},
+                    {"id": "t2", "type": "function", "function": {"name": "f2", "arguments": "{}"}}
+                ]
+            },
+            {"role": "tool", "tool_call_id": "t1", "content": "r1"},
+            {"role": "tool", "tool_call_id": "t2", "content": "r2"}
+        ]
+    }"#;
+
+    // OLD CODE: serde_json::from_str fails here because Option<String> can't
+    // deserialise a JSON array.  NEW CODE: succeeds.
+    let request: NvCreateChatCompletionRequest = serde_json::from_str(json).unwrap();
+
+    // Segments must survive the round-trip through serde_json
+    let messages_json = serde_json::to_value(request.messages()).unwrap();
+    assert!(
+        messages_json[1]["reasoning_content"].is_array(),
+        "reasoning_content must serialise as a JSON array to preserve positional info; \
+         a string would lose which reasoning preceded which tool call"
+    );
+    let segs = messages_json[1]["reasoning_content"].as_array().unwrap();
+    assert_eq!(segs.len(), 3);
+    assert_eq!(segs[0].as_str().unwrap(), "A"); // precedes t1
+    assert_eq!(segs[1].as_str().unwrap(), "B"); // precedes t2
+    assert_eq!(segs[2].as_str().unwrap(), ""); // no trailing reasoning
+
+    // The formatter must not drop the reasoning content when segments are used.
+    // (DeepSeek V3.2 joins segments into one <think> block; this confirms the
+    // content is not silently discarded.)
+    let formatter =
+        dynamo_llm::preprocessor::prompt::deepseek_v32::DeepSeekV32Formatter::new_thinking();
+    let rendered = formatter.render(&request).unwrap();
+    assert!(
+        rendered.contains("A"),
+        "segment A must appear in rendered output"
+    );
+    assert!(
+        rendered.contains("B"),
+        "segment B must appear in rendered output"
+    );
+    assert!(rendered.contains("<think>"));
+    assert!(rendered.contains("</think>"));
+}
+
 #[test]
 fn test_tool_call_formatting() {
    let messages = serde_json::json!([