fix: Anthropic streaming double-parsing + reasoning_content roundtrip (#7358)

Signed-off-by: Matej Kosec <mkosec@nvidia.com>

fix: Anthropic streaming double-parsing + reasoning_content roundtrip (#7358)
Signed-off-by: Matej Kosec <mkosec@nvidia.com>
f0d3ce63 · MatejKosec · GitHub · d58a6881 · f0d3ce63 · f0d3ce63
Unverified Commit f0d3ce63 authored Mar 30, 2026 by MatejKosec Committed by GitHub Mar 30, 2026
8 changed files
--- a/components/src/dynamo/common/utils/input_params.py
+++ b/components/src/dynamo/common/utils/input_params.py
@@ -4,6 +4,48 @@
 from typing import Any, Optional
+def _inject_reasoning_content(messages: list) -> None:
+    """Fold each assistant turn's reasoning_content into content as <think> blocks.
+    Chat templates that only render message["content"] never see
+    reasoning_content. Moving it into content lets the model see its own
+    prior chain-of-thought across turns.
+    Args:
+        messages: Chat message dicts, modified in place. Only messages with
+            role "assistant" and a non-empty reasoning_content are touched.
+    reasoning_content may be a string or a list of segments; every non-empty
+    string segment is wrapped in its own <think>...</think> pair. The wrapped
+    text is prepended to string content, inserted as a leading text part for
+    list content, or becomes the content when it is None or absent. The
+    reasoning_content key is removed only when injection happened; messages
+    whose reasoning or content has any other type are left untouched.
+    """
+    for msg in messages:
+        if msg.get("role") != "assistant":
+            continue
+        reasoning = msg.get("reasoning_content")
+        if not reasoning:
+            continue
+        # Normalize both variants to a list of string segments
+        if isinstance(reasoning, str):
+            segments = [reasoning]
+        elif isinstance(reasoning, list):
+            # Drop empty and non-string segments instead of stringifying them
+            segments = [seg for seg in reasoning if isinstance(seg, str) and seg]
+        else:
+            continue
+        think_text = "".join(f"<think>{seg}</think>" for seg in segments)
+        if not think_text:
+            continue
+        # Prepend to content
+        existing = msg.get("content")
+        if existing is None:
+            # null or absent
+            msg["content"] = think_text
+        elif isinstance(existing, str):
+            msg["content"] = think_text + existing
+        elif isinstance(existing, list):
+            # Multimodal content array — prepend as text part
+            msg["content"] = [{"type": "text", "text": think_text}] + existing
+        else:
+            # Unexpected content type (number, dict, ...) — don't corrupt it
+            continue
+        # Remove so template doesn't see both
+        msg.pop("reasoning_content", None)
 class InputParamManager:
    def __init__(self, tokenizer: Any) -> None:
        self.tokenizer = tokenizer
@@ -18,8 +60,32 @@ class InputParamManager:
                raise ValueError("Tokenizer is not available")
            if "messages" in request:
+                # Forward chat_template_args / chat_template_kwargs to the
+                # template so model-specific variables (e.g. enable_thinking)
+                # are available during rendering.
+                extra_kwargs = {}
+                if "chat_template_kwargs" in request:
+                    extra_kwargs.update(request["chat_template_kwargs"])
+                if "chat_template_args" in request:
+                    extra_kwargs.update(request["chat_template_args"])
+                # Strip keys that are already set explicitly to avoid
+                # TypeError: got multiple values for keyword argument.
+                for reserved in ("tokenize", "add_generation_prompt"):
+                    extra_kwargs.pop(reserved, None)
+                # Inject reasoning_content as <think> blocks into content,
+                # but only if the template doesn't handle it natively.
+                # Templates like Nemotron and Qwen3 reference reasoning_content
+                # directly — injecting would produce duplicate <think> blocks.
+                chat_template_src = getattr(self.tokenizer, "chat_template", "") or ""
+                if "reasoning_content" not in chat_template_src:
+                    _inject_reasoning_content(request["messages"])
                return self.tokenizer.apply_chat_template(
-                    request["messages"], tokenize=False, add_generation_prompt=True
+                    request["messages"],
+                    tokenize=False,
+                    add_generation_prompt=True,
+                    **extra_kwargs,
                )
            elif "prompt" in request:
                return self.tokenizer.encode(request["prompt"])

--- a/components/src/dynamo/common/utils/tests/test_inject_reasoning_content.py
+++ b/components/src/dynamo/common/utils/tests/test_inject_reasoning_content.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Unit tests for _inject_reasoning_content in input_params.py.
+Verifies that reasoning_content from prior assistant turns is converted
+to <think> blocks in the content field before chat template rendering.
+"""
+import copy
+from dynamo.common.utils.input_params import _inject_reasoning_content
+class TestInjectReasoningContent:
+    """Test suite for _inject_reasoning_content"""
+    def test_text_variant_prepends_to_content(self):
+        """Text reasoning_content is wrapped in <think> and prepended."""
+        messages = [
+            {
+                "role": "assistant",
+                "content": "The answer is 12.",
+                "reasoning_content": "sqrt(144) = 12",
+            },
+        ]
+        _inject_reasoning_content(messages)
+        assert (
+            messages[0]["content"] == "<think>sqrt(144) = 12</think>The answer is 12."
+        )
+        assert "reasoning_content" not in messages[0]
+    def test_segments_variant_wraps_each_segment(self):
+        """Segments are individually wrapped in <think> blocks."""
+        messages = [
+            {
+                "role": "assistant",
+                "content": "Done.",
+                "reasoning_content": ["first thought", "second thought", ""],
+            },
+        ]
+        _inject_reasoning_content(messages)
+        content = messages[0]["content"]
+        assert content.startswith("<think>first thought</think>")
+        assert "<think>second thought</think>" in content
+        assert "<think></think>" not in content  # empty segment skipped
+        assert content.endswith("Done.")
+        assert "reasoning_content" not in messages[0]
+    def test_null_content_creates_from_reasoning(self):
+        """When content is null/None, reasoning becomes the content."""
+        messages = [
+            {"role": "assistant", "content": None, "reasoning_content": "Thinking..."},
+        ]
+        _inject_reasoning_content(messages)
+        assert messages[0]["content"] == "<think>Thinking...</think>"
+    def test_absent_content_creates_from_reasoning(self):
+        """When content key is absent, reasoning becomes the content."""
+        messages = [
+            {"role": "assistant", "reasoning_content": "Thinking..."},
+        ]
+        _inject_reasoning_content(messages)
+        assert messages[0]["content"] == "<think>Thinking...</think>"
+    def test_multimodal_content_prepends_text_part(self):
+        """Array content gets a text part prepended, not replaced."""
+        messages = [
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": "Here is the image."}],
+                "reasoning_content": "Analyzing the image...",
+            },
+        ]
+        _inject_reasoning_content(messages)
+        content = messages[0]["content"]
+        assert isinstance(content, list)
+        assert len(content) == 2
+        assert content[0] == {
+            "type": "text",
+            "text": "<think>Analyzing the image...</think>",
+        }
+        assert content[1] == {"type": "text", "text": "Here is the image."}
+    def test_skips_non_assistant_messages(self):
+        """User and tool messages are not modified."""
+        messages = [
+            {
+                "role": "user",
+                "content": "hello",
+                "reasoning_content": "should not touch",
+            },
+            {
+                "role": "tool",
+                "content": "result",
+                "reasoning_content": "should not touch",
+            },
+        ]
+        original = copy.deepcopy(messages)
+        _inject_reasoning_content(messages)
+        assert messages == original
+    def test_skips_empty_reasoning(self):
+        """Empty string reasoning_content is skipped and the message is left as-is."""
+        messages = [
+            {"role": "assistant", "content": "Answer.", "reasoning_content": ""},
+        ]
+        _inject_reasoning_content(messages)
+        assert messages[0]["content"] == "Answer."
+        # The falsy check skips the message before the cleanup step, so the
+        # empty reasoning_content field is still present afterwards.
+        assert messages[0]["reasoning_content"] == ""
+    def test_agentic_multi_turn_tool_call_flow(self):
+        """Full agentic flow: reason → tool_call → tool_result → reason → answer."""
+        messages = [
+            {"role": "user", "content": "What is sqrt(144) + sqrt(256)?"},
+            {
+                "role": "assistant",
+                "content": None,
+                "reasoning_content": "I need to compute sqrt(144) first.",
+                "tool_calls": [
+                    {
+                        "id": "call_0",
+                        "type": "function",
+                        "function": {
+                            "name": "calc",
+                            "arguments": '{"expr": "sqrt(144)"}',
+                        },
+                    },
+                ],
+            },
+            {"role": "tool", "tool_call_id": "call_0", "content": "12"},
+            {
+                "role": "assistant",
+                "content": "The answer is 28.",
+                "reasoning_content": "Got 12. sqrt(256) = 16. Sum = 28.",
+            },
+            {"role": "user", "content": "Thanks!"},
+        ]
+        _inject_reasoning_content(messages)
+        # First assistant turn: reasoning injected, null content → reasoning only
+        assert (
+            messages[1]["content"]
+            == "<think>I need to compute sqrt(144) first.</think>"
+        )
+        assert "reasoning_content" not in messages[1]
+        assert "tool_calls" in messages[1]  # tool_calls untouched
+        # Tool message untouched
+        assert messages[2]["content"] == "12"
+        # Second assistant turn: reasoning prepended to content
+        assert (
+            messages[3]["content"]
+            == "<think>Got 12. sqrt(256) = 16. Sum = 28.</think>The answer is 28."
+        )
+        assert "reasoning_content" not in messages[3]
+        # User messages untouched
+        assert messages[0]["content"] == "What is sqrt(144) + sqrt(256)?"
+        assert messages[4]["content"] == "Thanks!"
+class TestInputParamManagerReasoningInjection:
+    """Test that InputParamManager respects template introspection."""
+    def test_injects_when_template_ignores_reasoning(self):
+        """Templates without reasoning_content get injection."""
+        from unittest.mock import MagicMock
+        tokenizer = MagicMock()
+        tokenizer.chat_template = (
+            "{% for m in messages %}{{ m.role }}: {{ m.content }}{% endfor %}"
+        )
+        tokenizer.apply_chat_template = MagicMock(return_value="rendered")
+        from dynamo.common.utils.input_params import InputParamManager
+        mgr = InputParamManager(tokenizer)
+        request = {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "Hi.",
+                    "reasoning_content": "thinking...",
+                },
+                {"role": "user", "content": "Bye"},
+            ]
+        }
+        mgr.get_input_param(request, use_tokenizer=True)
+        # Verify injection happened: reasoning_content removed, content has <think>
+        called_messages = tokenizer.apply_chat_template.call_args[0][0]
+        assert "reasoning_content" not in called_messages[0]
+        assert called_messages[0]["content"].startswith("<think>thinking...</think>")
+    def test_skips_injection_when_template_handles_reasoning(self):
+        """Templates with reasoning_content are left alone."""
+        from unittest.mock import MagicMock
+        tokenizer = MagicMock()
+        tokenizer.chat_template = (
+            "{% for m in messages %}"
+            "{% if m.reasoning_content %}<think>{{ m.reasoning_content }}</think>{% endif %}"
+            "{{ m.role }}: {{ m.content }}{% endfor %}"
+        )
+        tokenizer.apply_chat_template = MagicMock(return_value="rendered")
+        from dynamo.common.utils.input_params import InputParamManager
+        mgr = InputParamManager(tokenizer)
+        request = {
+            "messages": [
+                {
+                    "role": "assistant",
+                    "content": "Hi.",
+                    "reasoning_content": "thinking...",
+                },
+                {"role": "user", "content": "Bye"},
+            ]
+        }
+        mgr.get_input_param(request, use_tokenizer=True)
+        # Verify injection was skipped: reasoning_content still present, content unchanged
+        called_messages = tokenizer.apply_chat_template.call_args[0][0]
+        assert called_messages[0]["reasoning_content"] == "thinking..."
+        assert called_messages[0]["content"] == "Hi."
--- a/lib/llm/src/http/service/anthropic.rs
+++ b/lib/llm/src/http/service/anthropic.rs
@@ -33,7 +33,6 @@ use super::{
    metrics::{CancellationLabels, Endpoint, process_response_and_observe_metrics},
    service_v2,
 };
-use crate::preprocessor::OpenAIPreprocessor;
 use crate::protocols::anthropic::stream_converter::AnthropicStreamConverter;
 use crate::protocols::anthropic::types::{
    AnthropicCountTokensRequest, AnthropicCountTokensResponse, AnthropicCreateMessageRequest,
@@ -192,19 +191,30 @@ async fn anthropic_messages(
    tracing::trace!("Received Anthropic messages request: {:?}", &*request);
+    // Look up engine and parsing options early so we know whether a reasoning
+    // parser is configured before converting the request.
+    let (engine, parsing_options) = state
+        .manager()
+        .get_chat_completions_engine_with_parsing(&model)
+        .map_err(|_| {
+            anthropic_error(
+                StatusCode::NOT_FOUND,
+                "not_found_error",
+                &format!("Model '{}' not found", model),
+            )
+        })?;
    let (orig_request, context) = request.into_parts();
    let model_for_resp = orig_request.model.clone();
-    // Check if the Anthropic request explicitly enabled thinking. When thinking
+    // Check if the Anthropic request explicitly disabled thinking.
-    // is enabled, reasoning-capable models' chat templates typically inject
+    let thinking_explicitly_disabled = orig_request
-    // `<think>` into the prompt, so the completion starts mid-reasoning.
-    let thinking_enabled = orig_request
        .thinking
        .as_ref()
-        .is_some_and(|t| t.thinking_type == "enabled");
+        .is_some_and(|t| t.thinking_type == "disabled");
    // Convert Anthropic request -> Chat Completion request
-    let chat_request: NvCreateChatCompletionRequest =
+    let mut chat_request: NvCreateChatCompletionRequest =
        orig_request.try_into().map_err(|e: anyhow::Error| {
            tracing::error!(
                request_id,
@@ -218,20 +228,42 @@ async fn anthropic_messages(
            )
        })?;
-    let request = context.map(|_req| chat_request);
+    // When a reasoning parser is configured and the client hasn't explicitly
+    // disabled thinking, assume the model's chat template will inject `<think>`.
-    tracing::trace!("Getting chat completions engine for model: {}", model);
+    //
+    // Two things must be aligned:
+    //   1. chat_template_args must include enable_thinking=true so the backend's
+    //      template actually injects `<think>` into the prompt. For the
+    //      ModelInput::Text path (SGLang without --skip-tokenizer-init), the
+    //      backend applies the template — without explicit enable_thinking the
+    //      result depends on the template's default which varies by model.
+    //   2. prompt_injected_reasoning must be true so the parser starts in
+    //      reasoning mode with stripped_think_start=true, which is critical for
+    //      correct `</think>` boundary detection in the streaming path.
+    //
+    // The OpenAI path handles this in the preprocessor: it renders the template,
+    // inspects the formatted prompt for a trailing `<think>`, and sets
+    // prompt_injected_reasoning accordingly. The Anthropic path bypasses the
+    // preprocessor, so we infer prompt injection from the reasoning parser config.
+    let prompt_injected_reasoning =
+        parsing_options.reasoning_parser.is_some() && !thinking_explicitly_disabled;
+    if prompt_injected_reasoning {
+        let args = chat_request
+            .chat_template_args
+            .get_or_insert_with(Default::default);
+        args.entry("enable_thinking".to_string())
+            .or_insert(serde_json::Value::Bool(true));
+        // Preserve reasoning from prior turns. Some templates (Nemotron)
+        // strip historical <think> content by default to save context.
+        // For agentic flows the model needs to see why it made prior decisions.
+        // Ref: NVIDIA's SWE training config also sets this to false:
+        // https://github.com/NVIDIA-NeMo/Nemotron/blob/main/src/nemotron/recipes/super3/stage2_rl/stage2_swe2/config/default.yaml#L287
+        args.entry("truncate_history_thinking".to_string())
+            .or_insert(serde_json::Value::Bool(false));
+    }
-    let (engine, parsing_options) = state
+    let request = context.map(|_req| chat_request);
-        .manager()
-        .get_chat_completions_engine_with_parsing(&model)
-        .map_err(|_| {
-            anthropic_error(
-                StatusCode::NOT_FOUND,
-                "not_found_error",
-                &format!("Model '{}' not found", model),
-            )
-        })?;
    let mut response_collector = state.metrics_clone().create_response_collector(&model);
@@ -247,27 +279,25 @@ async fn anthropic_messages(
    let ctx = engine_stream.context();
-    // Apply reasoning parser to the engine stream if configured.
+    // NOTE: We intentionally do NOT apply a reasoning parser here.
-    // The preprocessor (which normally handles this for the OpenAI path) is
-    // bypassed by the Anthropic endpoint, so we apply the same stream
-    // transform here.  This populates `delta.reasoning_content` which the
-    // AnthropicStreamConverter translates into thinking content blocks.
    //
-    // When thinking is enabled, the model's chat template likely injected
+    // For ModelInput::Tokens backends (skip_tokenizer_init=True), the engine
-    // `<think>` into the prompt (e.g., Qwen3.5), so the parser must start
+    // pipeline includes the OpenAI preprocessor which already applies reasoning
-    // in reasoning mode — the completion begins mid-reasoning without an
+    // parsing in its backward edge (postprocessor_parsing_stream). The stream
-    // explicit `<think>` tag.
+    // arriving here already has reasoning_content and content correctly split.
+    // Applying a second parser would re-classify post-think content chunks
+    // (where reasoning_content=None, content=Some) as reasoning, because the
+    // </think> boundary was consumed by the first parser and doesn't appear
+    // in the detokenized text.
+    //
+    // For ModelInput::Text backends (PushRouter, no preprocessor), reasoning
+    // parsing is NOT handled in the streaming path — the backend puts raw text
+    // (including <think> tags) in delta.content with reasoning_content=None.
+    // This is a known gap that affects all streaming handlers (OpenAI, Anthropic,
+    // Responses API) equally.
    let engine_stream: Pin<
        Box<dyn futures::Stream<Item = Annotated<NvCreateChatCompletionStreamResponse>> + Send>,
-    > = if let Some(ref reasoning_parser_name) = parsing_options.reasoning_parser {
+    > = Box::pin(engine_stream);
-        Box::pin(OpenAIPreprocessor::parse_reasoning_content_from_stream(
-            engine_stream,
-            reasoning_parser_name.clone(),
-            thinking_enabled,
-        ))
-    } else {
-        Box::pin(engine_stream)
-    };
    let mut inflight_guard =
        state

--- a/lib/llm/src/preprocessor/prompt/template.rs
+++ b/lib/llm/src/preprocessor/prompt/template.rs
@@ -135,6 +135,9 @@ struct HfTokenizerConfigJsonFormatter {
    /// When true, strip tool definitions from the chat template when tool_choice is "none".
    /// This prevents models from generating raw XML tool calls in the content field.
    exclude_tools_when_tool_choice_none: bool,
+    /// True if the chat template natively references `reasoning_content`.
+    /// When true, skip injection — the template handles it.
+    template_handles_reasoning: bool,
 }
 // /// OpenAI Standard Prompt Formatter

--- a/lib/llm/src/preprocessor/prompt/template/formatters.rs
+++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs
@@ -161,6 +161,12 @@ impl HfTokenizerConfigJsonFormatter {
        // Detect at model load time whether this template requires content arrays
        let requires_content_arrays = detect_content_array_usage(&env);
+        // Detect if the template natively handles reasoning_content (e.g. Nemotron, Qwen3).
+        // If so, we must NOT inject <think> blocks — the template does it itself.
+        let template_handles_reasoning = env
+            .templates()
+            .any(|(_, tmpl)| tmpl.source().contains("reasoning_content"));
        Ok(HfTokenizerConfigJsonFormatter {
            env,
            config,
@@ -168,6 +174,7 @@ impl HfTokenizerConfigJsonFormatter {
            supports_add_generation_prompt: supports_add_generation_prompt.unwrap_or(false),
            requires_content_arrays,
            exclude_tools_when_tool_choice_none,
+            template_handles_reasoning,
        })
    }
 }

--- a/lib/llm/src/preprocessor/prompt/template/oai.rs
+++ b/lib/llm/src/preprocessor/prompt/template/oai.rs
@@ -203,6 +203,82 @@ fn normalize_tool_arguments_in_messages(messages: &mut serde_json::Value) {
    }
 }
+/// Fold each assistant message's `reasoning_content` into its `content` field as
+/// `<think>…</think>` blocks, then drop the `reasoning_content` field.
+///
+/// Meant for chat templates that render only `{{ message.content }}`: such a
+/// template never sees `reasoning_content`, so without this step the model's prior
+/// chain-of-thought is silently dropped across turns.
+///
+/// * A non-empty string becomes one `<think>` block; an array contributes one block
+///   per non-empty string segment (other segments are ignored).
+/// * The blocks are prepended to non-empty string content, inserted as a leading
+///   `{"type": "text"}` part for array content, and become the content outright when
+///   it is missing, null, or an empty string.
+/// * Messages whose reasoning yields no text, or whose `content` is any other JSON
+///   type, are left untouched — their `reasoning_content` stays in place.
+///
+/// Reasoning goes first to match generation order (`<think>...</think> response`).
+/// Segments are concatenated rather than interleaved with tool_calls because Jinja
+/// templates render `tool_calls` separately from `content`.
+fn inject_reasoning_content_into_messages(messages: &mut serde_json::Value) {
+    use serde_json::Value;
+    let Some(entries) = messages.as_array_mut() else {
+        return;
+    };
+    for entry in entries.iter_mut() {
+        let role = entry.get("role").and_then(Value::as_str);
+        if role != Some("assistant") {
+            continue;
+        }
+        // Build the wrapped reasoning text; anything that is not a non-empty string
+        // or an array is skipped.
+        let think_text: String = match entry.get("reasoning_content") {
+            Some(Value::String(text)) if !text.is_empty() => {
+                format!("<think>{text}</think>")
+            }
+            Some(Value::Array(segments)) => segments
+                .iter()
+                .filter_map(Value::as_str)
+                .filter(|segment| !segment.is_empty())
+                .map(|segment| format!("<think>{segment}</think>"))
+                .collect(),
+            _ => continue,
+        };
+        if think_text.is_empty() {
+            continue;
+        }
+        match entry.get("content") {
+            // Non-empty string content — reasoning goes in front of it
+            Some(Value::String(body)) if !body.is_empty() => {
+                let merged = format!("{think_text}{body}");
+                entry["content"] = Value::String(merged);
+            }
+            // Absent, null, or empty string — reasoning becomes the content
+            None | Some(Value::Null | Value::String(_)) => {
+                entry["content"] = Value::String(think_text);
+            }
+            // Array (multimodal) content — reasoning becomes the first text part
+            Some(Value::Array(_)) => {
+                let think_part = serde_json::json!({
+                    "type": "text",
+                    "text": think_text
+                });
+                if let Some(parts) = entry.get_mut("content").and_then(Value::as_array_mut) {
+                    parts.insert(0, think_part);
+                }
+            }
+            // Number, bool, or object — leave the message alone rather than corrupt it
+            _ => continue,
+        }
+        // Drop the original field so the template cannot render the reasoning twice.
+        if let Some(fields) = entry.as_object_mut() {
+            fields.remove("reasoning_content");
+        }
+    }
+}
 impl OAIChatLikeRequest for NvCreateChatCompletionRequest {
    fn model(&self) -> String {
        self.inner.model.clone()
@@ -378,6 +454,14 @@ impl OAIPromptFormatter for HfTokenizerConfigJsonFormatter {
        normalize_tool_arguments_in_messages(&mut messages_for_template);
+        // Inject reasoning_content as <think> blocks into content — but only if
+        // the template doesn't handle it natively. Templates like Nemotron and
+        // Qwen3 reference reasoning_content directly in their Jinja logic; injecting
+        // would produce duplicate <think> blocks.
+        if !self.template_handles_reasoning {
+            inject_reasoning_content_into_messages(&mut messages_for_template);
+        }
        let ctx = context! {
            messages => messages_for_template,
            tools => tools,
@@ -1312,4 +1396,337 @@ NORMAL_MODE
            result
        );
    }
+    // Segmented reasoning_content on an assistant turn that also carries
+    // tool_calls: each non-empty segment must become its own <think> block ahead
+    // of the existing text, the empty segment must be dropped, and tool_calls
+    // must come through intact.
+    #[test]
+    fn test_inject_reasoning_content_segments_with_tool_calls() {
+        let mut conversation = serde_json::json!([
+            {"role": "user", "content": "What is sqrt(144) and sqrt(256)?"},
+            {
+                "role": "assistant",
+                "content": "Let me calculate those.",
+                "reasoning_content": ["I need to compute sqrt(144)", "Now sqrt(256)", ""],
+                "tool_calls": [
+                    {
+                        "id": "call_0",
+                        "type": "function",
+                        "function": {"name": "calculator", "arguments": "{\"expr\": \"sqrt(144)\"}"}
+                    },
+                    {
+                        "id": "call_1",
+                        "type": "function",
+                        "function": {"name": "calculator", "arguments": "{\"expr\": \"sqrt(256)\"}"}
+                    }
+                ]
+            }
+        ]);
+        inject_reasoning_content_into_messages(&mut conversation);
+        let reply = &conversation[1];
+        // The source field is consumed by the injection.
+        assert!(
+            reply.get("reasoning_content").is_none(),
+            "reasoning_content should be removed after injection"
+        );
+        // The text now leads with the <think> blocks built from the segments.
+        let text = reply["content"].as_str().unwrap();
+        assert!(
+            text.strip_prefix("<think>I need to compute sqrt(144)</think>")
+                .is_some(),
+            "content should start with first reasoning segment, got: {}",
+            text
+        );
+        assert!(
+            text.find("<think>Now sqrt(256)</think>").is_some(),
+            "content should contain second reasoning segment"
+        );
+        // The trailing "" segment must not surface as an empty block.
+        assert!(
+            text.find("<think></think>").is_none(),
+            "empty segments should be skipped"
+        );
+        // What the assistant originally said stays as the suffix.
+        assert!(
+            text.strip_suffix("Let me calculate those.").is_some(),
+            "original content should be at the end, got: {}",
+            text
+        );
+        // Both tool calls are still attached to the message.
+        let calls = reply.get("tool_calls");
+        assert!(calls.is_some());
+        assert_eq!(calls.and_then(|v| v.as_array()).unwrap().len(), 2);
+    }
+    // Plain-string reasoning_content: the reasoning is wrapped in a single
+    // <think> block and glued directly in front of the existing content, and the
+    // reasoning_content key disappears from the message.
+    #[test]
+    fn test_inject_reasoning_content_text_variant() {
+        let mut history = serde_json::json!([{
+            "role": "assistant",
+            "content": "The answer is 42.",
+            "reasoning_content": "Let me think about this carefully."
+        }]);
+        inject_reasoning_content_into_messages(&mut history);
+        let turn = &history[0];
+        assert!(turn.get("reasoning_content").is_none());
+        assert_eq!(
+            turn["content"].as_str().unwrap(),
+            "<think>Let me think about this carefully.</think>The answer is 42."
+        );
+    }
+    // Tool-call turn whose content is JSON null: the <think> block alone becomes
+    // the content string, and reasoning_content is dropped afterwards.
+    #[test]
+    fn test_inject_reasoning_content_null_content() {
+        let mut history = serde_json::json!([{
+            "role": "assistant",
+            "content": null,
+            "reasoning_content": "Thinking...",
+            "tool_calls": [{"id": "call_0", "type": "function", "function": {"name": "f", "arguments": "{}"}}]
+        }]);
+        inject_reasoning_content_into_messages(&mut history);
+        let turn = &history[0];
+        assert_eq!(turn["content"].as_str().unwrap(), "<think>Thinking...</think>");
+        assert!(turn.get("reasoning_content").is_none());
+    }
+    // Injection is scoped to assistant turns: a user message that happens to
+    // carry reasoning_content must pass through completely unchanged. Checking
+    // only that the field still exists would miss a regression that rewrote
+    // `content` (or the field's value) while leaving the key in place, so the
+    // whole message array is compared against a pristine copy.
+    #[test]
+    fn test_inject_reasoning_content_skips_non_assistant() {
+        let pristine = serde_json::json!([
+            {
+                "role": "user",
+                "content": "hello",
+                "reasoning_content": "should not be touched"
+            }
+        ]);
+        let mut messages = pristine.clone();
+        inject_reasoning_content_into_messages(&mut messages);
+        // The field must still be attached to the user message...
+        assert!(messages[0].get("reasoning_content").is_some());
+        // ...and no other part of the message may have been rewritten.
+        assert_eq!(
+            messages, pristine,
+            "non-assistant messages must be left untouched by injection"
+        );
+    }
+    // Test fixture: builds a formatter around a tiny chat template that prints
+    // "<role>: <content>" for every message (plus "assistant:" when a generation
+    // prompt is requested). The template never mentions reasoning_content, so
+    // anything reasoning-related in the rendered output has to come from the
+    // injection step.
+    fn make_test_formatter() -> HfTokenizerConfigJsonFormatter {
+        use super::tokcfg::ChatTemplate;
+        use super::{ContextMixins, HfTokenizerConfigJsonFormatter};
+        let jinja_src = r#"{%- for message in messages %}{{ message.role }}: {{ message.content }}
+{%- endfor %}
+{%- if add_generation_prompt %}assistant:{%- endif %}"#;
+        let tokenizer_cfg = serde_json::json!({ "chat_template": jinja_src });
+        let chat_template: ChatTemplate = serde_json::from_value(tokenizer_cfg).unwrap();
+        let mixins = ContextMixins::new(&[]);
+        HfTokenizerConfigJsonFormatter::new(chat_template, mixins).unwrap()
+    }
+    // End-to-end check for the string form of reasoning_content: after a full
+    // render, the earlier assistant turn's reasoning shows up as a <think> block
+    // alongside its original answer, and the literal field name is nowhere in the
+    // prompt.
+    #[test]
+    fn test_reasoning_content_text_roundtrip_render() {
+        use super::OAIPromptFormatter;
+        let formatter = make_test_formatter();
+        let payload = serde_json::json!({
+            "model": "test-model",
+            "messages": [
+                {"role": "user", "content": "What is sqrt(144)?"},
+                {
+                    "role": "assistant",
+                    "content": "The answer is 12.",
+                    "reasoning_content": "I need to compute the square root of 144."
+                },
+                {"role": "user", "content": "Are you sure?"}
+            ]
+        });
+        let request: NvCreateChatCompletionRequest = serde_json::from_value(payload).unwrap();
+        let prompt = formatter.render(&request).unwrap();
+        let has_think_block = prompt
+            .find("<think>I need to compute the square root of 144.</think>")
+            .is_some();
+        assert!(
+            has_think_block,
+            "reasoning_content must appear as <think> block, got: {}",
+            prompt
+        );
+        assert!(
+            prompt.find("The answer is 12.").is_some(),
+            "original content must be preserved"
+        );
+        assert!(
+            prompt.find("reasoning_content").is_none(),
+            "raw reasoning_content field should not leak into prompt"
+        );
+    }
+    // Multi-step agent transcript: the assistant reasons and calls a tool (with
+    // null content), receives the tool result, then reasons again and answers.
+    // The reasoning from both assistant turns has to be present in the rendered
+    // prompt, the final answer must survive, and the raw field name must not
+    // appear.
+    #[test]
+    fn test_reasoning_content_agentic_tool_call_roundtrip_render() {
+        use super::OAIPromptFormatter;
+        let formatter = make_test_formatter();
+        let payload = serde_json::json!({
+            "model": "test-model",
+            "messages": [
+                {"role": "user", "content": "What is sqrt(144) + sqrt(256)?"},
+                {
+                    "role": "assistant",
+                    "content": null,
+                    "reasoning_content": "I need to compute both square roots. Let me start with sqrt(144).",
+                    "tool_calls": [{
+                        "id": "call_0",
+                        "type": "function",
+                        "function": {
+                            "name": "calculator",
+                            "arguments": "{\"expr\": \"sqrt(144)\"}"
+                        }
+                    }]
+                },
+                {"role": "tool", "tool_call_id": "call_0", "content": "12"},
+                {
+                    "role": "assistant",
+                    "content": "sqrt(144) = 12 and sqrt(256) = 16, so the answer is 28.",
+                    "reasoning_content": "Got 12 for sqrt(144). Now sqrt(256) = 16. Sum is 28."
+                },
+                {"role": "user", "content": "Thanks!"}
+            ]
+        });
+        let request: NvCreateChatCompletionRequest = serde_json::from_value(payload).unwrap();
+        let prompt = formatter.render(&request).unwrap();
+        // Turn 1: reasoning attached to a tool call whose content was null.
+        assert!(
+            prompt.find("<think>I need to compute both square roots").is_some(),
+            "first turn reasoning must be in prompt, got: {}",
+            prompt
+        );
+        // Turn 2: reasoning that precedes the final answer.
+        assert!(
+            prompt.find("<think>Got 12 for sqrt(144)").is_some(),
+            "second turn reasoning must be in prompt"
+        );
+        assert!(
+            prompt.find("the answer is 28").is_some(),
+            "final answer content must be preserved"
+        );
+        // The JSON key itself must never reach the prompt text.
+        assert!(
+            prompt.find("reasoning_content").is_none(),
+            "raw reasoning_content field should not leak into prompt"
+        );
+    }
+    // With the fixture template (which never references reasoning_content) the
+    // formatter must report template_handles_reasoning == false and the render
+    // must therefore contain the injected <think> block.
+    #[test]
+    fn test_reasoning_injected_when_template_ignores_it() {
+        use super::OAIPromptFormatter;
+        let formatter = make_test_formatter();
+        // Detection result for a template with no reasoning_content reference.
+        assert!(!formatter.template_handles_reasoning);
+        let payload = serde_json::json!({
+            "model": "test-model",
+            "messages": [
+                {"role": "user", "content": "Hello"},
+                {
+                    "role": "assistant",
+                    "content": "Hi.",
+                    "reasoning_content": "The user said hello."
+                },
+                {"role": "user", "content": "Bye"}
+            ]
+        });
+        let request: NvCreateChatCompletionRequest = serde_json::from_value(payload).unwrap();
+        let prompt = formatter.render(&request).unwrap();
+        assert!(
+            prompt.find("<think>The user said hello.</think>").is_some(),
+            "injection must happen when template ignores reasoning_content, got: {}",
+            prompt
+        );
+    }
+    // Counterpart to the injection test: when the chat template itself renders
+    // message.reasoning_content, the formatter must detect that
+    // (template_handles_reasoning == true) and leave the messages alone, so the
+    // rendered prompt holds exactly one <think> block — the template's own.
+    #[test]
+    fn test_reasoning_not_injected_when_template_handles_it() {
+        use super::tokcfg::ChatTemplate;
+        use super::{ContextMixins, HfTokenizerConfigJsonFormatter, OAIPromptFormatter};
+        // This template emits <think>…</think> from reasoning_content on its own.
+        let jinja_src = r#"{%- for message in messages %}{%- if message.role == "assistant" and message.reasoning_content is defined and message.reasoning_content %}<think>{{ message.reasoning_content }}</think>
+{%- endif %}{{ message.role }}: {{ message.content }}
+{%- endfor %}
+{%- if add_generation_prompt %}assistant:{%- endif %}"#;
+        let tokenizer_cfg = serde_json::json!({ "chat_template": jinja_src });
+        let chat_template: ChatTemplate = serde_json::from_value(tokenizer_cfg).unwrap();
+        let formatter =
+            HfTokenizerConfigJsonFormatter::new(chat_template, ContextMixins::new(&[])).unwrap();
+        // The constructor should have spotted the reasoning_content reference.
+        assert!(formatter.template_handles_reasoning);
+        let payload = serde_json::json!({
+            "model": "test-model",
+            "messages": [
+                {"role": "user", "content": "Hello"},
+                {
+                    "role": "assistant",
+                    "content": "Hi.",
+                    "reasoning_content": "The user said hello."
+                },
+                {"role": "user", "content": "Bye"}
+            ]
+        });
+        let request: NvCreateChatCompletionRequest = serde_json::from_value(payload).unwrap();
+        let prompt = formatter.render(&request).unwrap();
+        // The reasoning is rendered by the template itself.
+        assert!(
+            prompt.find("<think>The user said hello.</think>").is_some(),
+            "template must render reasoning_content natively, got: {}",
+            prompt
+        );
+        // A second opening tag would mean injection ran on top of the template.
+        let opening_tags = prompt.matches("<think>").count();
+        assert_eq!(
+            opening_tags, 1,
+            "must have exactly one <think> block (from template), got {} in: {}",
+            opening_tags, prompt
+        );
+    }
 }
--- a/lib/llm/src/protocols/anthropic/types.rs
+++ b/lib/llm/src/protocols/anthropic/types.rs
@@ -154,7 +154,22 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
                    ..Default::default()
                })
            },
-            chat_template_args: None,
+            // chat_template_args may be augmented by the Anthropic handler
+            // (anthropic.rs) after conversion — e.g., setting enable_thinking=true
+            // when a reasoning parser is configured. The conversion layer only
+            // forwards the client's explicit thinking preference here; the handler
+            // has access to parsing_options and makes the final decision.
+            chat_template_args: if req
+                .thinking
+                .as_ref()
+                .is_some_and(|t| t.thinking_type == "enabled")
+            {
+                let mut args = std::collections::HashMap::new();
+                args.insert("enable_thinking".to_string(), serde_json::Value::Bool(true));
+                Some(args)
+            } else {
+                None
+            },
            media_io_kwargs: None,
            unsupported_fields: Default::default(),
        })

--- a/lib/parsers/src/reasoning/mod.rs
+++ b/lib/parsers/src/reasoning/mod.rs
@@ -389,4 +389,75 @@ mod tests {
        assert_eq!(r_k25.reasoning_text, "reasoning");
        assert_eq!(r_k25.normal_text, "answer");
    }
+    // Scenario 1: streaming after the caller has flagged the parser as already
+    // inside a reasoning block via set_in_reasoning(true). Per the original
+    // author's note this mirrors the OpenAI path, where the preprocessor detects
+    // prompt-injected reasoning. Everything streamed before </think> must be
+    // reported as reasoning and everything after it as normal content.
+    #[test]
+    fn test_nemotron_streaming_with_set_in_reasoning() {
+        let mut parser = ReasoningParserType::DeepseekR1.get_reasoning_parser();
+        // What the OpenAI path does before streaming starts.
+        parser.set_in_reasoning(true);
+        let chunks = &["Think", "ing about", " this", ".\n\n", "</think>", "Four"];
+        let (mut reasoning_seen, mut content_seen) = (String::new(), String::new());
+        for chunk in chunks {
+            let delta = parser.parse_reasoning_streaming_incremental(chunk, &[]);
+            reasoning_seen += &delta.reasoning_text;
+            content_seen += &delta.normal_text;
+        }
+        assert_eq!(reasoning_seen, "Thinking about this.\n\n");
+        assert_eq!(content_seen, "Four");
+    }
+    // Scenario 2: same token stream as scenario 1, but set_in_reasoning is never
+    // called. Per the original author's note this reproduces the Anthropic path
+    // bug (thinking_enabled=false): the DeepseekR1 parser is described as starting
+    // in reasoning mode on its own (force_reasoning=true) while
+    // stripped_think_start stays false. The </think> boundary must still split
+    // reasoning from content correctly.
+    #[test]
+    fn test_nemotron_streaming_force_reasoning_without_set_in_reasoning() {
+        // Deliberately no set_in_reasoning call on this parser.
+        let mut parser = ReasoningParserType::DeepseekR1.get_reasoning_parser();
+        let chunks = &["Think", "ing about", " this", ".\n\n", "</think>", "Four"];
+        let (mut reasoning_seen, mut content_seen) = (String::new(), String::new());
+        for chunk in chunks {
+            let delta = parser.parse_reasoning_streaming_incremental(chunk, &[]);
+            reasoning_seen += &delta.reasoning_text;
+            content_seen += &delta.normal_text;
+        }
+        assert_eq!(reasoning_seen, "Thinking about this.\n\n");
+        assert_eq!(content_seen, "Four");
+    }
+    // Scenario 3: the closing tag arrives in pieces ("</", "think", ">").
+    // The leading '<' of '</think>' is also a prefix of '<think>', so a parser
+    // that buffers possible start-tag prefixes could interfere with end-tag
+    // detection. The boundary must be found even when the tag is split, with
+    // none of the tag fragments leaking into either output stream.
+    // NOTE(review): the original comment ties the prefix-buffering risk to
+    // stripped_think_start being false, yet this test calls
+    // set_in_reasoning(true) — confirm which parser state is intended here.
+    #[test]
+    fn test_nemotron_streaming_split_end_think_tokens() {
+        let mut parser = ReasoningParserType::DeepseekR1.get_reasoning_parser();
+        parser.set_in_reasoning(true);
+        // Token-by-token stream with </think> spread over three chunks.
+        let chunks = &[
+            "reason", "ing", " done", ".", "</", "think", ">", "Hello", " world",
+        ];
+        let (mut reasoning_seen, mut content_seen) = (String::new(), String::new());
+        for chunk in chunks {
+            let delta = parser.parse_reasoning_streaming_incremental(chunk, &[]);
+            reasoning_seen += &delta.reasoning_text;
+            content_seen += &delta.normal_text;
+        }
+        assert_eq!(reasoning_seen, "reasoning done.");
+        assert_eq!(content_seen, "Hello world");
+    }
 }