Unverified Commit f0d3ce63 authored by MatejKosec's avatar MatejKosec Committed by GitHub
Browse files

fix: Anthropic streaming double-parsing + reasoning_content roundtrip (#7358)


Signed-off-by: Matej Kosec <mkosec@nvidia.com>
parent d58a6881
...@@ -4,6 +4,48 @@ ...@@ -4,6 +4,48 @@
from typing import Any, Optional from typing import Any, Optional
def _inject_reasoning_content(messages: list) -> None:
"""Inject reasoning_content as <think> blocks into content.
Chat templates only reference message["content"] — they don't see
reasoning_content. This converts it back to <think> blocks so the
model sees its own prior chain-of-thought across turns.
"""
for msg in messages:
if msg.get("role") != "assistant":
continue
reasoning = msg.get("reasoning_content")
if not reasoning:
continue
# Build <think> wrapped text
if isinstance(reasoning, str):
think_text = f"<think>{reasoning}</think>" if reasoning else ""
elif isinstance(reasoning, list):
# Segments variant: wrap each non-empty segment
parts = [f"<think>{seg}</think>" for seg in reasoning if seg]
think_text = "".join(parts)
else:
continue
if not think_text:
continue
# Prepend to content
existing = msg.get("content")
if isinstance(existing, str):
msg["content"] = think_text + existing
elif isinstance(existing, list):
# Multimodal content array — prepend as text part
msg["content"] = [{"type": "text", "text": think_text}] + existing
else:
# null or absent
msg["content"] = think_text
# Remove so template doesn't see both
msg.pop("reasoning_content", None)
class InputParamManager: class InputParamManager:
def __init__(self, tokenizer: Any) -> None: def __init__(self, tokenizer: Any) -> None:
self.tokenizer = tokenizer self.tokenizer = tokenizer
...@@ -18,8 +60,32 @@ class InputParamManager: ...@@ -18,8 +60,32 @@ class InputParamManager:
raise ValueError("Tokenizer is not available") raise ValueError("Tokenizer is not available")
if "messages" in request: if "messages" in request:
# Forward chat_template_args / chat_template_kwargs to the
# template so model-specific variables (e.g. enable_thinking)
# are available during rendering.
extra_kwargs = {}
if "chat_template_kwargs" in request:
extra_kwargs.update(request["chat_template_kwargs"])
if "chat_template_args" in request:
extra_kwargs.update(request["chat_template_args"])
# Strip keys that are already set explicitly to avoid
# TypeError: got multiple values for keyword argument.
for reserved in ("tokenize", "add_generation_prompt"):
extra_kwargs.pop(reserved, None)
# Inject reasoning_content as <think> blocks into content,
# but only if the template doesn't handle it natively.
# Templates like Nemotron and Qwen3 reference reasoning_content
# directly — injecting would produce duplicate <think> blocks.
chat_template_src = getattr(self.tokenizer, "chat_template", "") or ""
if "reasoning_content" not in chat_template_src:
_inject_reasoning_content(request["messages"])
return self.tokenizer.apply_chat_template( return self.tokenizer.apply_chat_template(
request["messages"], tokenize=False, add_generation_prompt=True request["messages"],
tokenize=False,
add_generation_prompt=True,
**extra_kwargs,
) )
elif "prompt" in request: elif "prompt" in request:
return self.tokenizer.encode(request["prompt"]) return self.tokenizer.encode(request["prompt"])
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Unit tests for _inject_reasoning_content in input_params.py.
Verifies that reasoning_content from prior assistant turns is converted
to <think> blocks in the content field before chat template rendering.
"""
import copy
from dynamo.common.utils.input_params import _inject_reasoning_content
class TestInjectReasoningContent:
    """Test suite for _inject_reasoning_content.

    Every test builds a small message list, runs the injection in place and
    checks both the resulting content and whether reasoning_content was
    consumed (removed) or left untouched.
    """

    def test_text_variant_prepends_to_content(self):
        """Text reasoning_content is wrapped in <think> and prepended."""
        messages = [
            {
                "role": "assistant",
                "content": "The answer is 12.",
                "reasoning_content": "sqrt(144) = 12",
            },
        ]
        _inject_reasoning_content(messages)
        assert (
            messages[0]["content"] == "<think>sqrt(144) = 12</think>The answer is 12."
        )
        assert "reasoning_content" not in messages[0]

    def test_segments_variant_wraps_each_segment(self):
        """Segments are individually wrapped in <think> blocks."""
        messages = [
            {
                "role": "assistant",
                "content": "Done.",
                "reasoning_content": ["first thought", "second thought", ""],
            },
        ]
        _inject_reasoning_content(messages)
        content = messages[0]["content"]
        assert content.startswith("<think>first thought</think>")
        assert "<think>second thought</think>" in content
        assert "<think></think>" not in content  # empty segment skipped
        assert content.endswith("Done.")
        assert "reasoning_content" not in messages[0]

    def test_null_content_creates_from_reasoning(self):
        """When content is null/None, reasoning becomes the content."""
        messages = [
            {"role": "assistant", "content": None, "reasoning_content": "Thinking..."},
        ]
        _inject_reasoning_content(messages)
        assert messages[0]["content"] == "<think>Thinking...</think>"
        assert "reasoning_content" not in messages[0]

    def test_absent_content_creates_from_reasoning(self):
        """When content key is absent, reasoning becomes the content."""
        messages = [
            {"role": "assistant", "reasoning_content": "Thinking..."},
        ]
        _inject_reasoning_content(messages)
        assert messages[0]["content"] == "<think>Thinking...</think>"
        assert "reasoning_content" not in messages[0]

    def test_multimodal_content_prepends_text_part(self):
        """Array content gets a text part prepended, not replaced."""
        messages = [
            {
                "role": "assistant",
                "content": [{"type": "text", "text": "Here is the image."}],
                "reasoning_content": "Analyzing the image...",
            },
        ]
        _inject_reasoning_content(messages)
        content = messages[0]["content"]
        assert isinstance(content, list)
        assert len(content) == 2
        assert content[0] == {
            "type": "text",
            "text": "<think>Analyzing the image...</think>",
        }
        assert content[1] == {"type": "text", "text": "Here is the image."}
        assert "reasoning_content" not in messages[0]

    def test_skips_non_assistant_messages(self):
        """User and tool messages are not modified."""
        messages = [
            {
                "role": "user",
                "content": "hello",
                "reasoning_content": "should not touch",
            },
            {
                "role": "tool",
                "content": "result",
                "reasoning_content": "should not touch",
            },
        ]
        original = copy.deepcopy(messages)
        _inject_reasoning_content(messages)
        assert messages == original

    def test_skips_empty_reasoning(self):
        """Empty string reasoning_content is skipped."""
        messages = [
            {"role": "assistant", "content": "Answer.", "reasoning_content": ""},
        ]
        _inject_reasoning_content(messages)
        assert messages[0]["content"] == "Answer."
        # The message was skipped entirely, so the (empty) field is still there.
        assert messages[0]["reasoning_content"] == ""

    def test_agentic_multi_turn_tool_call_flow(self):
        """Full agentic flow: reason → tool_call → tool_result → reason → answer."""
        messages = [
            {"role": "user", "content": "What is sqrt(144) + sqrt(256)?"},
            {
                "role": "assistant",
                "content": None,
                "reasoning_content": "I need to compute sqrt(144) first.",
                "tool_calls": [
                    {
                        "id": "call_0",
                        "type": "function",
                        "function": {
                            "name": "calc",
                            "arguments": '{"expr": "sqrt(144)"}',
                        },
                    },
                ],
            },
            {"role": "tool", "tool_call_id": "call_0", "content": "12"},
            {
                "role": "assistant",
                "content": "The answer is 28.",
                "reasoning_content": "Got 12. sqrt(256) = 16. Sum = 28.",
            },
            {"role": "user", "content": "Thanks!"},
        ]
        _inject_reasoning_content(messages)

        # First assistant turn: reasoning injected, null content → reasoning only
        assert (
            messages[1]["content"]
            == "<think>I need to compute sqrt(144) first.</think>"
        )
        assert "reasoning_content" not in messages[1]
        assert "tool_calls" in messages[1]  # tool_calls untouched

        # Tool message untouched
        assert messages[2]["content"] == "12"

        # Second assistant turn: reasoning prepended to content
        assert (
            messages[3]["content"]
            == "<think>Got 12. sqrt(256) = 16. Sum = 28.</think>The answer is 28."
        )
        assert "reasoning_content" not in messages[3]

        # User messages untouched
        assert messages[0]["content"] == "What is sqrt(144) + sqrt(256)?"
        assert messages[4]["content"] == "Thanks!"
class TestInputParamManagerReasoningInjection:
    """Check that InputParamManager inspects the chat template before injecting."""

    @staticmethod
    def _messages_passed_to_template(chat_template):
        """Render a fixed two-message request through a mocked tokenizer.

        The tokenizer mock exposes ``chat_template`` as its template source.
        Returns the first positional argument that ``apply_chat_template``
        was called with, i.e. the message list as the template would see it.
        """
        from unittest.mock import MagicMock

        from dynamo.common.utils.input_params import InputParamManager

        fake_tokenizer = MagicMock()
        fake_tokenizer.chat_template = chat_template
        fake_tokenizer.apply_chat_template = MagicMock(return_value="rendered")

        payload = {
            "messages": [
                {
                    "role": "assistant",
                    "content": "Hi.",
                    "reasoning_content": "thinking...",
                },
                {"role": "user", "content": "Bye"},
            ]
        }
        InputParamManager(fake_tokenizer).get_input_param(payload, use_tokenizer=True)
        return fake_tokenizer.apply_chat_template.call_args[0][0]

    def test_injects_when_template_ignores_reasoning(self):
        """Templates without reasoning_content get injection."""
        seen = self._messages_passed_to_template(
            "{% for m in messages %}{{ m.role }}: {{ m.content }}{% endfor %}"
        )
        # Injection happened: the field is gone and content carries the <think> block.
        first = seen[0]
        assert "reasoning_content" not in first
        assert first["content"].startswith("<think>thinking...</think>")

    def test_skips_injection_when_template_handles_reasoning(self):
        """Templates with reasoning_content are left alone."""
        seen = self._messages_passed_to_template(
            "{% for m in messages %}"
            "{% if m.reasoning_content %}<think>{{ m.reasoning_content }}</think>{% endif %}"
            "{{ m.role }}: {{ m.content }}{% endfor %}"
        )
        # Injection was skipped: the field is still present and content is unchanged.
        first = seen[0]
        assert first["reasoning_content"] == "thinking..."
        assert first["content"] == "Hi."
...@@ -33,7 +33,6 @@ use super::{ ...@@ -33,7 +33,6 @@ use super::{
metrics::{CancellationLabels, Endpoint, process_response_and_observe_metrics}, metrics::{CancellationLabels, Endpoint, process_response_and_observe_metrics},
service_v2, service_v2,
}; };
use crate::preprocessor::OpenAIPreprocessor;
use crate::protocols::anthropic::stream_converter::AnthropicStreamConverter; use crate::protocols::anthropic::stream_converter::AnthropicStreamConverter;
use crate::protocols::anthropic::types::{ use crate::protocols::anthropic::types::{
AnthropicCountTokensRequest, AnthropicCountTokensResponse, AnthropicCreateMessageRequest, AnthropicCountTokensRequest, AnthropicCountTokensResponse, AnthropicCreateMessageRequest,
...@@ -192,19 +191,30 @@ async fn anthropic_messages( ...@@ -192,19 +191,30 @@ async fn anthropic_messages(
tracing::trace!("Received Anthropic messages request: {:?}", &*request); tracing::trace!("Received Anthropic messages request: {:?}", &*request);
// Look up engine and parsing options early so we know whether a reasoning
// parser is configured before converting the request.
let (engine, parsing_options) = state
.manager()
.get_chat_completions_engine_with_parsing(&model)
.map_err(|_| {
anthropic_error(
StatusCode::NOT_FOUND,
"not_found_error",
&format!("Model '{}' not found", model),
)
})?;
let (orig_request, context) = request.into_parts(); let (orig_request, context) = request.into_parts();
let model_for_resp = orig_request.model.clone(); let model_for_resp = orig_request.model.clone();
// Check if the Anthropic request explicitly enabled thinking. When thinking // Check if the Anthropic request explicitly disabled thinking.
// is enabled, reasoning-capable models' chat templates typically inject let thinking_explicitly_disabled = orig_request
// `<think>` into the prompt, so the completion starts mid-reasoning.
let thinking_enabled = orig_request
.thinking .thinking
.as_ref() .as_ref()
.is_some_and(|t| t.thinking_type == "enabled"); .is_some_and(|t| t.thinking_type == "disabled");
// Convert Anthropic request -> Chat Completion request // Convert Anthropic request -> Chat Completion request
let chat_request: NvCreateChatCompletionRequest = let mut chat_request: NvCreateChatCompletionRequest =
orig_request.try_into().map_err(|e: anyhow::Error| { orig_request.try_into().map_err(|e: anyhow::Error| {
tracing::error!( tracing::error!(
request_id, request_id,
...@@ -218,20 +228,42 @@ async fn anthropic_messages( ...@@ -218,20 +228,42 @@ async fn anthropic_messages(
) )
})?; })?;
let request = context.map(|_req| chat_request); // When a reasoning parser is configured and the client hasn't explicitly
// disabled thinking, assume the model's chat template will inject `<think>`.
tracing::trace!("Getting chat completions engine for model: {}", model); //
// Two things must be aligned:
// 1. chat_template_args must include enable_thinking=true so the backend's
// template actually injects `<think>` into the prompt. For the
// ModelInput::Text path (SGLang without --skip-tokenizer-init), the
// backend applies the template — without explicit enable_thinking the
// result depends on the template's default which varies by model.
// 2. prompt_injected_reasoning must be true so the parser starts in
// reasoning mode with stripped_think_start=true, which is critical for
// correct `</think>` boundary detection in the streaming path.
//
// The OpenAI path handles this in the preprocessor: it renders the template,
// inspects the formatted prompt for a trailing `<think>`, and sets
// prompt_injected_reasoning accordingly. The Anthropic path bypasses the
// preprocessor, so we infer prompt injection from the reasoning parser config.
let prompt_injected_reasoning =
parsing_options.reasoning_parser.is_some() && !thinking_explicitly_disabled;
if prompt_injected_reasoning {
let args = chat_request
.chat_template_args
.get_or_insert_with(Default::default);
args.entry("enable_thinking".to_string())
.or_insert(serde_json::Value::Bool(true));
// Preserve reasoning from prior turns. Some templates (Nemotron)
// strip historical <think> content by default to save context.
// For agentic flows the model needs to see why it made prior decisions.
// Ref: NVIDIA's SWE training config also sets this to false:
// https://github.com/NVIDIA-NeMo/Nemotron/blob/main/src/nemotron/recipes/super3/stage2_rl/stage2_swe2/config/default.yaml#L287
args.entry("truncate_history_thinking".to_string())
.or_insert(serde_json::Value::Bool(false));
}
let (engine, parsing_options) = state let request = context.map(|_req| chat_request);
.manager()
.get_chat_completions_engine_with_parsing(&model)
.map_err(|_| {
anthropic_error(
StatusCode::NOT_FOUND,
"not_found_error",
&format!("Model '{}' not found", model),
)
})?;
let mut response_collector = state.metrics_clone().create_response_collector(&model); let mut response_collector = state.metrics_clone().create_response_collector(&model);
...@@ -247,27 +279,25 @@ async fn anthropic_messages( ...@@ -247,27 +279,25 @@ async fn anthropic_messages(
let ctx = engine_stream.context(); let ctx = engine_stream.context();
// Apply reasoning parser to the engine stream if configured. // NOTE: We intentionally do NOT apply a reasoning parser here.
// The preprocessor (which normally handles this for the OpenAI path) is
// bypassed by the Anthropic endpoint, so we apply the same stream
// transform here. This populates `delta.reasoning_content` which the
// AnthropicStreamConverter translates into thinking content blocks.
// //
// When thinking is enabled, the model's chat template likely injected // For ModelInput::Tokens backends (skip_tokenizer_init=True), the engine
// `<think>` into the prompt (e.g., Qwen3.5), so the parser must start // pipeline includes the OpenAI preprocessor which already applies reasoning
// in reasoning mode — the completion begins mid-reasoning without an // parsing in its backward edge (postprocessor_parsing_stream). The stream
// explicit `<think>` tag. // arriving here already has reasoning_content and content correctly split.
// Applying a second parser would re-classify post-think content chunks
// (where reasoning_content=None, content=Some) as reasoning, because the
// </think> boundary was consumed by the first parser and doesn't appear
// in the detokenized text.
//
// For ModelInput::Text backends (PushRouter, no preprocessor), reasoning
// parsing is NOT handled in the streaming path — the backend puts raw text
// (including <think> tags) in delta.content with reasoning_content=None.
// This is a known gap that affects all streaming handlers (OpenAI, Anthropic,
// Responses API) equally.
let engine_stream: Pin< let engine_stream: Pin<
Box<dyn futures::Stream<Item = Annotated<NvCreateChatCompletionStreamResponse>> + Send>, Box<dyn futures::Stream<Item = Annotated<NvCreateChatCompletionStreamResponse>> + Send>,
> = if let Some(ref reasoning_parser_name) = parsing_options.reasoning_parser { > = Box::pin(engine_stream);
Box::pin(OpenAIPreprocessor::parse_reasoning_content_from_stream(
engine_stream,
reasoning_parser_name.clone(),
thinking_enabled,
))
} else {
Box::pin(engine_stream)
};
let mut inflight_guard = let mut inflight_guard =
state state
......
...@@ -135,6 +135,9 @@ struct HfTokenizerConfigJsonFormatter { ...@@ -135,6 +135,9 @@ struct HfTokenizerConfigJsonFormatter {
/// When true, strip tool definitions from the chat template when tool_choice is "none". /// When true, strip tool definitions from the chat template when tool_choice is "none".
/// This prevents models from generating raw XML tool calls in the content field. /// This prevents models from generating raw XML tool calls in the content field.
exclude_tools_when_tool_choice_none: bool, exclude_tools_when_tool_choice_none: bool,
/// True if the chat template natively references `reasoning_content`.
/// When true, skip injection — the template handles it.
template_handles_reasoning: bool,
} }
// /// OpenAI Standard Prompt Formatter // /// OpenAI Standard Prompt Formatter
......
...@@ -161,6 +161,12 @@ impl HfTokenizerConfigJsonFormatter { ...@@ -161,6 +161,12 @@ impl HfTokenizerConfigJsonFormatter {
// Detect at model load time whether this template requires content arrays // Detect at model load time whether this template requires content arrays
let requires_content_arrays = detect_content_array_usage(&env); let requires_content_arrays = detect_content_array_usage(&env);
// Detect if the template natively handles reasoning_content (e.g. Nemotron, Qwen3).
// If so, we must NOT inject <think> blocks — the template does it itself.
let template_handles_reasoning = env
.templates()
.any(|(_, tmpl)| tmpl.source().contains("reasoning_content"));
Ok(HfTokenizerConfigJsonFormatter { Ok(HfTokenizerConfigJsonFormatter {
env, env,
config, config,
...@@ -168,6 +174,7 @@ impl HfTokenizerConfigJsonFormatter { ...@@ -168,6 +174,7 @@ impl HfTokenizerConfigJsonFormatter {
supports_add_generation_prompt: supports_add_generation_prompt.unwrap_or(false), supports_add_generation_prompt: supports_add_generation_prompt.unwrap_or(false),
requires_content_arrays, requires_content_arrays,
exclude_tools_when_tool_choice_none, exclude_tools_when_tool_choice_none,
template_handles_reasoning,
}) })
} }
} }
......
...@@ -203,6 +203,82 @@ fn normalize_tool_arguments_in_messages(messages: &mut serde_json::Value) { ...@@ -203,6 +203,82 @@ fn normalize_tool_arguments_in_messages(messages: &mut serde_json::Value) {
} }
} }
/// Inject `reasoning_content` back into the `content` field as `<think>` blocks.
///
/// Chat templates only reference `{{ message.content }}` — they don't know about
/// `reasoning_content`. Without this injection, the model's prior chain-of-thought
/// is silently dropped across turns.
///
/// Uses `<think>`/`</think>` delimiters — the same tags that reasoning models emit
/// and that the reasoning parser strips on output. Reasoning is prepended to content
/// to match the original generation order (`<think>...</think> response`).
///
/// Segments are concatenated rather than interleaved with tool_calls because Jinja
/// templates render `tool_calls` separately from `content`. The model still sees
/// all reasoning text before the template-rendered tool call block.
fn inject_reasoning_content_into_messages(messages: &mut serde_json::Value) {
let Some(msgs) = messages.as_array_mut() else {
return;
};
for msg in msgs.iter_mut() {
if msg.get("role").and_then(|r| r.as_str()) != Some("assistant") {
continue;
}
let reasoning = match msg.get("reasoning_content") {
Some(serde_json::Value::String(s)) if !s.is_empty() => {
format!("<think>{}</think>", s)
}
Some(serde_json::Value::Array(segments)) => {
let mut result = String::new();
for seg in segments {
if let Some(s) = seg.as_str()
&& !s.is_empty()
{
result.push_str("<think>");
result.push_str(s);
result.push_str("</think>");
}
}
if result.is_empty() {
continue;
}
result
}
_ => continue,
};
match msg.get("content") {
// Content is a string or null — prepend reasoning as text
Some(serde_json::Value::String(s)) if !s.is_empty() => {
msg["content"] = serde_json::Value::String(format!("{}{}", reasoning, s));
}
None | Some(serde_json::Value::Null) | Some(serde_json::Value::String(_)) => {
msg["content"] = serde_json::Value::String(reasoning);
}
// Content is an array (multimodal) — prepend as a text part
Some(serde_json::Value::Array(_)) => {
let think_part = serde_json::json!({
"type": "text",
"text": reasoning
});
if let Some(arr) = msg.get_mut("content").and_then(|v| v.as_array_mut()) {
arr.insert(0, think_part);
}
}
// Other types (number, bool, object) — skip, don't corrupt
_ => continue,
}
// Remove so the template doesn't see both the injected <think> in content
// and the original reasoning_content field.
if let Some(obj) = msg.as_object_mut() {
obj.remove("reasoning_content");
}
}
}
impl OAIChatLikeRequest for NvCreateChatCompletionRequest { impl OAIChatLikeRequest for NvCreateChatCompletionRequest {
fn model(&self) -> String { fn model(&self) -> String {
self.inner.model.clone() self.inner.model.clone()
...@@ -378,6 +454,14 @@ impl OAIPromptFormatter for HfTokenizerConfigJsonFormatter { ...@@ -378,6 +454,14 @@ impl OAIPromptFormatter for HfTokenizerConfigJsonFormatter {
normalize_tool_arguments_in_messages(&mut messages_for_template); normalize_tool_arguments_in_messages(&mut messages_for_template);
// Inject reasoning_content as <think> blocks into content — but only if
// the template doesn't handle it natively. Templates like Nemotron and
// Qwen3 reference reasoning_content directly in their Jinja logic; injecting
// would produce duplicate <think> blocks.
if !self.template_handles_reasoning {
inject_reasoning_content_into_messages(&mut messages_for_template);
}
let ctx = context! { let ctx = context! {
messages => messages_for_template, messages => messages_for_template,
tools => tools, tools => tools,
...@@ -1312,4 +1396,337 @@ NORMAL_MODE ...@@ -1312,4 +1396,337 @@ NORMAL_MODE
result result
); );
} }
#[test]
fn test_inject_reasoning_content_segments_with_tool_calls() {
    // Assistant message with reasoning_content segments (one of them empty)
    // next to two tool_calls. The whole resulting content string is pinned so
    // stray text between or after the <think> blocks cannot slip through.
    let user_message = serde_json::json!({
        "role": "user",
        "content": "What is sqrt(144) and sqrt(256)?"
    });
    let mut messages = serde_json::json!([
        user_message.clone(),
        {
            "role": "assistant",
            "content": "Let me calculate those.",
            "reasoning_content": ["I need to compute sqrt(144)", "Now sqrt(256)", ""],
            "tool_calls": [
                {
                    "id": "call_0",
                    "type": "function",
                    "function": {
                        "name": "calculator",
                        "arguments": "{\"expr\": \"sqrt(144)\"}"
                    }
                },
                {
                    "id": "call_1",
                    "type": "function",
                    "function": {
                        "name": "calculator",
                        "arguments": "{\"expr\": \"sqrt(256)\"}"
                    }
                }
            ]
        }
    ]);
    let tool_calls_before = messages[1]["tool_calls"].clone();

    inject_reasoning_content_into_messages(&mut messages);

    let assistant = &messages[1];

    // reasoning_content should be removed
    assert!(
        assistant.get("reasoning_content").is_none(),
        "reasoning_content should be removed after injection"
    );

    // Non-empty segments are wrapped in order, the empty third segment yields
    // no <think></think>, and the original content follows directly.
    assert_eq!(
        assistant["content"].as_str(),
        Some(
            "<think>I need to compute sqrt(144)</think><think>Now sqrt(256)</think>Let me calculate those."
        ),
        "segments must be wrapped in order and prepended to the original content"
    );

    // tool_calls should be untouched (same entries, same payloads)
    assert_eq!(
        assistant["tool_calls"], tool_calls_before,
        "tool_calls should be untouched"
    );
    assert_eq!(assistant["tool_calls"].as_array().unwrap().len(), 2);

    // The preceding user message must not be modified either.
    assert_eq!(messages[0], user_message);
}
#[test]
fn test_inject_reasoning_content_text_variant() {
    // Single assistant turn carrying plain-string reasoning.
    let mut conversation = serde_json::json!([
        {
            "role": "assistant",
            "content": "The answer is 42.",
            "reasoning_content": "Let me think about this carefully."
        }
    ]);

    inject_reasoning_content_into_messages(&mut conversation);

    let turn = &conversation[0];
    // The field is consumed by the injection...
    assert!(turn.get("reasoning_content").is_none());
    // ...and shows up as a <think> block directly ahead of the answer.
    let rendered_content = turn["content"].as_str().unwrap();
    assert_eq!(
        rendered_content,
        "<think>Let me think about this carefully.</think>The answer is 42."
    );
}
#[test]
fn test_inject_reasoning_content_null_content() {
    // Tool-calling turn: reasoning_content is set while content is null.
    let mut conversation = serde_json::json!([
        {
            "role": "assistant",
            "content": null,
            "reasoning_content": "Thinking...",
            "tool_calls": [{"id": "call_0", "type": "function", "function": {"name": "f", "arguments": "{}"}}]
        }
    ]);

    inject_reasoning_content_into_messages(&mut conversation);

    let turn = &conversation[0];
    // With nothing to prepend to, the <think> block becomes the whole content.
    let injected = turn["content"].as_str().unwrap();
    assert_eq!(injected, "<think>Thinking...</think>");
    assert!(turn.get("reasoning_content").is_none());
}
#[test]
fn test_inject_reasoning_content_skips_non_assistant() {
    // Injection applies to assistant turns only. Compare whole messages so
    // that a changed `content` is caught as well as a removed field.
    let original = serde_json::json!([
        {
            "role": "user",
            "content": "hello",
            "reasoning_content": "should not be touched"
        },
        {
            "role": "tool",
            "content": "result",
            "reasoning_content": "should not be touched"
        }
    ]);
    let mut messages = original.clone();

    inject_reasoning_content_into_messages(&mut messages);

    // User message should be untouched
    assert!(messages[0].get("reasoning_content").is_some());
    // Every non-assistant message must come back unchanged.
    assert_eq!(
        messages, original,
        "non-assistant messages must not be modified"
    );
}
// Helper: build a formatter around a tiny chat template for the render tests.
fn make_test_formatter() -> HfTokenizerConfigJsonFormatter {
    use super::tokcfg::ChatTemplate;
    use super::{ContextMixins, HfTokenizerConfigJsonFormatter};

    // The template prints "<role>: <content>" per message and never mentions
    // reasoning_content, so anything injected into content shows up verbatim
    // in the rendered prompt.
    let template_source = r#"{%- for message in messages %}{{ message.role }}: {{ message.content }}
{%- endfor %}
{%- if add_generation_prompt %}assistant:{%- endif %}"#;

    let tokenizer_config = serde_json::json!({
        "chat_template": template_source
    });
    let chat_template = serde_json::from_value::<ChatTemplate>(tokenizer_config).unwrap();
    let mixins = ContextMixins::new(&[]);

    HfTokenizerConfigJsonFormatter::new(chat_template, mixins).unwrap()
}
// A prior assistant turn with string reasoning_content must show up in the
// rendered prompt as a <think> block, next to its original content.
#[test]
fn test_reasoning_content_text_roundtrip_render() {
    use super::OAIPromptFormatter;

    let payload = serde_json::json!({
        "model": "test-model",
        "messages": [
            {"role": "user", "content": "What is sqrt(144)?"},
            {
                "role": "assistant",
                "content": "The answer is 12.",
                "reasoning_content": "I need to compute the square root of 144."
            },
            {"role": "user", "content": "Are you sure?"}
        ]
    });
    let chat_request = serde_json::from_value::<NvCreateChatCompletionRequest>(payload).unwrap();

    let prompt = make_test_formatter().render(&chat_request).unwrap();

    let has_think_block =
        prompt.contains("<think>I need to compute the square root of 144.</think>");
    assert!(
        has_think_block,
        "reasoning_content must appear as <think> block, got: {}",
        prompt
    );

    let keeps_answer = prompt.contains("The answer is 12.");
    assert!(keeps_answer, "original content must be preserved");

    let leaks_field_name = prompt.contains("reasoning_content");
    assert!(
        !leaks_field_name,
        "raw reasoning_content field should not leak into prompt"
    );
}
// Full agentic exchange: the assistant reasons and calls a tool, receives the
// result, then reasons again and answers. The reasoning of both assistant
// turns has to be present in the rendered prompt.
#[test]
fn test_reasoning_content_agentic_tool_call_roundtrip_render() {
    use super::OAIPromptFormatter;

    let payload = serde_json::json!({
        "model": "test-model",
        "messages": [
            {"role": "user", "content": "What is sqrt(144) + sqrt(256)?"},
            {
                "role": "assistant",
                "content": null,
                "reasoning_content": "I need to compute both square roots. Let me start with sqrt(144).",
                "tool_calls": [{
                    "id": "call_0",
                    "type": "function",
                    "function": {
                        "name": "calculator",
                        "arguments": "{\"expr\": \"sqrt(144)\"}"
                    }
                }]
            },
            {
                "role": "tool",
                "tool_call_id": "call_0",
                "content": "12"
            },
            {
                "role": "assistant",
                "content": "sqrt(144) = 12 and sqrt(256) = 16, so the answer is 28.",
                "reasoning_content": "Got 12 for sqrt(144). Now sqrt(256) = 16. Sum is 28."
            },
            {"role": "user", "content": "Thanks!"}
        ]
    });
    let chat_request = serde_json::from_value::<NvCreateChatCompletionRequest>(payload).unwrap();

    let prompt = make_test_formatter().render(&chat_request).unwrap();

    // Turn 1: reasoning attached to a tool call whose content was null.
    let first_turn_present = prompt.contains("<think>I need to compute both square roots");
    assert!(
        first_turn_present,
        "first turn reasoning must be in prompt, got: {}",
        prompt
    );

    // Turn 2: reasoning that precedes the final answer.
    let second_turn_present = prompt.contains("<think>Got 12 for sqrt(144)");
    assert!(
        second_turn_present,
        "second turn reasoning must be in prompt"
    );

    let answer_present = prompt.contains("the answer is 28");
    assert!(answer_present, "final answer content must be preserved");

    // The raw field name must never reach the prompt text.
    let leaks_field_name = prompt.contains("reasoning_content");
    assert!(
        !leaks_field_name,
        "raw reasoning_content field should not leak into prompt"
    );
}
// The template never mentions reasoning_content, so the formatter has to
// inject the <think> block itself.
#[test]
fn test_reasoning_injected_when_template_ignores_it() {
    use super::OAIPromptFormatter;

    let formatter = make_test_formatter();
    // Detection must report that this simple template does not handle reasoning.
    let handled_by_template = formatter.template_handles_reasoning;
    assert!(!handled_by_template);

    let payload = serde_json::json!({
        "model": "test-model",
        "messages": [
            {"role": "user", "content": "Hello"},
            {
                "role": "assistant",
                "content": "Hi.",
                "reasoning_content": "The user said hello."
            },
            {"role": "user", "content": "Bye"}
        ]
    });
    let chat_request = serde_json::from_value::<NvCreateChatCompletionRequest>(payload).unwrap();

    let prompt = formatter.render(&chat_request).unwrap();

    let injected = prompt.contains("<think>The user said hello.</think>");
    assert!(
        injected,
        "injection must happen when template ignores reasoning_content, got: {}",
        prompt
    );
}
// The template renders reasoning_content on its own, so the formatter must
// not inject a second <think> block.
#[test]
fn test_reasoning_not_injected_when_template_handles_it() {
    use super::tokcfg::ChatTemplate;
    use super::{ContextMixins, HfTokenizerConfigJsonFormatter, OAIPromptFormatter};

    // Template that emits <think>...</think> from message.reasoning_content
    // for assistant turns before printing "<role>: <content>".
    let template_source = r#"{%- for message in messages %}{%- if message.role == "assistant" and message.reasoning_content is defined and message.reasoning_content %}<think>{{ message.reasoning_content }}</think>
{%- endif %}{{ message.role }}: {{ message.content }}
{%- endfor %}
{%- if add_generation_prompt %}assistant:{%- endif %}"#;

    let tokenizer_config = serde_json::json!({
        "chat_template": template_source
    });
    let chat_template = serde_json::from_value::<ChatTemplate>(tokenizer_config).unwrap();
    let formatter =
        HfTokenizerConfigJsonFormatter::new(chat_template, ContextMixins::new(&[])).unwrap();

    // Detection must have flagged the template as reasoning-aware.
    let handled_by_template = formatter.template_handles_reasoning;
    assert!(handled_by_template);

    let payload = serde_json::json!({
        "model": "test-model",
        "messages": [
            {"role": "user", "content": "Hello"},
            {
                "role": "assistant",
                "content": "Hi.",
                "reasoning_content": "The user said hello."
            },
            {"role": "user", "content": "Bye"}
        ]
    });
    let chat_request = serde_json::from_value::<NvCreateChatCompletionRequest>(payload).unwrap();

    let prompt = formatter.render(&chat_request).unwrap();

    // The template itself produced the block...
    let rendered_natively = prompt.contains("<think>The user said hello.</think>");
    assert!(
        rendered_natively,
        "template must render reasoning_content natively, got: {}",
        prompt
    );

    // ...and there is exactly one, i.e. nothing was injected on top of it.
    let think_count = prompt.matches("<think>").count();
    assert_eq!(
        think_count, 1,
        "must have exactly one <think> block (from template), got {} in: {}",
        think_count, prompt
    );
}
} }
...@@ -154,7 +154,22 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest { ...@@ -154,7 +154,22 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
..Default::default() ..Default::default()
}) })
}, },
chat_template_args: None, // chat_template_args may be augmented by the Anthropic handler
// (anthropic.rs) after conversion — e.g., setting enable_thinking=true
// when a reasoning parser is configured. The conversion layer only
// forwards the client's explicit thinking preference here; the handler
// has access to parsing_options and makes the final decision.
chat_template_args: if req
.thinking
.as_ref()
.is_some_and(|t| t.thinking_type == "enabled")
{
let mut args = std::collections::HashMap::new();
args.insert("enable_thinking".to_string(), serde_json::Value::Bool(true));
Some(args)
} else {
None
},
media_io_kwargs: None, media_io_kwargs: None,
unsupported_fields: Default::default(), unsupported_fields: Default::default(),
}) })
......
...@@ -389,4 +389,75 @@ mod tests { ...@@ -389,4 +389,75 @@ mod tests {
assert_eq!(r_k25.reasoning_text, "reasoning"); assert_eq!(r_k25.reasoning_text, "reasoning");
assert_eq!(r_k25.normal_text, "answer"); assert_eq!(r_k25.normal_text, "answer");
} }
// Scenario 1: streaming after an explicit `set_in_reasoning(true)`.
// Mirrors the OpenAI path described in the original note, where the
// preprocessor detects prompt-injected reasoning and flips the parser into
// reasoning mode up front. Everything before `</think>` must be reported as
// reasoning and everything after it as normal content.
#[test]
fn test_nemotron_streaming_with_set_in_reasoning() {
    let mut parser = ReasoningParserType::DeepseekR1.get_reasoning_parser();
    // The OpenAI path makes this call before streaming starts.
    parser.set_in_reasoning(true);

    let tokens = &["Think", "ing about", " this", ".\n\n", "</think>", "Four"];

    // Feed the chunks one by one, concatenating each side of the split.
    let (reasoning_seen, content_seen) = tokens.iter().fold(
        (String::new(), String::new()),
        |(mut reasoning_acc, mut content_acc), chunk| {
            let parsed = parser.parse_reasoning_streaming_incremental(chunk, &[]);
            reasoning_acc.push_str(&parsed.reasoning_text);
            content_acc.push_str(&parsed.normal_text);
            (reasoning_acc, content_acc)
        },
    );

    assert_eq!(reasoning_seen, "Thinking about this.\n\n");
    assert_eq!(content_seen, "Four");
}
// Scenario 2: same token stream as scenario 1, but `set_in_reasoning` is
// never called. Per the original note this reproduces the Anthropic path bug
// (thinking_enabled=false), where the parser is said to start in reasoning
// mode via force_reasoning while stripped_think_start stays false. The
// `</think>` boundary must still split reasoning from content.
#[test]
fn test_nemotron_streaming_force_reasoning_without_set_in_reasoning() {
    // Deliberately no set_in_reasoning call on this parser.
    let mut parser = ReasoningParserType::DeepseekR1.get_reasoning_parser();
    let tokens = &["Think", "ing about", " this", ".\n\n", "</think>", "Four"];

    let mut reasoning_seen = String::new();
    let mut content_seen = String::new();
    tokens.iter().for_each(|chunk| {
        let parsed = parser.parse_reasoning_streaming_incremental(chunk, &[]);
        reasoning_seen.push_str(&parsed.reasoning_text);
        content_seen.push_str(&parsed.normal_text);
    });

    assert_eq!(reasoning_seen, "Thinking about this.\n\n");
    assert_eq!(content_seen, "Four");
}
// Scenario 3: the closing `</think>` tag arrives split across several chunks
// ("</", "think", ">"). Because '<' is also a prefix of '<think>', a parser
// that buffers possible start tags could mishandle the end tag; the boundary
// must be found regardless.
// NOTE(review): the original note ties the prefix-buffering risk to
// stripped_think_start being false, yet this test calls
// set_in_reasoning(true) — confirm this is the configuration intended.
#[test]
fn test_nemotron_streaming_split_end_think_tokens() {
    let mut parser = ReasoningParserType::DeepseekR1.get_reasoning_parser();
    parser.set_in_reasoning(true);

    // Chunked stream in which the end tag is spread over three pieces.
    let tokens = &[
        "reason", "ing", " done", ".", "</", "think", ">", "Hello", " world",
    ];

    let (reasoning_seen, content_seen) = tokens.iter().fold(
        (String::new(), String::new()),
        |(mut reasoning_acc, mut content_acc), chunk| {
            let parsed = parser.parse_reasoning_streaming_incremental(chunk, &[]);
            reasoning_acc.push_str(&parsed.reasoning_text);
            content_acc.push_str(&parsed.normal_text);
            (reasoning_acc, content_acc)
        },
    );

    assert_eq!(reasoning_seen, "reasoning done.");
    assert_eq!(content_seen, "Hello world");
}
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment