"docs/components/vscode:/vscode.git/clone" did not exist on "fce8bbc2508c1be0156841dce5c2e8bf25764c77"
Unverified commit a9e06960, authored by MatejKosec, committed by GitHub
Browse files

fix(api): preserve interleaved reasoning order for KV cache correctness (#6442)


Signed-off-by: Matej Kosec <mkosec@nvidia.com>
parent 5277fb9b
......@@ -466,6 +466,50 @@ pub struct ChatCompletionRequestAssistantMessageAudio {
pub id: String,
}
/// Reasoning content carried over from a previous assistant turn.
///
/// The enum is `#[serde(untagged)]`, so the JSON carries no variant tag and the variant is
/// selected purely by the shape of the value:
/// - A plain string: `"reasoning_content": "thinking..."` -> `Text("thinking...")`
/// - An array of strings: `"reasoning_content": ["seg1", "seg2"]` -> `Segments(["seg1", "seg2"])`
///
/// Serialization mirrors this: `Text` is written as a JSON string and `Segments` as a JSON
/// array of strings.
///
/// The `Segments` variant preserves interleaved reasoning order needed for KV-cache-correct
/// context reconstruction. By convention `segments[i]` is the reasoning that preceded
/// `tool_calls[i]`, and `segments[tool_calls.len()]` is any trailing reasoning after the last
/// tool call, so producers are expected to uphold
/// `segments.len() == tool_calls.len() + 1`.
///
/// NOTE(review): the derived `Deserialize` accepts an array of any length; nothing in this
/// type checks the length relationship against `tool_calls` — confirm whether callers
/// validate it before relying on indexed access.
#[derive(ToSchema, Serialize, Deserialize, Clone, Debug, PartialEq)]
#[serde(untagged)]
pub enum ReasoningContent {
/// Flat string — a single reasoning block, also the legacy backward-compatible form.
Text(String),
/// Interleaved segments. `segments[i]` precedes `tool_calls[i]`;
/// `segments[N]` (with `N == tool_calls.len()`) is trailing reasoning after the last
/// tool call. Individual entries may be empty strings.
Segments(Vec<String>),
}
impl ReasoningContent {
    /// Collapse the reasoning into a single flat string.
    ///
    /// - `Text` is returned as-is (cloned).
    /// - `Segments` are joined with `"\n"`, skipping empty entries so that positions
    ///   without reasoning do not produce blank lines. All-empty (or no) segments yield
    ///   an empty string.
    pub fn to_flat_string(&self) -> String {
        match self {
            ReasoningContent::Text(text) => text.clone(),
            // Borrow each segment as `&str`: `join` only needs to read them, so cloning
            // every segment into a temporary `Vec<String>` would be wasted allocation.
            ReasoningContent::Segments(segs) => segs
                .iter()
                .map(String::as_str)
                .filter(|seg| !seg.is_empty())
                .collect::<Vec<_>>()
                .join("\n"),
        }
    }

    /// Borrow the per-position segments.
    ///
    /// Returns `Some(&[..])` for the `Segments` variant (empty entries included, order
    /// preserved) and `None` for the flat `Text` variant.
    pub fn segments(&self) -> Option<&[String]> {
        match self {
            ReasoningContent::Segments(segs) => Some(segs.as_slice()),
            ReasoningContent::Text(_) => None,
        }
    }
}
#[derive(ToSchema, Debug, Serialize, Deserialize, Default, Clone, Builder, PartialEq)]
#[builder(name = "ChatCompletionRequestAssistantMessageArgs")]
#[builder(pattern = "mutable")]
......@@ -476,10 +520,13 @@ pub struct ChatCompletionRequestAssistantMessage {
/// The contents of the assistant message. Required unless `tool_calls` or `function_call` is specified.
#[serde(skip_serializing_if = "Option::is_none")]
pub content: Option<ChatCompletionRequestAssistantMessageContent>,
/// Optional internal reasoning content from a previous assistant turn.
/// Used by reasoning-capable models that consume prior chain-of-thought-like context.
/// Reasoning content from a previous assistant turn.
///
/// When serialized as a plain string, represents a flat reasoning block (backward-compatible
/// with Jinja chat templates). When serialized as an array of strings, represents
/// interleaved reasoning segments preserving per-position order for KV cache correctness.
#[serde(skip_serializing_if = "Option::is_none")]
pub reasoning_content: Option<String>,
pub reasoning_content: Option<ReasoningContent>,
/// The refusal message by the assistant.
#[serde(skip_serializing_if = "Option::is_none")]
pub refusal: Option<String>,
......@@ -1280,7 +1327,7 @@ mod tests {
}
#[test]
fn test_assistant_request_reasoning_content_roundtrip() {
fn test_assistant_request_reasoning_content_text_roundtrip() {
let json = r#"{
"model": "deepseek-v3.2",
"messages": [
......@@ -1306,7 +1353,26 @@ mod tests {
_ => panic!("expected assistant message"),
};
assert_eq!(assistant.reasoning_content.as_deref(), Some("thinking..."));
assert_eq!(
assistant.reasoning_content,
Some(ReasoningContent::Text("thinking...".into()))
);
assert_eq!(
assistant
.reasoning_content
.as_ref()
.unwrap()
.to_flat_string(),
"thinking..."
);
assert!(
assistant
.reasoning_content
.as_ref()
.unwrap()
.segments()
.is_none()
);
let serialized = serde_json::to_value(&request).unwrap();
assert_eq!(
......@@ -1314,4 +1380,63 @@ mod tests {
serde_json::Value::String("thinking...".to_string())
);
}
/// Array-form `reasoning_content` must deserialize into `Segments`, expose every entry
/// (including the empty trailing one), flatten without blank lines, and serialize back
/// to the same JSON array.
#[test]
fn test_assistant_request_reasoning_content_segments_roundtrip() {
    // Two tool calls, hence three segments: one before each call plus the trailing slot.
    let json = r#"{
"model": "deepseek-v3.2",
"messages": [
{"role": "user", "content": "test"},
{
"role": "assistant",
"reasoning_content": ["seg1", "seg2", ""],
"tool_calls": [{
"id": "call_1",
"type": "function",
"function": {"name": "f1", "arguments": "{}"}
}, {
"id": "call_2",
"type": "function",
"function": {"name": "f2", "arguments": "{}"}
}]
}
]
}"#;

    let request: CreateChatCompletionRequest = serde_json::from_str(json).unwrap();
    let ChatCompletionRequestMessage::Assistant(assistant) = &request.messages[1] else {
        panic!("expected assistant message");
    };

    // Parsed form keeps all three entries, empty trailing segment included.
    let expected = ReasoningContent::Segments(vec!["seg1".into(), "seg2".into(), "".into()]);
    assert_eq!(assistant.reasoning_content, Some(expected));

    let reasoning = assistant.reasoning_content.as_ref().unwrap();
    assert_eq!(reasoning.to_flat_string(), "seg1\nseg2");

    let segs = reasoning.segments().expect("should be Segments");
    assert_eq!(segs.len(), 3);

    // Serializing the whole request must reproduce the array, not a flattened string.
    let serialized = serde_json::to_value(&request).unwrap();
    let reasoning_json = &serialized["messages"][1]["reasoning_content"];
    assert_eq!(*reasoning_json, serde_json::json!(["seg1", "seg2", ""]));
}
}
......@@ -187,7 +187,9 @@ async fn main_loop(
let assistant_message = dynamo_async_openai::types::ChatCompletionRequestMessage::Assistant(
dynamo_async_openai::types::ChatCompletionRequestAssistantMessage {
content: Some(assistant_content),
reasoning_content: (!assistant_reasoning.is_empty()).then_some(assistant_reasoning),
reasoning_content: (!assistant_reasoning.is_empty()).then_some(
dynamo_async_openai::types::ReasoningContent::Text(assistant_reasoning),
),
..Default::default()
},
);
......
......@@ -275,13 +275,42 @@ fn render_message(
// Handle reasoning content
// NOTE: If this assistant comes after last user message, the opening <think>
// was already added in the user message. We only need to add content and closing tag.
//
// Handle reasoning_content which may be a plain string or an array of segments.
// DeepSeek V3.2 always places its <think> block before all tool calls, so
// joining segments produces the correct flat form here.
if thinking_mode == ThinkingMode::Thinking
&& last_user_idx.is_some_and(|idx| index > idx)
&& let Some(reasoning) = msg.get("reasoning_content").and_then(|r| r.as_str())
{
// DON'T add THINKING_START - it was already added in user message
prompt.push_str(reasoning);
prompt.push_str(tokens::THINKING_END);
let reasoning = msg.get("reasoning_content").and_then(|v| match v {
serde_json::Value::String(s) => {
if s.is_empty() {
None
} else {
Some(s.clone())
}
}
serde_json::Value::Array(arr) => {
let joined = arr
.iter()
.filter_map(|v| v.as_str())
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join("\n");
if joined.is_empty() {
None
} else {
Some(joined)
}
}
_ => None,
});
if let Some(reasoning) = reasoning {
// DON'T add THINKING_START - it was already added in user message
prompt.push_str(&reasoning);
prompt.push_str(tokens::THINKING_END);
}
}
// Handle content
......
......@@ -13,7 +13,7 @@ use dynamo_async_openai::types::{
ChatCompletionRequestSystemMessageContent, ChatCompletionRequestToolMessage,
ChatCompletionRequestToolMessageContent, ChatCompletionRequestUserMessage,
ChatCompletionRequestUserMessageContent, ChatCompletionTool, ChatCompletionToolChoiceOption,
ChatCompletionToolType, FunctionName, FunctionObject,
ChatCompletionToolType, FunctionName, FunctionObject, ReasoningContent,
};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
......@@ -557,6 +557,7 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
// Assistant with plain text
(AnthropicRole::Assistant, AnthropicMessageContent::Text { content }) => {
messages.push(ChatCompletionRequestMessage::Assistant(
#[allow(deprecated)]
ChatCompletionRequestAssistantMessage {
content: Some(ChatCompletionRequestAssistantMessageContent::Text(
content.clone(),
......@@ -566,7 +567,6 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
name: None,
audio: None,
tool_calls: None,
#[allow(deprecated)]
function_call: None,
},
));
......@@ -685,15 +685,33 @@ fn convert_user_blocks(
}
/// Convert assistant-role content blocks into chat completion messages.
/// Text blocks become an assistant message; tool_use blocks become tool_calls on an assistant message.
/// Thinking blocks are passed through as `reasoning_content`.
///
/// Text blocks become an assistant message; tool_use blocks become tool_calls on an assistant
/// message. Thinking blocks are preserved via `reasoning_content: Option<ReasoningContent>`:
///
/// - `ReasoningContent::Text(s)`: flat reasoning string (no tool calls present).
/// - `ReasoningContent::Segments(segs)`: one entry **per position** in the interleaved sequence,
/// enabling chat templates to reconstruct the exact token order:
/// `<think>segments[0]</think><call>tc[0]</call><think>segments[1]</think><call>tc[1]</call>…<think>segments[N]</think>`
/// - `segments[i]` is the thinking that immediately preceded `tool_calls[i]`
/// - `segments[tool_calls.len()]` is any trailing thinking after the last tool call
/// - `segments.len() == tool_calls.len() + 1` always
/// - Individual entries may be empty strings (no reasoning at that position)
/// - `None` when there is no reasoning content at all.
///
/// Preserving the original interleaved order is required for KV cache correctness: a prompt
/// reconstructed from a flattened `reasoning_content` will differ token-by-token from the
/// original assistant turn, causing a cache miss on every multi-tool exchange.
fn convert_assistant_blocks(
blocks: &[AnthropicContentBlock],
messages: &mut Vec<ChatCompletionRequestMessage>,
) {
let mut text_content = String::new();
let mut thinking_content = String::new();
let mut tool_calls = Vec::new();
// Reasoning segments in block order: segments[i] precedes tool_calls[i]; one extra trailing entry is pushed after the loop.
let mut segments: Vec<String> = Vec::new();
// Accumulates thinking text until the next tool_use block (or end of blocks).
let mut pending_reasoning = String::new();
for block in blocks {
match block {
......@@ -701,12 +719,14 @@ fn convert_assistant_blocks(
text_content.push_str(text);
}
AnthropicContentBlock::Thinking { thinking, .. } => {
if !thinking_content.is_empty() {
thinking_content.push('\n');
if !pending_reasoning.is_empty() {
pending_reasoning.push('\n');
}
thinking_content.push_str(thinking);
pending_reasoning.push_str(thinking);
}
AnthropicContentBlock::ToolUse { id, name, input } => {
// Snapshot the reasoning that preceded this tool call.
segments.push(std::mem::take(&mut pending_reasoning));
tool_calls.push(ChatCompletionMessageToolCall {
id: id.clone(),
r#type: ChatCompletionToolType::Function,
......@@ -720,6 +740,11 @@ fn convert_assistant_blocks(
}
}
// Append any trailing reasoning (after the last tool call) as the final segment.
// This makes segments.len() == tool_calls.len() + 1, preserving the full interleaved
// order including reasoning that follows the last tool call.
segments.push(std::mem::take(&mut pending_reasoning));
let content = if text_content.is_empty() {
None
} else {
......@@ -728,10 +753,25 @@ fn convert_assistant_blocks(
))
};
let reasoning = if thinking_content.is_empty() {
None
// Produce a single ReasoningContent value:
// - Segments variant when there are tool calls and at least one segment is non-empty
// (genuine interleaving present).
// - Text variant when there's reasoning but no tool calls (flat form).
// - None when there's no reasoning at all.
let reasoning_content = if !tool_calls.is_empty() && segments.iter().any(|s| !s.is_empty()) {
Some(ReasoningContent::Segments(segments))
} else {
Some(thinking_content)
let flat: String = segments
.iter()
.filter(|s| !s.is_empty())
.cloned()
.collect::<Vec<_>>()
.join("\n");
if flat.is_empty() {
None
} else {
Some(ReasoningContent::Text(flat))
}
};
let tc = if tool_calls.is_empty() {
......@@ -743,7 +783,7 @@ fn convert_assistant_blocks(
messages.push(ChatCompletionRequestMessage::Assistant(
ChatCompletionRequestAssistantMessage {
content,
reasoning_content: reasoning,
reasoning_content,
refusal: None,
name: None,
audio: None,
......@@ -1339,7 +1379,10 @@ mod tests {
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
match &chat_req.inner.messages[0] {
ChatCompletionRequestMessage::Assistant(a) => {
assert_eq!(a.reasoning_content.as_deref(), Some("I should think..."));
assert_eq!(
a.reasoning_content,
Some(ReasoningContent::Text("I should think...".into()))
);
match &a.content {
Some(ChatCompletionRequestAssistantMessageContent::Text(t)) => {
assert_eq!(t, "Answer");
......@@ -1476,4 +1519,261 @@ mod tests {
// "Hello, world! This is a test message." (37) + "You are helpful." (16) + role (4) = 57 / 3 = 19
assert_eq!(tokens, 19);
}
// --- ReasoningContent enum tests ---
fn make_req(blocks: Vec<AnthropicContentBlock>) -> ChatCompletionRequestAssistantMessage {
let req = AnthropicCreateMessageRequest {
model: "test-model".into(),
max_tokens: 100,
messages: vec![AnthropicMessage {
role: AnthropicRole::Assistant,
content: AnthropicMessageContent::Blocks { content: blocks },
}],
system: None,
temperature: None,
top_p: None,
top_k: None,
stop_sequences: None,
stream: false,
metadata: None,
tools: None,
tool_choice: None,
};
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
match chat_req.inner.messages.into_iter().next().unwrap() {
ChatCompletionRequestMessage::Assistant(a) => a,
other => panic!("expected assistant, got {other:?}"),
}
}
/// Test helper: builds a `ToolUse` content block with the given `id`, the fixed
/// function name `"fn"`, and an empty JSON object as its input.
fn tool_use(id: &str) -> AnthropicContentBlock {
AnthropicContentBlock::ToolUse {
id: id.into(),
name: "fn".into(),
input: serde_json::json!({}),
}
}
/// Test helper: builds a `Thinking` content block carrying `text`, with the fixed
/// placeholder signature `"sig"`.
fn thinking(text: &str) -> AnthropicContentBlock {
AnthropicContentBlock::Thinking {
thinking: text.into(),
signature: "sig".into(),
}
}
/// Alternating thinking / tool_use blocks yield one segment per tool call plus an
/// empty trailing segment, with tool-call order untouched.
#[test]
fn test_interleaved_thinking_and_tool_calls() {
    // Input order: Thinking("A"), ToolUse("t1"), Thinking("B"), ToolUse("t2").
    let blocks = vec![
        thinking("A"),
        tool_use("t1"),
        thinking("B"),
        tool_use("t2"),
    ];
    let msg = make_req(blocks);

    let reasoning = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set");
    let segs = reasoning.segments().expect("should be Segments variant");

    // tool_calls.len() + 1 entries; the last one is empty (no trailing reasoning).
    assert_eq!(segs, ["A", "B", ""]);
    assert_eq!(reasoning.to_flat_string(), "A\nB");

    let tcs = msg.tool_calls.as_ref().expect("tool_calls should be set");
    assert_eq!(tcs.len(), 2);
    assert_eq!(tcs[0].id, "t1");
    assert_eq!(tcs[1].id, "t2");
}
/// Thinking that follows the last tool call must land in the final (trailing) segment.
#[test]
fn test_trailing_reasoning_preserved_in_segments() {
    // Input order: Thinking("A"), ToolUse("t1"), Thinking("B").
    let blocks = vec![thinking("A"), tool_use("t1"), thinking("B")];
    let msg = make_req(blocks);

    let reasoning = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set");
    let segs = reasoning.segments().expect("should be Segments variant");

    // One tool call + one trailing slot; "B" is the trailing reasoning.
    assert_eq!(segs, ["A", "B"]);
    assert_eq!(reasoning.to_flat_string(), "A\nB");
}
/// A tool call with no preceding thinking gets an empty segment at its position.
#[test]
fn test_tool_use_before_thinking() {
    // Input order: ToolUse("t1"), Thinking("A"), ToolUse("t2").
    let blocks = vec![tool_use("t1"), thinking("A"), tool_use("t2")];
    let msg = make_req(blocks);

    let reasoning = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set");
    let segs = reasoning.segments().expect("should be Segments variant");

    // Nothing before t1, "A" before t2, nothing trailing.
    assert_eq!(segs, ["", "A", ""]);
    assert_eq!(reasoning.to_flat_string(), "A");
}
/// Consecutive thinking blocks merge (newline-joined) into the segment of the next tool
/// call; later tool calls without thinking get empty segments.
#[test]
fn test_all_thinking_then_all_tools() {
    // Input order: Thinking("A"), Thinking("B"), ToolUse("t1"), ToolUse("t2").
    let blocks = vec![
        thinking("A"),
        thinking("B"),
        tool_use("t1"),
        tool_use("t2"),
    ];
    let msg = make_req(blocks);

    let reasoning = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set");
    let segs = reasoning.segments().expect("should be Segments variant");

    // All reasoning sits in front of the first tool call.
    assert_eq!(segs, ["A\nB", "", ""]);
    assert_eq!(reasoning.to_flat_string(), "A\nB");
}
/// Tool calls without any thinking blocks must leave `reasoning_content` unset rather
/// than emitting a list of empty segments.
#[test]
fn test_tool_calls_no_thinking_produces_no_segments() {
    // Input order: ToolUse("t1"), ToolUse("t2") — every segment would be empty.
    let blocks = vec![tool_use("t1"), tool_use("t2")];
    let reasoning = make_req(blocks).reasoning_content;
    assert!(
        reasoning.is_none(),
        "no reasoning means no reasoning_content"
    );
}
/// With no tool calls the reasoning is emitted in the flat `Text` form, and the text
/// block still becomes the message content.
#[test]
fn test_thinking_only_no_tools_produces_text_variant() {
    // Input order: Thinking("A"), Text("answer").
    let answer_block = AnthropicContentBlock::Text {
        text: "answer".into(),
    };
    let msg = make_req(vec![thinking("A"), answer_block]);

    let expected_reasoning = ReasoningContent::Text("A".into());
    assert_eq!(msg.reasoning_content, Some(expected_reasoning));

    let reasoning = msg.reasoning_content.as_ref().unwrap();
    assert!(reasoning.segments().is_none());

    let content_is_answer = matches!(
        &msg.content,
        Some(ChatCompletionRequestAssistantMessageContent::Text(t)) if t == "answer"
    );
    assert!(content_is_answer);
}
/// The minimal interleaved case: one thinking block followed by one tool call.
#[test]
fn test_single_thinking_then_single_tool() {
    // Input order: Thinking("reason"), ToolUse("t1") -> Segments(["reason", ""]).
    let blocks = vec![thinking("reason"), tool_use("t1")];
    let msg = make_req(blocks);

    let reasoning = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set");
    let segs = reasoning.segments().expect("should be Segments variant");

    assert_eq!(segs, ["reason", ""]);
    assert_eq!(reasoning.to_flat_string(), "reason");
}
// Regression guard for the KV-cache flattening bug.
//
// Before the fix, `convert_assistant_blocks` merged every thinking block into one flat
// string, i.e. `reasoning_content = Text("A\nB")`. From that alone a chat template can
// only rebuild:
//
//     <think>A\nB</think> <call>t1</call> <call>t2</call>
//
// which departs from the originally generated token sequence at the first `</think>`,
// so the KV cache misses on every multi-tool exchange.
//
// After the fix, the conversion yields `Segments(["A", "B", ""])`, letting a
// segment-aware template rebuild the original order:
//
//     <think>A</think> <call>t1</call> <think>B</think> <call>t2</call>
//
// On the old behaviour this test fails: the value is `Text("A\nB")`, `.segments()` is
// `None`, and the `expect` below panics.
#[test]
fn test_interleaved_reasoning_not_flattened_regression() {
    let blocks = vec![
        thinking("A"),
        tool_use("t1"),
        thinking("B"),
        tool_use("t2"),
    ];
    let msg = make_req(blocks);

    // Flat `Text` is the old (broken) shape: it cannot say which reasoning block
    // preceded which tool call.
    let is_flat_text = matches!(msg.reasoning_content, Some(ReasoningContent::Text(_)));
    assert!(
        !is_flat_text,
        "reasoning_content must NOT be flat Text when tool calls are interleaved; \
         Text loses positional info and forces a KV cache miss on every multi-tool turn"
    );

    let reasoning = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set");
    let segs = reasoning.segments().expect(
        "must be Segments so a chat template can reconstruct \
         <think>A</think><call>t1</call><think>B</think><call>t2</call> \
         rather than front-loading all reasoning before all calls",
    );

    // segs[i] precedes tool_calls[i] — the positional contract templates depend on.
    let expected_segments = [
        ("A", "reasoning before t1"),
        ("B", "reasoning before t2"),
        ("", "no trailing reasoning"),
    ];
    for (idx, (want, why)) in expected_segments.into_iter().enumerate() {
        assert_eq!(segs[idx], want, "{why}");
    }

    let tools = msg.tool_calls.as_ref().unwrap();
    assert_eq!(tools[0].id, "t1");
    assert_eq!(tools[1].id, "t2");
}
}
......@@ -321,6 +321,76 @@ fn test_reasoning_content_survives_chat_request_parsing_and_rendering() {
assert!(rendered.contains("</think>"));
}
// Regression test for the KV-cache flattening bug.
//
// Models like GLM-5 and Qwen3 (Pattern A) emit interleaved thinking:
//
//     <think>A</think> <call>t1</call> <think>B</think> <call>t2</call>
//
// `convert_assistant_blocks` serialises this as a JSON *array*:
//
//     "reasoning_content": ["A", "B", ""]
//
// With `reasoning_content: Option<String>` a JSON array fails to deserialise, so this
// test would panic at the first `.unwrap()`. With `Option<ReasoningContent>` both the
// string and the array form are accepted and the array round-trips unchanged.
#[test]
fn test_reasoning_segments_roundtrip_through_parse_and_render() {
    // Shape produced for an interleaved turn:
    // [Think("A"), Tool(t1), Think("B"), Tool(t2)] -> segments = ["A", "B", ""]
    let json = r#"{
"model": "glm-5",
"messages": [
{"role": "user", "content": "call two tools"},
{
"role": "assistant",
"reasoning_content": ["A", "B", ""],
"tool_calls": [
{"id": "t1", "type": "function", "function": {"name": "f1", "arguments": "{}"}},
{"id": "t2", "type": "function", "function": {"name": "f2", "arguments": "{}"}}
]
},
{"role": "tool", "tool_call_id": "t1", "content": "r1"},
{"role": "tool", "tool_call_id": "t2", "content": "r2"}
]
}"#;

    // Parsing must accept the array form.
    let request: NvCreateChatCompletionRequest = serde_json::from_str(json).unwrap();

    // Segments must survive the round-trip through serde_json.
    let messages_json = serde_json::to_value(request.messages()).unwrap();
    assert!(
        messages_json[1]["reasoning_content"].is_array(),
        "reasoning_content must serialise as a JSON array to preserve positional info; \
         a string would lose which reasoning preceded which tool call"
    );
    let segs = messages_json[1]["reasoning_content"].as_array().unwrap();
    assert_eq!(segs.len(), 3);
    assert_eq!(segs[0].as_str().unwrap(), "A"); // precedes t1
    assert_eq!(segs[1].as_str().unwrap(), "B"); // precedes t2
    assert_eq!(segs[2].as_str().unwrap(), ""); // no trailing reasoning

    // The formatter must not drop the reasoning content when segments are used.
    // (DeepSeek V3.2 joins segments into one <think> block; this confirms the
    // content is not silently discarded.)
    let formatter =
        dynamo_llm::preprocessor::prompt::deepseek_v32::DeepSeekV32Formatter::new_thinking();
    let rendered = formatter.render(&request).unwrap();
    assert!(
        rendered.contains("A"),
        "segment A must appear in rendered output"
    );
    assert!(
        rendered.contains("B"),
        "segment B must appear in rendered output"
    );
    // Single-letter substring checks can be satisfied by unrelated template text, so
    // also require the exact joined form: the DeepSeek V3.2 renderer drops empty
    // segments and joins the rest with a newline.
    assert!(
        rendered.contains("A\nB"),
        "non-empty segments must be rendered as one newline-joined reasoning block"
    );
    assert!(rendered.contains("<think>"));
    assert!(rendered.contains("</think>"));
}
#[test]
fn test_tool_call_formatting() {
let messages = serde_json::json!([
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment