Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
a9e06960
Unverified
Commit
a9e06960
authored
Feb 23, 2026
by
MatejKosec
Committed by
GitHub
Feb 24, 2026
Browse files
fix(api): preserve interleaved reasoning order for KV cache correctness (#6442)
Signed-off-by:
Matej Kosec
<
mkosec@nvidia.com
>
parent
5277fb9b
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
549 additions
and
23 deletions
+549
-23
lib/async-openai/src/types/chat.rs
lib/async-openai/src/types/chat.rs
+130
-5
lib/llm/src/entrypoint/input/text.rs
lib/llm/src/entrypoint/input/text.rs
+3
-1
lib/llm/src/preprocessor/prompt/deepseek_v32.rs
lib/llm/src/preprocessor/prompt/deepseek_v32.rs
+33
-4
lib/llm/src/protocols/anthropic/types.rs
lib/llm/src/protocols/anthropic/types.rs
+313
-13
lib/llm/tests/deepseek_v32_encoding.rs
lib/llm/tests/deepseek_v32_encoding.rs
+70
-0
No files found.
lib/async-openai/src/types/chat.rs
View file @
a9e06960
...
@@ -466,6 +466,50 @@ pub struct ChatCompletionRequestAssistantMessageAudio {
...
@@ -466,6 +466,50 @@ pub struct ChatCompletionRequestAssistantMessageAudio {
pub
id
:
String
,
pub
id
:
String
,
}
}
/// Reasoning content from a previous assistant turn.
///
/// This enum is serialized untagged, so the JSON shape alone selects the variant:
/// - A plain string: `"reasoning_content": "thinking..."` -> `Text("thinking...")`
/// - An array of strings: `"reasoning_content": ["seg1", "seg2"]` -> `Segments(["seg1", "seg2"])`
///
/// The `Segments` variant preserves interleaved reasoning order needed for KV cache–correct
/// context reconstruction. By convention `segments[i]` is the reasoning that preceded
/// `tool_calls[i]`, and `segments[tool_calls.len()]` is any trailing reasoning after the
/// last tool call, so `segments.len() == tool_calls.len() + 1`.
///
/// NOTE: the type itself does not check that length relationship; it is a contract that
/// producers of `Segments` are expected to uphold.
#[derive(ToSchema, Serialize, Deserialize, Clone, Debug, PartialEq)]
#[serde(untagged)]
pub enum ReasoningContent {
    /// Flat string — single reasoning block or legacy backward-compat form.
    Text(String),
    /// Interleaved segments. `segments[i]` precedes `tool_calls[i]`;
    /// `segments[N]` (the last entry) is trailing reasoning after the last tool call.
    /// Expected shape: `segments.len() == tool_calls.len() + 1`.
    Segments(Vec<String>),
}
impl ReasoningContent {
    /// Collapse the reasoning into a single flat string.
    ///
    /// `Text` is returned as-is. For `Segments`, empty entries are skipped and the
    /// remaining entries are joined with a single `\n`; positional information is
    /// therefore not recoverable from the result.
    pub fn to_flat_string(&self) -> String {
        match self {
            ReasoningContent::Text(s) => s.clone(),
            // Borrow the surviving segments rather than cloning each one: the `join`
            // below is the only allocation of string data that is actually required.
            ReasoningContent::Segments(segs) => segs
                .iter()
                .filter(|s| !s.is_empty())
                .map(String::as_str)
                .collect::<Vec<_>>()
                .join("\n"),
        }
    }

    /// Borrow the positional segments.
    ///
    /// Returns `Some` with the segment slice for the `Segments` variant and `None`
    /// for `Text`.
    pub fn segments(&self) -> Option<&[String]> {
        match self {
            ReasoningContent::Segments(segs) => Some(segs.as_slice()),
            ReasoningContent::Text(_) => None,
        }
    }
}
#[derive(ToSchema,
Debug,
Serialize,
Deserialize,
Default,
Clone,
Builder,
PartialEq)]
#[derive(ToSchema,
Debug,
Serialize,
Deserialize,
Default,
Clone,
Builder,
PartialEq)]
#[builder(name
=
"ChatCompletionRequestAssistantMessageArgs"
)]
#[builder(name
=
"ChatCompletionRequestAssistantMessageArgs"
)]
#[builder(pattern
=
"mutable"
)]
#[builder(pattern
=
"mutable"
)]
...
@@ -476,10 +520,13 @@ pub struct ChatCompletionRequestAssistantMessage {
...
@@ -476,10 +520,13 @@ pub struct ChatCompletionRequestAssistantMessage {
/// The contents of the assistant message. Required unless `tool_calls` or `function_call` is specified.
/// The contents of the assistant message. Required unless `tool_calls` or `function_call` is specified.
#[serde(skip_serializing_if
=
"Option::is_none"
)]
#[serde(skip_serializing_if
=
"Option::is_none"
)]
pub
content
:
Option
<
ChatCompletionRequestAssistantMessageContent
>
,
pub
content
:
Option
<
ChatCompletionRequestAssistantMessageContent
>
,
/// Optional internal reasoning content from a previous assistant turn.
/// Reasoning content from a previous assistant turn.
/// Used by reasoning-capable models that consume prior chain-of-thought-like context.
///
/// When serialized as a plain string, represents a flat reasoning block (backward-compatible
/// with Jinja chat templates). When serialized as an array of strings, represents
/// interleaved reasoning segments preserving per-position order for KV cache correctness.
#[serde(skip_serializing_if
=
"Option::is_none"
)]
#[serde(skip_serializing_if
=
"Option::is_none"
)]
pub
reasoning_content
:
Option
<
String
>
,
pub
reasoning_content
:
Option
<
ReasoningContent
>
,
/// The refusal message by the assistant.
/// The refusal message by the assistant.
#[serde(skip_serializing_if
=
"Option::is_none"
)]
#[serde(skip_serializing_if
=
"Option::is_none"
)]
pub
refusal
:
Option
<
String
>
,
pub
refusal
:
Option
<
String
>
,
...
@@ -1280,7 +1327,7 @@ mod tests {
...
@@ -1280,7 +1327,7 @@ mod tests {
}
}
#[test]
#[test]
fn
test_assistant_request_reasoning_content_roundtrip
()
{
fn
test_assistant_request_reasoning_content_
text_
roundtrip
()
{
let
json
=
r#"{
let
json
=
r#"{
"model": "deepseek-v3.2",
"model": "deepseek-v3.2",
"messages": [
"messages": [
...
@@ -1306,7 +1353,26 @@ mod tests {
...
@@ -1306,7 +1353,26 @@ mod tests {
_
=>
panic!
(
"expected assistant message"
),
_
=>
panic!
(
"expected assistant message"
),
};
};
assert_eq!
(
assistant
.reasoning_content
.as_deref
(),
Some
(
"thinking..."
));
assert_eq!
(
assistant
.reasoning_content
,
Some
(
ReasoningContent
::
Text
(
"thinking..."
.into
()))
);
assert_eq!
(
assistant
.reasoning_content
.as_ref
()
.unwrap
()
.to_flat_string
(),
"thinking..."
);
assert
!
(
assistant
.reasoning_content
.as_ref
()
.unwrap
()
.segments
()
.is_none
()
);
let
serialized
=
serde_json
::
to_value
(
&
request
)
.unwrap
();
let
serialized
=
serde_json
::
to_value
(
&
request
)
.unwrap
();
assert_eq!
(
assert_eq!
(
...
@@ -1314,4 +1380,63 @@ mod tests {
...
@@ -1314,4 +1380,63 @@ mod tests {
serde_json
::
Value
::
String
(
"thinking..."
.to_string
())
serde_json
::
Value
::
String
(
"thinking..."
.to_string
())
);
);
}
}
// Array-form `reasoning_content` must parse into the `Segments` variant, flatten without
// its empty trailing entry, and serialize back to the same JSON array.
#[test]
fn test_assistant_request_reasoning_content_segments_roundtrip() {
    let json = r#"{
        "model": "deepseek-v3.2",
        "messages": [
            {"role": "user", "content": "test"},
            {
                "role": "assistant",
                "reasoning_content": ["seg1", "seg2", ""],
                "tool_calls": [{
                    "id": "call_1",
                    "type": "function",
                    "function": {"name": "f1", "arguments": "{}"}
                }, {
                    "id": "call_2",
                    "type": "function",
                    "function": {"name": "f2", "arguments": "{}"}
                }]
            }
        ]
    }"#;

    let request: CreateChatCompletionRequest = serde_json::from_str(json).unwrap();
    let assistant = match &request.messages[1] {
        ChatCompletionRequestMessage::Assistant(msg) => msg,
        _ => panic!("expected assistant message"),
    };

    // Parsed form: all three entries survive, including the empty trailing slot.
    let expected = ReasoningContent::Segments(vec!["seg1".into(), "seg2".into(), "".into()]);
    assert_eq!(assistant.reasoning_content, Some(expected));

    // Flattening drops the empty entry; the positional view keeps it.
    let reasoning = assistant.reasoning_content.as_ref().unwrap();
    assert_eq!(reasoning.to_flat_string(), "seg1\nseg2");
    let segs = reasoning.segments().expect("should be Segments");
    assert_eq!(segs.len(), 3);

    // Serialized form is the original array, not a flattened string.
    let serialized = serde_json::to_value(&request).unwrap();
    assert_eq!(
        serialized["messages"][1]["reasoning_content"],
        serde_json::json!(["seg1", "seg2", ""])
    );
}
}
}
lib/llm/src/entrypoint/input/text.rs
View file @
a9e06960
...
@@ -187,7 +187,9 @@ async fn main_loop(
...
@@ -187,7 +187,9 @@ async fn main_loop(
let
assistant_message
=
dynamo_async_openai
::
types
::
ChatCompletionRequestMessage
::
Assistant
(
let
assistant_message
=
dynamo_async_openai
::
types
::
ChatCompletionRequestMessage
::
Assistant
(
dynamo_async_openai
::
types
::
ChatCompletionRequestAssistantMessage
{
dynamo_async_openai
::
types
::
ChatCompletionRequestAssistantMessage
{
content
:
Some
(
assistant_content
),
content
:
Some
(
assistant_content
),
reasoning_content
:
(
!
assistant_reasoning
.is_empty
())
.then_some
(
assistant_reasoning
),
reasoning_content
:
(
!
assistant_reasoning
.is_empty
())
.then_some
(
dynamo_async_openai
::
types
::
ReasoningContent
::
Text
(
assistant_reasoning
),
),
..
Default
::
default
()
..
Default
::
default
()
},
},
);
);
...
...
lib/llm/src/preprocessor/prompt/deepseek_v32.rs
View file @
a9e06960
...
@@ -275,13 +275,42 @@ fn render_message(
...
@@ -275,13 +275,42 @@ fn render_message(
// Handle reasoning content
// Handle reasoning content
// NOTE: If this assistant comes after last user message, the opening <think>
// NOTE: If this assistant comes after last user message, the opening <think>
// was already added in the user message. We only need to add content and closing tag.
// was already added in the user message. We only need to add content and closing tag.
//
// Handle reasoning_content which may be a plain string or an array of segments.
// DeepSeek V3.2 always places its <think> block before all tool calls, so
// joining segments produces the correct flat form here.
if
thinking_mode
==
ThinkingMode
::
Thinking
if
thinking_mode
==
ThinkingMode
::
Thinking
&&
last_user_idx
.is_some_and
(|
idx
|
index
>
idx
)
&&
last_user_idx
.is_some_and
(|
idx
|
index
>
idx
)
&&
let
Some
(
reasoning
)
=
msg
.get
(
"reasoning_content"
)
.and_then
(|
r
|
r
.as_str
())
{
{
// DON'T add THINKING_START - it was already added in user message
let
reasoning
=
msg
.get
(
"reasoning_content"
)
.and_then
(|
v
|
match
v
{
prompt
.push_str
(
reasoning
);
serde_json
::
Value
::
String
(
s
)
=>
{
prompt
.push_str
(
tokens
::
THINKING_END
);
if
s
.is_empty
()
{
None
}
else
{
Some
(
s
.clone
())
}
}
serde_json
::
Value
::
Array
(
arr
)
=>
{
let
joined
=
arr
.iter
()
.filter_map
(|
v
|
v
.as_str
())
.filter
(|
s
|
!
s
.is_empty
())
.collect
::
<
Vec
<
_
>>
()
.join
(
"
\n
"
);
if
joined
.is_empty
()
{
None
}
else
{
Some
(
joined
)
}
}
_
=>
None
,
});
if
let
Some
(
reasoning
)
=
reasoning
{
// DON'T add THINKING_START - it was already added in user message
prompt
.push_str
(
&
reasoning
);
prompt
.push_str
(
tokens
::
THINKING_END
);
}
}
}
// Handle content
// Handle content
...
...
lib/llm/src/protocols/anthropic/types.rs
View file @
a9e06960
...
@@ -13,7 +13,7 @@ use dynamo_async_openai::types::{
...
@@ -13,7 +13,7 @@ use dynamo_async_openai::types::{
ChatCompletionRequestSystemMessageContent
,
ChatCompletionRequestToolMessage
,
ChatCompletionRequestSystemMessageContent
,
ChatCompletionRequestToolMessage
,
ChatCompletionRequestToolMessageContent
,
ChatCompletionRequestUserMessage
,
ChatCompletionRequestToolMessageContent
,
ChatCompletionRequestUserMessage
,
ChatCompletionRequestUserMessageContent
,
ChatCompletionTool
,
ChatCompletionToolChoiceOption
,
ChatCompletionRequestUserMessageContent
,
ChatCompletionTool
,
ChatCompletionToolChoiceOption
,
ChatCompletionToolType
,
FunctionName
,
FunctionObject
,
ChatCompletionToolType
,
FunctionName
,
FunctionObject
,
ReasoningContent
,
};
};
use
serde
::{
Deserialize
,
Serialize
};
use
serde
::{
Deserialize
,
Serialize
};
use
uuid
::
Uuid
;
use
uuid
::
Uuid
;
...
@@ -557,6 +557,7 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
...
@@ -557,6 +557,7 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
// Assistant with plain text
// Assistant with plain text
(
AnthropicRole
::
Assistant
,
AnthropicMessageContent
::
Text
{
content
})
=>
{
(
AnthropicRole
::
Assistant
,
AnthropicMessageContent
::
Text
{
content
})
=>
{
messages
.push
(
ChatCompletionRequestMessage
::
Assistant
(
messages
.push
(
ChatCompletionRequestMessage
::
Assistant
(
#[allow(deprecated)]
ChatCompletionRequestAssistantMessage
{
ChatCompletionRequestAssistantMessage
{
content
:
Some
(
ChatCompletionRequestAssistantMessageContent
::
Text
(
content
:
Some
(
ChatCompletionRequestAssistantMessageContent
::
Text
(
content
.clone
(),
content
.clone
(),
...
@@ -566,7 +567,6 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
...
@@ -566,7 +567,6 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
name
:
None
,
name
:
None
,
audio
:
None
,
audio
:
None
,
tool_calls
:
None
,
tool_calls
:
None
,
#[allow(deprecated)]
function_call
:
None
,
function_call
:
None
,
},
},
));
));
...
@@ -685,15 +685,33 @@ fn convert_user_blocks(
...
@@ -685,15 +685,33 @@ fn convert_user_blocks(
}
}
/// Convert assistant-role content blocks into chat completion messages.
/// Convert assistant-role content blocks into chat completion messages.
/// Text blocks become an assistant message; tool_use blocks become tool_calls on an assistant message.
///
/// Thinking blocks are passed through as `reasoning_content`.
/// Text blocks become an assistant message; tool_use blocks become tool_calls on an assistant
/// message. Thinking blocks are preserved via `reasoning_content: Option<ReasoningContent>`:
///
/// - `ReasoningContent::Text(s)`: flat reasoning string (no tool calls present).
/// - `ReasoningContent::Segments(segs)`: one entry **per position** in the interleaved sequence,
/// enabling chat templates to reconstruct the exact token order:
/// `<think>segments[0]</think><call>tc[0]</call><think>segments[1]</think><call>tc[1]</call>…<think>segments[N]</think>`
/// - `segments[i]` is the thinking that immediately preceded `tool_calls[i]`
/// - `segments[tool_calls.len()]` is any trailing thinking after the last tool call
/// - `segments.len() == tool_calls.len() + 1` always
/// - Individual entries may be empty strings (no reasoning at that position)
/// - `None` when there is no reasoning content at all.
///
/// Preserving the original interleaved order is required for KV cache correctness: a prompt
/// reconstructed from a flattened `reasoning_content` will differ token-by-token from the
/// original assistant turn, causing a cache miss on every multi-tool exchange.
fn
convert_assistant_blocks
(
fn
convert_assistant_blocks
(
blocks
:
&
[
AnthropicContentBlock
],
blocks
:
&
[
AnthropicContentBlock
],
messages
:
&
mut
Vec
<
ChatCompletionRequestMessage
>
,
messages
:
&
mut
Vec
<
ChatCompletionRequestMessage
>
,
)
{
)
{
let
mut
text_content
=
String
::
new
();
let
mut
text_content
=
String
::
new
();
let
mut
thinking_content
=
String
::
new
();
let
mut
tool_calls
=
Vec
::
new
();
let
mut
tool_calls
=
Vec
::
new
();
// One reasoning segment per tool call — segments[i] precedes tool_calls[i].
let
mut
segments
:
Vec
<
String
>
=
Vec
::
new
();
// Accumulates thinking text until the next tool_use block (or end of blocks).
let
mut
pending_reasoning
=
String
::
new
();
for
block
in
blocks
{
for
block
in
blocks
{
match
block
{
match
block
{
...
@@ -701,12 +719,14 @@ fn convert_assistant_blocks(
...
@@ -701,12 +719,14 @@ fn convert_assistant_blocks(
text_content
.push_str
(
text
);
text_content
.push_str
(
text
);
}
}
AnthropicContentBlock
::
Thinking
{
thinking
,
..
}
=>
{
AnthropicContentBlock
::
Thinking
{
thinking
,
..
}
=>
{
if
!
thinking_content
.is_empty
()
{
if
!
pending_reasoning
.is_empty
()
{
thinking_content
.push
(
'\n'
);
pending_reasoning
.push
(
'\n'
);
}
}
thinking_content
.push_str
(
thinking
);
pending_reasoning
.push_str
(
thinking
);
}
}
AnthropicContentBlock
::
ToolUse
{
id
,
name
,
input
}
=>
{
AnthropicContentBlock
::
ToolUse
{
id
,
name
,
input
}
=>
{
// Snapshot the reasoning that preceded this tool call.
segments
.push
(
std
::
mem
::
take
(
&
mut
pending_reasoning
));
tool_calls
.push
(
ChatCompletionMessageToolCall
{
tool_calls
.push
(
ChatCompletionMessageToolCall
{
id
:
id
.clone
(),
id
:
id
.clone
(),
r
#
type
:
ChatCompletionToolType
::
Function
,
r
#
type
:
ChatCompletionToolType
::
Function
,
...
@@ -720,6 +740,11 @@ fn convert_assistant_blocks(
...
@@ -720,6 +740,11 @@ fn convert_assistant_blocks(
}
}
}
}
// Append any trailing reasoning (after the last tool call) as the final segment.
// This makes segments.len() == tool_calls.len() + 1, preserving the full interleaved
// order including reasoning that follows the last tool call.
segments
.push
(
std
::
mem
::
take
(
&
mut
pending_reasoning
));
let
content
=
if
text_content
.is_empty
()
{
let
content
=
if
text_content
.is_empty
()
{
None
None
}
else
{
}
else
{
...
@@ -728,10 +753,25 @@ fn convert_assistant_blocks(
...
@@ -728,10 +753,25 @@ fn convert_assistant_blocks(
))
))
};
};
let
reasoning
=
if
thinking_content
.is_empty
()
{
// Produce a single ReasoningContent value:
None
// - Segments variant when there are tool calls and at least one segment is non-empty
// (genuine interleaving present).
// - Text variant when there's reasoning but no tool calls (flat form).
// - None when there's no reasoning at all.
let
reasoning_content
=
if
!
tool_calls
.is_empty
()
&&
segments
.iter
()
.any
(|
s
|
!
s
.is_empty
())
{
Some
(
ReasoningContent
::
Segments
(
segments
))
}
else
{
}
else
{
Some
(
thinking_content
)
let
flat
:
String
=
segments
.iter
()
.filter
(|
s
|
!
s
.is_empty
())
.cloned
()
.collect
::
<
Vec
<
_
>>
()
.join
(
"
\n
"
);
if
flat
.is_empty
()
{
None
}
else
{
Some
(
ReasoningContent
::
Text
(
flat
))
}
};
};
let
tc
=
if
tool_calls
.is_empty
()
{
let
tc
=
if
tool_calls
.is_empty
()
{
...
@@ -743,7 +783,7 @@ fn convert_assistant_blocks(
...
@@ -743,7 +783,7 @@ fn convert_assistant_blocks(
messages
.push
(
ChatCompletionRequestMessage
::
Assistant
(
messages
.push
(
ChatCompletionRequestMessage
::
Assistant
(
ChatCompletionRequestAssistantMessage
{
ChatCompletionRequestAssistantMessage
{
content
,
content
,
reasoning_content
:
reasoning
,
reasoning_content
,
refusal
:
None
,
refusal
:
None
,
name
:
None
,
name
:
None
,
audio
:
None
,
audio
:
None
,
...
@@ -1339,7 +1379,10 @@ mod tests {
...
@@ -1339,7 +1379,10 @@ mod tests {
let
chat_req
:
NvCreateChatCompletionRequest
=
req
.try_into
()
.unwrap
();
let
chat_req
:
NvCreateChatCompletionRequest
=
req
.try_into
()
.unwrap
();
match
&
chat_req
.inner.messages
[
0
]
{
match
&
chat_req
.inner.messages
[
0
]
{
ChatCompletionRequestMessage
::
Assistant
(
a
)
=>
{
ChatCompletionRequestMessage
::
Assistant
(
a
)
=>
{
assert_eq!
(
a
.reasoning_content
.as_deref
(),
Some
(
"I should think..."
));
assert_eq!
(
a
.reasoning_content
,
Some
(
ReasoningContent
::
Text
(
"I should think..."
.into
()))
);
match
&
a
.content
{
match
&
a
.content
{
Some
(
ChatCompletionRequestAssistantMessageContent
::
Text
(
t
))
=>
{
Some
(
ChatCompletionRequestAssistantMessageContent
::
Text
(
t
))
=>
{
assert_eq!
(
t
,
"Answer"
);
assert_eq!
(
t
,
"Answer"
);
...
@@ -1476,4 +1519,261 @@ mod tests {
...
@@ -1476,4 +1519,261 @@ mod tests {
// "Hello, world! This is a test message." (37) + "You are helpful." (16) + role (4) = 57 / 3 = 19
// "Hello, world! This is a test message." (37) + "You are helpful." (16) + role (4) = 57 / 3 = 19
assert_eq!
(
tokens
,
19
);
assert_eq!
(
tokens
,
19
);
}
}
// --- ReasoningContent enum tests ---
// Test helper: wraps `blocks` in a single assistant-role Anthropic message, converts the
// request into a chat completion request, and returns the first converted message.
// Panics if the conversion fails or that message is not an assistant message.
fn make_req(blocks: Vec<AnthropicContentBlock>) -> ChatCompletionRequestAssistantMessage {
    let assistant_turn = AnthropicMessage {
        role: AnthropicRole::Assistant,
        content: AnthropicMessageContent::Blocks { content: blocks },
    };
    let req = AnthropicCreateMessageRequest {
        model: "test-model".into(),
        max_tokens: 100,
        messages: vec![assistant_turn],
        system: None,
        temperature: None,
        top_p: None,
        top_k: None,
        stop_sequences: None,
        stream: false,
        metadata: None,
        tools: None,
        tool_choice: None,
    };

    let converted: NvCreateChatCompletionRequest = req.try_into().unwrap();
    let first = converted.inner.messages.into_iter().next().unwrap();
    match first {
        ChatCompletionRequestMessage::Assistant(a) => a,
        other => panic!("expected assistant, got {other:?}"),
    }
}
// Test helper: a `ToolUse` block with the given id, the fixed name "fn" and an empty
// JSON object as input.
fn tool_use(id: &str) -> AnthropicContentBlock {
    let input = serde_json::json!({});
    AnthropicContentBlock::ToolUse {
        id: id.into(),
        name: "fn".into(),
        input,
    }
}
// Test helper: a `Thinking` block carrying `text` and the placeholder signature "sig".
fn thinking(text: &str) -> AnthropicContentBlock {
    let thinking = text.into();
    AnthropicContentBlock::Thinking {
        thinking,
        signature: "sig".into(),
    }
}
#[test]
fn test_interleaved_thinking_and_tool_calls() {
    // Input order: Thinking("A"), ToolUse("t1"), Thinking("B"), ToolUse("t2").
    // Expected: segments = ["A", "B", ""] (empty trailing slot), tool_calls = [t1, t2].
    let blocks = vec![thinking("A"), tool_use("t1"), thinking("B"), tool_use("t2")];
    let msg = make_req(blocks);

    let reasoning = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set");
    let segs = reasoning.segments().expect("should be Segments variant");
    assert_eq!(segs.len(), 3); // tool_calls.len() + 1
    assert_eq!(segs[0], "A");
    assert_eq!(segs[1], "B");
    assert_eq!(segs[2], ""); // nothing after the last tool call
    assert_eq!(reasoning.to_flat_string(), "A\nB");

    let tcs = msg.tool_calls.as_ref().expect("tool_calls should be set");
    assert_eq!(tcs.len(), 2);
    assert_eq!(tcs[0].id, "t1");
    assert_eq!(tcs[1].id, "t2");
}
#[test]
fn test_trailing_reasoning_preserved_in_segments() {
    // Input order: Thinking("A"), ToolUse("t1"), Thinking("B").
    // "B" comes after the only tool call, so it must occupy the trailing slot segments[1].
    let blocks = vec![thinking("A"), tool_use("t1"), thinking("B")];
    let msg = make_req(blocks);

    let reasoning = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set");
    let segs = reasoning.segments().expect("should be Segments variant");
    assert_eq!(segs.len(), 2); // one tool call + the trailing slot
    assert_eq!(segs[0], "A");
    assert_eq!(segs[1], "B"); // trailing reasoning is kept
    assert_eq!(reasoning.to_flat_string(), "A\nB");
}
#[test]
fn test_tool_use_before_thinking() {
    // Input order: ToolUse("t1"), Thinking("A"), ToolUse("t2").
    // Expected: segments = ["", "A", ""] — nothing before t1, "A" before t2, no trailer.
    let blocks = vec![tool_use("t1"), thinking("A"), tool_use("t2")];
    let msg = make_req(blocks);

    let reasoning = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set");
    let segs = reasoning.segments().expect("should be Segments variant");
    assert_eq!(segs.len(), 3);
    assert_eq!(segs[0], ""); // no reasoning ahead of t1
    assert_eq!(segs[1], "A");
    assert_eq!(segs[2], ""); // no trailing reasoning
    assert_eq!(reasoning.to_flat_string(), "A");
}
#[test]
fn test_all_thinking_then_all_tools() {
    // Input order: Thinking("A"), Thinking("B"), ToolUse("t1"), ToolUse("t2").
    // Expected: segments = ["A\nB", "", ""] — both thoughts merge ahead of the first tool.
    let blocks = vec![thinking("A"), thinking("B"), tool_use("t1"), tool_use("t2")];
    let msg = make_req(blocks);

    let reasoning = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set");
    let segs = reasoning.segments().expect("should be Segments variant");
    assert_eq!(segs.len(), 3);
    assert_eq!(segs[0], "A\nB");
    assert_eq!(segs[1], "");
    assert_eq!(segs[2], "");
    assert_eq!(reasoning.to_flat_string(), "A\nB");
}
#[test]
fn test_tool_calls_no_thinking_produces_no_segments() {
    // Input order: ToolUse("t1"), ToolUse("t2").
    // Every segment would be empty, so reasoning_content must be None.
    let blocks = vec![tool_use("t1"), tool_use("t2")];
    let msg = make_req(blocks);

    assert!(
        msg.reasoning_content.is_none(),
        "no reasoning means no reasoning_content"
    );
}
#[test]
fn test_thinking_only_no_tools_produces_text_variant() {
    // Input order: Thinking("A"), Text("answer").
    // Without tool calls the reasoning is expected in the flat `Text` form.
    let answer = AnthropicContentBlock::Text {
        text: "answer".into(),
    };
    let msg = make_req(vec![thinking("A"), answer]);

    assert_eq!(
        msg.reasoning_content,
        Some(ReasoningContent::Text("A".into()))
    );
    assert!(msg.reasoning_content.as_ref().unwrap().segments().is_none());
    assert!(matches!(
        msg.content,
        Some(ChatCompletionRequestAssistantMessageContent::Text(ref t)) if t == "answer"
    ));
}
#[test]
fn test_single_thinking_then_single_tool() {
    // Input order: Thinking("reason"), ToolUse("t1").
    // Expected: Segments(["reason", ""]).
    let blocks = vec![thinking("reason"), tool_use("t1")];
    let msg = make_req(blocks);

    let reasoning = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set");
    let segs = reasoning.segments().expect("should be Segments variant");
    assert_eq!(segs.len(), 2);
    assert_eq!(segs[0], "reason");
    assert_eq!(segs[1], "");
    assert_eq!(reasoning.to_flat_string(), "reason");
}
// Regression guard against flattening interleaved reasoning (the KV-cache bug).
//
// Before the fix, `convert_assistant_blocks` merged every thinking block into one flat
// string, i.e. `reasoning_content = Text("A\nB")`. From that alone a chat template can
// only rebuild:
//
//   <think>A\nB</think> <call>t1</call> <call>t2</call>
//
// which departs from the originally generated tokens at the first `</think>`, so the
// KV cache misses on every multi-tool exchange.
//
// After the fix, the conversion yields `Segments(["A", "B", ""])`, which lets a
// segment-aware template rebuild the original order:
//
//   <think>A</think> <call>t1</call> <think>B</think> <call>t2</call>
//
// Against the old behaviour this test fails: `Text("A\nB")` trips the first assertion,
// and `.segments()` would return `None` for the `expect` further down.
#[test]
fn test_interleaved_reasoning_not_flattened_regression() {
    let blocks = vec![thinking("A"), tool_use("t1"), thinking("B"), tool_use("t2")];
    let msg = make_req(blocks);

    // The flat `Text` form is the old, broken result: it no longer records which
    // reasoning block came before which tool call.
    assert!(
        !matches!(msg.reasoning_content, Some(ReasoningContent::Text(_))),
        "reasoning_content must NOT be flat Text when tool calls are interleaved; \
         Text loses positional info and forces a KV cache miss on every multi-tool turn"
    );

    let segs = msg
        .reasoning_content
        .as_ref()
        .expect("reasoning_content should be set")
        .segments()
        .expect(
            "must be Segments so a chat template can reconstruct \
             <think>A</think><call>t1</call><think>B</think><call>t2</call> \
             rather than front-loading all reasoning before all calls",
        );

    // The invariant templates depend on: segs[i] is the reasoning ahead of tool_calls[i].
    assert_eq!(segs[0], "A", "reasoning before t1");
    assert_eq!(segs[1], "B", "reasoning before t2");
    assert_eq!(segs[2], "", "no trailing reasoning");

    let tools = msg.tool_calls.as_ref().unwrap();
    assert_eq!(tools[0].id, "t1");
    assert_eq!(tools[1].id, "t2");
}
}
}
lib/llm/tests/deepseek_v32_encoding.rs
View file @
a9e06960
...
@@ -321,6 +321,76 @@ fn test_reasoning_content_survives_chat_request_parsing_and_rendering() {
...
@@ -321,6 +321,76 @@ fn test_reasoning_content_survives_chat_request_parsing_and_rendering() {
assert
!
(
rendered
.contains
(
"</think>"
));
assert
!
(
rendered
.contains
(
"</think>"
));
}
}
// Regression guard for the KV-cache flattening bug, exercised through parse + render.
//
// Models such as GLM-5 and Qwen3 (Pattern A) interleave thinking with tool calls:
//
//   <think>A</think> <call>t1</call> <think>B</think> <call>t2</call>
//
// `convert_assistant_blocks` now emits that turn with a JSON *array*:
//
//   "reasoning_content": ["A", "B", ""]
//
// When the field was `Option<String>`, a JSON array could not be deserialised into it,
// so the `.unwrap()` on parsing below would panic. With `Option<ReasoningContent>` both
// the string and the array form are accepted and the array round-trips unchanged.
#[test]
fn test_reasoning_segments_roundtrip_through_parse_and_render() {
    // Mirrors what convert_assistant_blocks produces for an interleaved GLM-5 turn:
    // [Think("A"), Tool(t1), Think("B"), Tool(t2)] -> segments = ["A", "B", ""].
    let json = r#"{
        "model": "glm-5",
        "messages": [
            {"role": "user", "content": "call two tools"},
            {
                "role": "assistant",
                "reasoning_content": ["A", "B", ""],
                "tool_calls": [
                    {"id": "t1", "type": "function", "function": {"name": "f1", "arguments": "{}"}},
                    {"id": "t2", "type": "function", "function": {"name": "f2", "arguments": "{}"}}
                ]
            },
            {"role": "tool", "tool_call_id": "t1", "content": "r1"},
            {"role": "tool", "tool_call_id": "t2", "content": "r2"}
        ]
    }"#;

    // Parsing is the step that failed while the field was a plain Option<String>.
    let request: NvCreateChatCompletionRequest = serde_json::from_str(json).unwrap();

    // The segments have to come back out of serde_json as an array, entry for entry.
    let messages_json = serde_json::to_value(request.messages()).unwrap();
    assert!(
        messages_json[1]["reasoning_content"].is_array(),
        "reasoning_content must serialise as a JSON array to preserve positional info; \
         a string would lose which reasoning preceded which tool call"
    );
    let segs = messages_json[1]["reasoning_content"].as_array().unwrap();
    assert_eq!(segs.len(), 3);
    assert_eq!(segs[0].as_str().unwrap(), "A"); // ahead of t1
    assert_eq!(segs[1].as_str().unwrap(), "B"); // ahead of t2
    assert_eq!(segs[2].as_str().unwrap(), ""); // nothing after the last call

    // Rendering must keep the reasoning when it arrives as segments. (DeepSeek V3.2
    // joins the segments into a single <think> block; this only checks that the text
    // is not silently dropped.)
    let formatter =
        dynamo_llm::preprocessor::prompt::deepseek_v32::DeepSeekV32Formatter::new_thinking();
    let rendered = formatter.render(&request).unwrap();
    assert!(
        rendered.contains("A"),
        "segment A must appear in rendered output"
    );
    assert!(
        rendered.contains("B"),
        "segment B must appear in rendered output"
    );
    assert!(rendered.contains("<think>"));
    assert!(rendered.contains("</think>"));
}
#[test]
#[test]
fn
test_tool_call_formatting
()
{
fn
test_tool_call_formatting
()
{
let
messages
=
serde_json
::
json!
([
let
messages
=
serde_json
::
json!
([
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment