"tests/vscode:/vscode.git/clone" did not exist on "5b2a9422f0f4cbdd69a9fed1dc1605838314ff81"
Unverified Commit 47cfbd46 authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

fix(parsers): recover Kimi K2 tool calls when section_end is missing (#8208)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent adfc02d5
......@@ -843,7 +843,12 @@ impl JailedStream {
// parallel tool calls instead of splitting at the first end tag.
// Fall back to Path 1 when parsing fails (e.g. malformed content).
if early_exit {
// For early exit, find where the complete tool call ends
// For early exit, find where the complete tool call ends.
// `find_tool_call_end_position` returns `None` when the
// section wrapper isn't closed (e.g. kimi_k2 without
// section_end). In that case, don't early-exit — more
// parallel calls may follow. The calls will be recovered
// by `finalize()` at stream end.
if let Some(parser) = &self.tool_call_parser {
let tools_slice = self.tool_definitions.as_deref();
if let Ok((_, _)) = try_tool_call_parse_aggregate(
......@@ -853,8 +858,9 @@ impl JailedStream {
)
.await
{
let split_pos =
find_tool_call_end_position(accumulated_content, Some(parser));
if let Some(split_pos) =
find_tool_call_end_position(accumulated_content, Some(parser))
{
(true, split_pos)
} else {
(false, accumulated_content.len())
......@@ -862,6 +868,9 @@ impl JailedStream {
} else {
(false, accumulated_content.len())
}
} else {
(false, accumulated_content.len())
}
} else if let Some((end_pos, _)) = end_marker_info {
(true, end_pos)
} else {
......
......@@ -1105,4 +1105,155 @@ mod tests {
"finish_reason validation failed for non-tool call case"
);
}
// ---- Kimi K2 streaming jail reproduction tests ----
//
// These reproduce the customer-reported issue (DIS-1765): Kimi K2 agentic
// workflows hitting finish_reason=length repeatedly because the jail never
// exits when section_end is missing.
/// Helper: build a single streaming chunk with optional finish_reason
fn make_chunk(
content: &str,
finish_reason: Option<FinishReason>,
) -> Annotated<NvCreateChatCompletionStreamResponse> {
#[allow(deprecated)]
let choice = ChatChoiceStream {
index: 0,
delta: dynamo_protocols::types::ChatCompletionStreamResponseDelta {
role: Some(dynamo_protocols::types::Role::Assistant),
content: Some(ChatCompletionMessageContent::Text(content.to_string())),
tool_calls: None,
function_call: None,
refusal: None,
reasoning_content: None,
},
finish_reason,
stop_reason: None,
logprobs: None,
};
Annotated {
id: Some("test-kimi".to_string()),
data: Some(NvCreateChatCompletionStreamResponse {
inner: dynamo_protocols::types::CreateChatCompletionStreamResponse {
id: "test-kimi".to_string(),
choices: vec![choice],
created: 1234567890,
model: "kimi-k2".to_string(),
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
service_tier: None,
},
nvext: None,
}),
event: None,
comment: None,
error: None,
}
}
/// Repro: complete Kimi K2 tool call stream, section_end present → should work.
/// This is the baseline / control test.
#[tokio::test]
async fn test_kimi_k2_streaming_complete_section() {
let chunks = vec![
make_chunk("<|tool_calls_section_begin|>", None),
make_chunk("<|tool_call_begin|>functions.get_weather:0", None),
make_chunk("<|tool_call_argument_begin|>", None),
make_chunk(r#"{"location":"NYC"}"#, None),
make_chunk("<|tool_call_end|>", None),
make_chunk("<|tool_calls_section_end|>", Some(FinishReason::Stop)),
];
let input_stream = stream::iter(chunks);
let output_chunks =
parse_response_stream(input_stream, true, false, Some("kimi_k2".to_string()), None)
.await;
let aggregated = aggregate_content_from_chunks(&output_chunks);
assert!(
aggregated.has_tool_calls,
"Baseline: complete Kimi K2 section should produce tool calls"
);
assert_eq!(aggregated.tool_calls.len(), 1);
assert_eq!(
aggregated.tool_calls[0]["function"]["name"].as_str(),
Some("get_weather")
);
}
/// Repro for DIS-1765: model hits max_tokens BEFORE emitting section_end.
/// Individual tool call is complete (call_begin + args + call_end), but
/// section_end is missing. The jail should still extract the tool call at
/// finalize time instead of emitting raw marker text.
#[tokio::test]
async fn test_kimi_k2_streaming_missing_section_end_max_tokens() {
let chunks = vec![
make_chunk("<|tool_calls_section_begin|>", None),
make_chunk("<|tool_call_begin|>functions.get_weather:0", None),
make_chunk("<|tool_call_argument_begin|>", None),
make_chunk(r#"{"location":"NYC"}"#, None),
make_chunk("<|tool_call_end|>", None),
// Stream ends here — model hit max_tokens, no section_end.
make_chunk("", Some(FinishReason::Length)),
];
let input_stream = stream::iter(chunks);
let output_chunks =
parse_response_stream(input_stream, true, false, Some("kimi_k2".to_string()), None)
.await;
let aggregated = aggregate_content_from_chunks(&output_chunks);
// BUG: currently the jail stays open, finalize calls the parser which
// requires section_end, returns 0 tool calls, and the accumulated
// content (with raw markers) is emitted as plain text. The client sees
// garbage instead of a structured tool call.
assert!(
aggregated.has_tool_calls,
"Should extract tool calls even when section_end is missing (max_tokens truncation). \
Currently broken: jail emits raw marker text as content instead."
);
assert_eq!(aggregated.tool_calls.len(), 1);
assert_eq!(
aggregated.tool_calls[0]["function"]["name"].as_str(),
Some("get_weather")
);
}
/// Repro: multiple complete tool calls, no section_end (max_tokens).
#[tokio::test]
async fn test_kimi_k2_streaming_multiple_calls_missing_section_end() {
let chunks = vec![
make_chunk("<|tool_calls_section_begin|>", None),
make_chunk(
"<|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>",
None,
),
make_chunk(r#"{"location":"NYC"}<|tool_call_end|>"#, None),
make_chunk(
"<|tool_call_begin|>functions.get_time:1<|tool_call_argument_begin|>",
None,
),
make_chunk(
r#"{"timezone":"EST"}<|tool_call_end|>"#,
Some(FinishReason::Length),
),
];
let input_stream = stream::iter(chunks);
let output_chunks =
parse_response_stream(input_stream, true, false, Some("kimi_k2".to_string()), None)
.await;
let aggregated = aggregate_content_from_chunks(&output_chunks);
assert!(
aggregated.has_tool_calls,
"Should extract both tool calls even without section_end"
);
assert_eq!(aggregated.tool_calls.len(), 2);
}
}
......@@ -164,7 +164,12 @@ pub fn detect_tool_call_start(chunk: &str, parser_str: Option<&str>) -> anyhow::
}
}
pub fn find_tool_call_end_position(chunk: &str, parser_str: Option<&str>) -> usize {
/// Returns the byte offset immediately after the last complete tool-call
/// section in `chunk`. Returns `None` when the parser detects that the section
/// is not properly closed (e.g. kimi_k2 without `section_end`), signalling
/// that the caller should NOT treat the current buffer as a complete result —
/// more content may follow.
pub fn find_tool_call_end_position(chunk: &str, parser_str: Option<&str>) -> Option<usize> {
let parser_map = get_tool_parser_map();
let parser_key = match parser_str {
Some(s) if !s.is_empty() => s,
......@@ -174,35 +179,36 @@ pub fn find_tool_call_end_position(chunk: &str, parser_str: Option<&str>) -> usi
match parser_map.get(parser_key) {
Some(config) => match &config.parser_config {
ParserConfig::Json(json_config) => {
// For "default", use "nemotron_deci" as the effective parser; otherwise, use the provided parser_key
let effective_parser = if parser_key == "default" {
"nemotron_deci"
} else {
parser_key
};
find_tool_call_end_position_json(chunk, effective_parser, json_config)
Some(find_tool_call_end_position_json(
chunk,
effective_parser,
json_config,
))
}
ParserConfig::Harmony(json_config) => {
find_tool_call_end_position_harmony(chunk, json_config)
Some(find_tool_call_end_position_harmony(chunk, json_config))
}
ParserConfig::Pythonic => find_tool_call_end_position_pythonic(chunk),
ParserConfig::Typescript => {
// Typescript parser not implemented
chunk.len()
ParserConfig::Pythonic => Some(find_tool_call_end_position_pythonic(chunk)),
ParserConfig::Typescript => Some(chunk.len()),
ParserConfig::Xml(xml_config) => {
Some(find_tool_call_end_position_xml(chunk, xml_config))
}
ParserConfig::Dsml(dsml_config) => {
Some(find_tool_call_end_position_dsml(chunk, dsml_config))
}
ParserConfig::Xml(xml_config) => find_tool_call_end_position_xml(chunk, xml_config),
ParserConfig::Dsml(dsml_config) => find_tool_call_end_position_dsml(chunk, dsml_config),
ParserConfig::Glm47(glm47_config) => {
find_tool_call_end_position_glm47(chunk, glm47_config)
Some(find_tool_call_end_position_glm47(chunk, glm47_config))
}
ParserConfig::KimiK2(kimi_config) => {
find_tool_call_end_position_kimi_k2(chunk, kimi_config)
}
},
None => {
// Unknown parser, return full content length
chunk.len()
}
None => Some(chunk.len()),
}
}
// Tests
......
......@@ -69,8 +69,13 @@ pub fn detect_tool_call_start_kimi_k2(chunk: &str, config: &KimiK2ParserConfig)
/// Returns the position after `<|tool_calls_section_end|>` (or singular variant) or the length
/// of the chunk if not found.
pub fn find_tool_call_end_position_kimi_k2(chunk: &str, config: &KimiK2ParserConfig) -> usize {
// Find the earliest matching end token variant.
/// Returns `Some(pos)` when `section_end` is found, or `None` when it is
/// missing. `None` tells the streaming jail that the section is not properly
/// closed and it should keep accumulating instead of early-exiting.
pub fn find_tool_call_end_position_kimi_k2(
chunk: &str,
config: &KimiK2ParserConfig,
) -> Option<usize> {
let mut earliest: Option<usize> = None;
for end_token in &config.section_end_variants {
if let Some(pos) = chunk.find(end_token.as_str()) {
......@@ -78,7 +83,7 @@ pub fn find_tool_call_end_position_kimi_k2(chunk: &str, config: &KimiK2ParserCon
earliest = Some(earliest.map_or(end_pos, |e: usize| e.min(end_pos)));
}
}
earliest.unwrap_or(chunk.len())
earliest
}
/// Format:
......@@ -142,6 +147,31 @@ fn find_section_end(
}
/// Extract tool calls and normal text from message.
///
/// ## Difference from Moonshot's reference implementation
///
/// The reference parser in
/// [tool_call_guidance.md](https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md)
/// requires `section_end` to extract any tool calls:
///
/// ```python
/// pattern = r"<\|tool_calls_section_begin\|>(.*?)<\|tool_calls_section_end\|>"
/// tool_calls_sections = re.findall(pattern, tool_call_rsp, re.DOTALL)
/// ```
///
/// When `section_end` is missing (model hit max_tokens, EOS, or stop sequence),
/// `re.findall` returns `[]` and all complete individual tool calls are silently
/// dropped — even when individual calls have complete `call_begin` + args +
/// `call_end` markers.
///
/// This implementation treats a missing `section_end` as "section extends to
/// end-of-string", equivalent to:
///
/// ```python
/// pattern = r"<\|tool_calls_section_begin\|>(.*?)(?:<\|tool_calls_section_end\|>|$)"
/// ```
///
/// This allows recovery of complete individual tool calls from truncated output.
fn extract_tool_calls(
text: &str,
config: &KimiK2ParserConfig,
......@@ -158,21 +188,23 @@ fn extract_tool_calls(
// Add text before tool call section to normal parts.
normal_parts.push(&text[cursor..abs_start]);
let (block, next_cursor) =
if let Some((end_pos, end_len)) = find_section_end(text, abs_start, config) {
let abs_end = abs_start + end_pos + end_len;
let block = &text[abs_start..abs_end];
(&text[abs_start..abs_end], abs_end)
} else {
// No section_end found — treat rest of string as section
// body. Complete individual calls can still be extracted;
// truly truncated calls (no call_end) are ignored by
// parse_section_block's regex.
(&text[abs_start..], text.len())
};
// Parse individual tool calls within this section block.
if let Ok(mut parsed_calls) = parse_section_block(block, config, tools) {
calls.append(&mut parsed_calls);
}
cursor = abs_end;
} else {
// No end token found -> treat the rest as normal text.
normal_parts.push(&text[abs_start..]);
break;
}
cursor = next_cursor;
} else {
// No more tool call sections.
normal_parts.push(&text[cursor..]);
......@@ -300,11 +332,12 @@ mod tests {
let config = default_config();
let text = "<|tool_calls_section_begin|><|tool_call_begin|>functions.test:0<|tool_call_argument_begin|>{}<|tool_call_end|><|tool_calls_section_end|>more text";
let pos = find_tool_call_end_position_kimi_k2(text, &config);
assert_eq!(&text[pos..], "more text");
assert_eq!(pos, Some(text.len() - "more text".len()));
assert_eq!(&text[pos.unwrap()..], "more text");
let text_no_end = "<|tool_calls_section_begin|><|tool_call_begin|>functions.test:0";
let pos = find_tool_call_end_position_kimi_k2(text_no_end, &config);
assert_eq!(pos, text_no_end.len());
assert_eq!(pos, None, "should return None when section_end is missing");
}
#[test]
......@@ -425,18 +458,66 @@ mod tests {
let config = default_config();
let input = r#"<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location":"NYC"}<|tool_call_end|>"#;
// Should handle gracefully - section_end not found so whole text is treated as normal
// Missing section_end but individual tool call is complete (call_begin + args + call_end).
// This happens when the model hits max_tokens before emitting section_end.
// The parser should still extract the complete individual tool calls.
let (calls, _normal) = try_tool_call_parse_kimi_k2(input, &config, None).unwrap();
assert_eq!(
calls.len(),
1,
"Should parse complete tool calls even without section_end (max_tokens truncation)"
);
assert_eq!(calls[0].function.name, "get_weather");
}
#[test]
fn test_parse_truncated_mid_argument_no_section_end() {
let config = default_config();
// Model hit max_tokens mid-argument — no call_end, no section_end.
// Truly incomplete tool call, nothing salvageable.
let input = r#"<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location":"NY"#;
let (calls, normal) = try_tool_call_parse_kimi_k2(input, &config, None).unwrap();
assert_eq!(
calls.len(),
0,
"No tool calls should be parsed without section end"
"Truly truncated call (no call_end) should return 0 tool calls"
);
// The section body is consumed by parse_section_block (which finds no
// complete calls), so normal content is empty — the raw markers are not
// re-emitted as user-visible text.
assert_eq!(normal, Some("".to_string()));
}
#[test]
fn test_parse_multiple_calls_no_section_end() {
let config = default_config();
// Two complete individual tool calls, but model stopped before section_end.
let input = r#"<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location":"NYC"}<|tool_call_end|><|tool_call_begin|>functions.get_time:1<|tool_call_argument_begin|>{"timezone":"EST"}<|tool_call_end|>"#;
let (calls, _) = try_tool_call_parse_kimi_k2(input, &config, None).unwrap();
assert_eq!(
normal,
Some(input.to_string()),
"Input should be preserved as normal text"
calls.len(),
2,
"Should parse both complete tool calls even without section_end"
);
assert_eq!(calls[0].function.name, "get_weather");
assert_eq!(calls[1].function.name, "get_time");
}
#[test]
fn test_parse_complete_plus_truncated_no_section_end() {
let config = default_config();
// First call is complete, second is truncated mid-argument.
let input = r#"<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location":"NYC"}<|tool_call_end|><|tool_call_begin|>functions.get_time:1<|tool_call_argument_begin|>{"tz"#;
let (calls, _) = try_tool_call_parse_kimi_k2(input, &config, None).unwrap();
assert_eq!(
calls.len(),
1,
"Should parse the one complete tool call, ignoring the truncated second"
);
assert_eq!(calls[0].function.name, "get_weather");
}
#[test]
......@@ -526,7 +607,7 @@ mod tests {
// Singular variant end token
let text = "<|tool_call_section_begin|><|tool_call_begin|>functions.test:0<|tool_call_argument_begin|>{}<|tool_call_end|><|tool_call_section_end|>more text";
let pos = find_tool_call_end_position_kimi_k2(text, &config);
assert_eq!(&text[pos..], "more text");
assert_eq!(&text[pos.unwrap()..], "more text");
}
// --- Tests inspired by vllm/sglang coverage gaps ---
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment