fix(parsers): recover Kimi K2 tool calls when section_end is missing (#8208)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

fix(parsers): recover Kimi K2 tool calls when section_end is missing (#8208)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
47cfbd46 · Keiven C · GitHub · adfc02d5 · 47cfbd46 · 47cfbd46
Unverified Commit 47cfbd46 authored Apr 20, 2026 by Keiven C Committed by GitHub Apr 20, 2026
4 changed files
--- a/lib/llm/src/protocols/openai/chat_completions/jail.rs
+++ b/lib/llm/src/protocols/openai/chat_completions/jail.rs
@@ -843,7 +843,12 @@ impl JailedStream {
                // parallel tool calls instead of splitting at the first end tag.
                // Fall back to Path 1 when parsing fails (e.g. malformed content).
                if early_exit {
-                    // For early exit, find where the complete tool call ends
+                    // For early exit, find where the complete tool call ends.
+                    // `find_tool_call_end_position` returns `None` when the
+                    // section wrapper isn't closed (e.g. kimi_k2 without
+                    // section_end). In that case, don't early-exit — more
+                    // parallel calls may follow. The calls will be recovered
+                    // by `finalize()` at stream end.
                    if let Some(parser) = &self.tool_call_parser {
                        let tools_slice = self.tool_definitions.as_deref();
                        if let Ok((_, _)) = try_tool_call_parse_aggregate(
@@ -853,8 +858,9 @@ impl JailedStream {
                        )
                        .await
                        {
-                            let split_pos =
-                                find_tool_call_end_position(accumulated_content, Some(parser));
+                            if let Some(split_pos) =
+                                find_tool_call_end_position(accumulated_content, Some(parser))
+                            {
                                (true, split_pos)
                            } else {
                                (false, accumulated_content.len())
@@ -862,6 +868,9 @@ impl JailedStream {
                        } else {
                            (false, accumulated_content.len())
                        }
+                    } else {
+                        (false, accumulated_content.len())
+                    }
                } else if let Some((end_pos, _)) = end_marker_info {
                    (true, end_pos)
                } else {

--- a/lib/llm/tests/test_streaming_tool_parsers.rs
+++ b/lib/llm/tests/test_streaming_tool_parsers.rs
@@ -1105,4 +1105,155 @@ mod tests {
            "finish_reason validation failed for non-tool call case"
        );
    }
+
+    // ---- Kimi K2 streaming jail reproduction tests ----
+    //
+    // These reproduce the customer-reported issue (DIS-1765): Kimi K2 agentic
+    // workflows hitting finish_reason=length repeatedly because the jail never
+    // exits when section_end is missing.
+
+    /// Helper: build a single streaming chunk with optional finish_reason
+    fn make_chunk(
+        content: &str,
+        finish_reason: Option<FinishReason>,
+    ) -> Annotated<NvCreateChatCompletionStreamResponse> {
+        #[allow(deprecated)]
+        let choice = ChatChoiceStream {
+            index: 0,
+            delta: dynamo_protocols::types::ChatCompletionStreamResponseDelta {
+                role: Some(dynamo_protocols::types::Role::Assistant),
+                content: Some(ChatCompletionMessageContent::Text(content.to_string())),
+                tool_calls: None,
+                function_call: None,
+                refusal: None,
+                reasoning_content: None,
+            },
+            finish_reason,
+            stop_reason: None,
+            logprobs: None,
+        };
+        Annotated {
+            id: Some("test-kimi".to_string()),
+            data: Some(NvCreateChatCompletionStreamResponse {
+                inner: dynamo_protocols::types::CreateChatCompletionStreamResponse {
+                    id: "test-kimi".to_string(),
+                    choices: vec![choice],
+                    created: 1234567890,
+                    model: "kimi-k2".to_string(),
+                    system_fingerprint: None,
+                    object: "chat.completion.chunk".to_string(),
+                    usage: None,
+                    service_tier: None,
+                },
+                nvext: None,
+            }),
+            event: None,
+            comment: None,
+            error: None,
+        }
+    }
+
+    /// Repro: complete Kimi K2 tool call stream, section_end present → should work.
+    /// This is the baseline / control test.
+    #[tokio::test]
+    async fn test_kimi_k2_streaming_complete_section() {
+        let chunks = vec![
+            make_chunk("<|tool_calls_section_begin|>", None),
+            make_chunk("<|tool_call_begin|>functions.get_weather:0", None),
+            make_chunk("<|tool_call_argument_begin|>", None),
+            make_chunk(r#"{"location":"NYC"}"#, None),
+            make_chunk("<|tool_call_end|>", None),
+            make_chunk("<|tool_calls_section_end|>", Some(FinishReason::Stop)),
+        ];
+
+        let input_stream = stream::iter(chunks);
+        let output_chunks =
+            parse_response_stream(input_stream, true, false, Some("kimi_k2".to_string()), None)
+                .await;
+
+        let aggregated = aggregate_content_from_chunks(&output_chunks);
+
+        assert!(
+            aggregated.has_tool_calls,
+            "Baseline: complete Kimi K2 section should produce tool calls"
+        );
+        assert_eq!(aggregated.tool_calls.len(), 1);
+        assert_eq!(
+            aggregated.tool_calls[0]["function"]["name"].as_str(),
+            Some("get_weather")
+        );
+    }
+
+    /// Repro for DIS-1765: model hits max_tokens BEFORE emitting section_end.
+    /// Individual tool call is complete (call_begin + args + call_end), but
+    /// section_end is missing. The jail should still extract the tool call at
+    /// finalize time instead of emitting raw marker text.
+    #[tokio::test]
+    async fn test_kimi_k2_streaming_missing_section_end_max_tokens() {
+        let chunks = vec![
+            make_chunk("<|tool_calls_section_begin|>", None),
+            make_chunk("<|tool_call_begin|>functions.get_weather:0", None),
+            make_chunk("<|tool_call_argument_begin|>", None),
+            make_chunk(r#"{"location":"NYC"}"#, None),
+            make_chunk("<|tool_call_end|>", None),
+            // Stream ends here — model hit max_tokens, no section_end.
+            make_chunk("", Some(FinishReason::Length)),
+        ];
+
+        let input_stream = stream::iter(chunks);
+        let output_chunks =
+            parse_response_stream(input_stream, true, false, Some("kimi_k2".to_string()), None)
+                .await;
+
+        let aggregated = aggregate_content_from_chunks(&output_chunks);
+
+        // BUG: currently the jail stays open, finalize calls the parser which
+        // requires section_end, returns 0 tool calls, and the accumulated
+        // content (with raw markers) is emitted as plain text. The client sees
+        // garbage instead of a structured tool call.
+        assert!(
+            aggregated.has_tool_calls,
+            "Should extract tool calls even when section_end is missing (max_tokens truncation). \
+             Currently broken: jail emits raw marker text as content instead."
+        );
+        assert_eq!(aggregated.tool_calls.len(), 1);
+        assert_eq!(
+            aggregated.tool_calls[0]["function"]["name"].as_str(),
+            Some("get_weather")
+        );
+    }
+
+    /// Repro: multiple complete tool calls, no section_end (max_tokens).
+    #[tokio::test]
+    async fn test_kimi_k2_streaming_multiple_calls_missing_section_end() {
+        let chunks = vec![
+            make_chunk("<|tool_calls_section_begin|>", None),
+            make_chunk(
+                "<|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>",
+                None,
+            ),
+            make_chunk(r#"{"location":"NYC"}<|tool_call_end|>"#, None),
+            make_chunk(
+                "<|tool_call_begin|>functions.get_time:1<|tool_call_argument_begin|>",
+                None,
+            ),
+            make_chunk(
+                r#"{"timezone":"EST"}<|tool_call_end|>"#,
+                Some(FinishReason::Length),
+            ),
+        ];
+
+        let input_stream = stream::iter(chunks);
+        let output_chunks =
+            parse_response_stream(input_stream, true, false, Some("kimi_k2".to_string()), None)
+                .await;
+
+        let aggregated = aggregate_content_from_chunks(&output_chunks);
+
+        assert!(
+            aggregated.has_tool_calls,
+            "Should extract both tool calls even without section_end"
+        );
+        assert_eq!(aggregated.tool_calls.len(), 2);
+    }
 }
--- a/lib/parsers/src/tool_calling/parsers.rs
+++ b/lib/parsers/src/tool_calling/parsers.rs
@@ -164,7 +164,12 @@ pub fn detect_tool_call_start(chunk: &str, parser_str: Option<&str>) -> anyhow::
    }
 }

-pub fn find_tool_call_end_position(chunk: &str, parser_str: Option<&str>) -> usize {
+/// Returns the byte offset immediately after the last complete tool-call
+/// section in `chunk`. Returns `None` when the parser detects that the section
+/// is not properly closed (e.g. kimi_k2 without `section_end`), signalling
+/// that the caller should NOT treat the current buffer as a complete result —
+/// more content may follow.
+pub fn find_tool_call_end_position(chunk: &str, parser_str: Option<&str>) -> Option<usize> {
    let parser_map = get_tool_parser_map();
    let parser_key = match parser_str {
        Some(s) if !s.is_empty() => s,
@@ -174,35 +179,36 @@ pub fn find_tool_call_end_position(chunk: &str, parser_str: Option<&str>) -> usi
    match parser_map.get(parser_key) {
        Some(config) => match &config.parser_config {
            ParserConfig::Json(json_config) => {
-                // For "default", use "nemotron_deci" as the effective parser; otherwise, use the provided parser_key
                let effective_parser = if parser_key == "default" {
                    "nemotron_deci"
                } else {
                    parser_key
                };
-                find_tool_call_end_position_json(chunk, effective_parser, json_config)
+                Some(find_tool_call_end_position_json(
+                    chunk,
+                    effective_parser,
+                    json_config,
+                ))
            }
            ParserConfig::Harmony(json_config) => {
-                find_tool_call_end_position_harmony(chunk, json_config)
+                Some(find_tool_call_end_position_harmony(chunk, json_config))
            }
-            ParserConfig::Pythonic => find_tool_call_end_position_pythonic(chunk),
-            ParserConfig::Typescript => {
-                // Typescript parser not implemented
-                chunk.len()
+            ParserConfig::Pythonic => Some(find_tool_call_end_position_pythonic(chunk)),
+            ParserConfig::Typescript => Some(chunk.len()),
+            ParserConfig::Xml(xml_config) => {
+                Some(find_tool_call_end_position_xml(chunk, xml_config))
+            }
+            ParserConfig::Dsml(dsml_config) => {
+                Some(find_tool_call_end_position_dsml(chunk, dsml_config))
            }
-            ParserConfig::Xml(xml_config) => find_tool_call_end_position_xml(chunk, xml_config),
-            ParserConfig::Dsml(dsml_config) => find_tool_call_end_position_dsml(chunk, dsml_config),
            ParserConfig::Glm47(glm47_config) => {
-                find_tool_call_end_position_glm47(chunk, glm47_config)
+                Some(find_tool_call_end_position_glm47(chunk, glm47_config))
            }
            ParserConfig::KimiK2(kimi_config) => {
                find_tool_call_end_position_kimi_k2(chunk, kimi_config)
            }
        },
-        None => {
-            // Unknown parser, return full content length
-            chunk.len()
-        }
+        None => Some(chunk.len()),
    }
 }
 // Tests

--- a/lib/parsers/src/tool_calling/xml/kimi_k2_parser.rs
+++ b/lib/parsers/src/tool_calling/xml/kimi_k2_parser.rs
@@ -69,8 +69,13 @@ pub fn detect_tool_call_start_kimi_k2(chunk: &str, config: &KimiK2ParserConfig)

 /// Returns the position after `<|tool_calls_section_end|>` (or singular variant) or the length
 /// of the chunk if not found.
-pub fn find_tool_call_end_position_kimi_k2(chunk: &str, config: &KimiK2ParserConfig) -> usize {
-    // Find the earliest matching end token variant.
+/// Returns `Some(pos)` when `section_end` is found, or `None` when it is
+/// missing. `None` tells the streaming jail that the section is not properly
+/// closed and it should keep accumulating instead of early-exiting.
+pub fn find_tool_call_end_position_kimi_k2(
+    chunk: &str,
+    config: &KimiK2ParserConfig,
+) -> Option<usize> {
    let mut earliest: Option<usize> = None;
    for end_token in &config.section_end_variants {
        if let Some(pos) = chunk.find(end_token.as_str()) {
@@ -78,7 +83,7 @@ pub fn find_tool_call_end_position_kimi_k2(chunk: &str, config: &KimiK2ParserCon
            earliest = Some(earliest.map_or(end_pos, |e: usize| e.min(end_pos)));
        }
    }
-    earliest.unwrap_or(chunk.len())
+    earliest
 }

 /// Format:
@@ -142,6 +147,31 @@ fn find_section_end(
 }

 /// Extract tool calls and normal text from message.
+///
+/// ## Difference from Moonshot's reference implementation
+///
+/// The reference parser in
+/// [tool_call_guidance.md](https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md)
+/// requires `section_end` to extract any tool calls:
+///
+/// ```python
+/// pattern = r"<\|tool_calls_section_begin\|>(.*?)<\|tool_calls_section_end\|>"
+/// tool_calls_sections = re.findall(pattern, tool_call_rsp, re.DOTALL)
+/// ```
+///
+/// When `section_end` is missing (model hit max_tokens, EOS, or stop sequence),
+/// `re.findall` returns `[]` and all complete individual tool calls are silently
+/// dropped — even when individual calls have complete `call_begin` + args +
+/// `call_end` markers.
+///
+/// This implementation treats a missing `section_end` as "section extends to
+/// end-of-string", equivalent to:
+///
+/// ```python
+/// pattern = r"<\|tool_calls_section_begin\|>(.*?)(?:<\|tool_calls_section_end\|>|$)"
+/// ```
+///
+/// This allows recovery of complete individual tool calls from truncated output.
 fn extract_tool_calls(
    text: &str,
    config: &KimiK2ParserConfig,
@@ -158,21 +188,23 @@ fn extract_tool_calls(
            // Add text before tool call section to normal parts.
            normal_parts.push(&text[cursor..abs_start]);

+            let (block, next_cursor) =
                if let Some((end_pos, end_len)) = find_section_end(text, abs_start, config) {
                    let abs_end = abs_start + end_pos + end_len;
-                let block = &text[abs_start..abs_end];
+                    (&text[abs_start..abs_end], abs_end)
+                } else {
+                    // No section_end found — treat rest of string as section
+                    // body. Complete individual calls can still be extracted;
+                    // truly truncated calls (no call_end) are ignored by
+                    // parse_section_block's regex.
+                    (&text[abs_start..], text.len())
+                };

-                // Parse individual tool calls within this section block.
            if let Ok(mut parsed_calls) = parse_section_block(block, config, tools) {
                calls.append(&mut parsed_calls);
            }

-                cursor = abs_end;
-            } else {
-                // No end token found -> treat the rest as normal text.
-                normal_parts.push(&text[abs_start..]);
-                break;
-            }
+            cursor = next_cursor;
        } else {
            // No more tool call sections.
            normal_parts.push(&text[cursor..]);
@@ -300,11 +332,12 @@ mod tests {
        let config = default_config();
        let text = "<|tool_calls_section_begin|><|tool_call_begin|>functions.test:0<|tool_call_argument_begin|>{}<|tool_call_end|><|tool_calls_section_end|>more text";
        let pos = find_tool_call_end_position_kimi_k2(text, &config);
-        assert_eq!(&text[pos..], "more text");
+        assert_eq!(pos, Some(text.len() - "more text".len()));
+        assert_eq!(&text[pos.unwrap()..], "more text");

        let text_no_end = "<|tool_calls_section_begin|><|tool_call_begin|>functions.test:0";
        let pos = find_tool_call_end_position_kimi_k2(text_no_end, &config);
-        assert_eq!(pos, text_no_end.len());
+        assert_eq!(pos, None, "should return None when section_end is missing");
    }

    #[test]
@@ -425,18 +458,66 @@ mod tests {
        let config = default_config();
        let input = r#"<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location":"NYC"}<|tool_call_end|>"#;

-        // Should handle gracefully - section_end not found so whole text is treated as normal
+        // Missing section_end but individual tool call is complete (call_begin + args + call_end).
+        // This happens when the model hits max_tokens before emitting section_end.
+        // The parser should still extract the complete individual tool calls.
+        let (calls, _normal) = try_tool_call_parse_kimi_k2(input, &config, None).unwrap();
+        assert_eq!(
+            calls.len(),
+            1,
+            "Should parse complete tool calls even without section_end (max_tokens truncation)"
+        );
+        assert_eq!(calls[0].function.name, "get_weather");
+    }
+
+    #[test]
+    fn test_parse_truncated_mid_argument_no_section_end() {
+        let config = default_config();
+        // Model hit max_tokens mid-argument — no call_end, no section_end.
+        // Truly incomplete tool call, nothing salvageable.
+        let input = r#"<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location":"NY"#;
+
        let (calls, normal) = try_tool_call_parse_kimi_k2(input, &config, None).unwrap();
        assert_eq!(
            calls.len(),
            0,
-            "No tool calls should be parsed without section end"
+            "Truly truncated call (no call_end) should return 0 tool calls"
        );
+        // The section body is consumed by parse_section_block (which finds no
+        // complete calls), so normal content is empty — the raw markers are not
+        // re-emitted as user-visible text.
+        assert_eq!(normal, Some("".to_string()));
+    }
+
+    #[test]
+    fn test_parse_multiple_calls_no_section_end() {
+        let config = default_config();
+        // Two complete individual tool calls, but model stopped before section_end.
+        let input = r#"<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location":"NYC"}<|tool_call_end|><|tool_call_begin|>functions.get_time:1<|tool_call_argument_begin|>{"timezone":"EST"}<|tool_call_end|>"#;
+
+        let (calls, _) = try_tool_call_parse_kimi_k2(input, &config, None).unwrap();
        assert_eq!(
-            normal,
-            Some(input.to_string()),
-            "Input should be preserved as normal text"
+            calls.len(),
+            2,
+            "Should parse both complete tool calls even without section_end"
+        );
+        assert_eq!(calls[0].function.name, "get_weather");
+        assert_eq!(calls[1].function.name, "get_time");
+    }
+
+    #[test]
+    fn test_parse_complete_plus_truncated_no_section_end() {
+        let config = default_config();
+        // First call is complete, second is truncated mid-argument.
+        let input = r#"<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location":"NYC"}<|tool_call_end|><|tool_call_begin|>functions.get_time:1<|tool_call_argument_begin|>{"tz"#;
+
+        let (calls, _) = try_tool_call_parse_kimi_k2(input, &config, None).unwrap();
+        assert_eq!(
+            calls.len(),
+            1,
+            "Should parse the one complete tool call, ignoring the truncated second"
        );
+        assert_eq!(calls[0].function.name, "get_weather");
    }

    #[test]
@@ -526,7 +607,7 @@ mod tests {
        // Singular variant end token
        let text = "<|tool_call_section_begin|><|tool_call_begin|>functions.test:0<|tool_call_argument_begin|>{}<|tool_call_end|><|tool_call_section_end|>more text";
        let pos = find_tool_call_end_position_kimi_k2(text, &config);
-        assert_eq!(&text[pos..], "more text");
+        assert_eq!(&text[pos.unwrap()..], "more text");
    }

    // --- Tests inspired by vllm/sglang coverage gaps ---