qwen_parser.rs 10 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
use async_trait::async_trait;
use regex::Regex;
use serde_json::Value;

use crate::tool_parser::{
    errors::{ToolParserError, ToolParserResult},
    partial_json::PartialJson,
    state::ParseState,
    traits::ToolParser,
    types::{FunctionCall, StreamResult, ToolCall},
};

/// Qwen format parser for tool calls
///
/// Handles the Qwen 2.5/3 specific format:
/// `<tool_call>\n{"name": "func", "arguments": {...}}\n</tool_call>`
///
/// Features:
/// - XML-style tags with JSON content
/// - Support for multiple sequential tool calls
/// - Newline-aware parsing
pub struct QwenParser {
    /// Parser for handling incomplete JSON during streaming
    partial_json: PartialJson,
    /// Regex for extracting tool calls
    extractor: Regex,
}

impl QwenParser {
    /// Create a new Qwen parser
    pub fn new() -> Self {
        // Use (?s) flag for DOTALL mode to handle newlines
        let pattern = r"(?s)<tool_call>\n(.*?)\n</tool_call>";
        let extractor = Regex::new(pattern).expect("Valid regex pattern");

        Self {
            partial_json: PartialJson::default(),
            extractor,
        }
    }

    /// Parse a single JSON object into a ToolCall
    fn parse_single_object(&self, obj: &Value, index: usize) -> ToolParserResult<Option<ToolCall>> {
        let name = obj.get("name").and_then(|v| v.as_str());

        if let Some(name) = name {
            // Get arguments - Qwen uses "arguments" key
            let empty_obj = Value::Object(serde_json::Map::new());
            let args = obj.get("arguments").unwrap_or(&empty_obj);

            // Convert arguments to JSON string
            let arguments = serde_json::to_string(args)
                .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?;

            // Generate ID with index for multiple tools
            let id = format!("qwen_call_{}", index);

            Ok(Some(ToolCall {
                id,
                r#type: "function".to_string(),
                function: FunctionCall {
                    name: name.to_string(),
                    arguments,
                },
            }))
        } else {
            Ok(None)
        }
    }

    /// Check if text contains Qwen tool markers
    fn has_tool_markers(&self, text: &str) -> bool {
        text.contains("<tool_call>")
    }

    /// Find the start position of a tool call
    fn find_tool_start(&self, text: &str) -> Option<usize> {
        text.find("<tool_call>\n")
    }

    /// Find the end position of a tool call
    fn find_tool_end(&self, text: &str, start_pos: usize) -> Option<usize> {
        let search_from = start_pos + "<tool_call>\n".len();
        text[search_from..]
            .find("\n</tool_call>")
            .map(|pos| search_from + pos + "\n</tool_call>".len())
    }

    /// Check if buffer ends with a partial token
    fn ends_with_partial_token(&self, buffer: &str) -> Option<usize> {
        // Check for partial start token
        let start_token = "<tool_call>\n";
        // Use inclusive range to check if entire buffer could be a prefix
        for i in 1..=start_token.len().min(buffer.len()) {
            if start_token.starts_with(&buffer[buffer.len() - i..]) {
                return Some(i);
            }
        }

        // Check for partial end token
        let end_token = "\n</tool_call>";
102
103
104
105
106
107
108
        // Only check if buffer ends with a partial match (not the complete token without newline)
        // If buffer ends with "</tool_call>", that's not a partial token - it's missing the newline
        if buffer.ends_with("</tool_call>") {
            // This is a complete end tag, just missing the leading newline
            // Not a partial token situation
            return None;
        }
109
110
111
112
113
114
115
116
117
118
119
120
121
122
        // Use inclusive range to check if entire buffer could be a prefix
        (1..=end_token.len().min(buffer.len()))
            .find(|&i| end_token.starts_with(&buffer[buffer.len() - i..]))
    }
}

impl Default for QwenParser {
    fn default() -> Self {
        Self::new()
    }
}

#[async_trait]
impl ToolParser for QwenParser {
123
    async fn parse_complete(&self, text: &str) -> ToolParserResult<(String, Vec<ToolCall>)> {
124
125
        // Check if text contains Qwen format
        if !self.has_tool_markers(text) {
126
            return Ok((text.to_string(), vec![]));
127
128
        }

129
130
131
        // Find where the first tool call begins
        let idx = text.find("<tool_call>").unwrap(); // Safe because has_tool_markers checked
        let normal_text = text[..idx].to_string();
132

133
134
135
        // Extract tool calls
        let mut tools = Vec::new();
        for (index, captures) in self.extractor.captures_iter(text).enumerate() {
136
            if let Some(json_str) = captures.get(1) {
137
138
139
140
141
142
143
                let parsed = serde_json::from_str::<Value>(json_str.as_str().trim())
                    .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))
                    .and_then(|v| self.parse_single_object(&v, index));

                match parsed {
                    Ok(Some(tool)) => tools.push(tool),
                    Ok(None) => continue,
144
                    Err(e) => {
145
                        tracing::warn!("Failed to parse tool call {}: {:?}", index, e);
146
                        continue;
147
148
149
150
151
                    }
                }
            }
        }

152
153
154
155
        // If no tools were successfully parsed despite having markers, return entire text as fallback
        if tools.is_empty() {
            return Ok((text.to_string(), vec![]));
        }
156
157

        Ok((normal_text, tools))
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
    }

    async fn parse_incremental(
        &self,
        chunk: &str,
        state: &mut ParseState,
    ) -> ToolParserResult<StreamResult> {
        state.buffer.push_str(chunk);

        // Check for partial token at end of buffer
        if let Some(_partial_len) = self.ends_with_partial_token(&state.buffer) {
            // Hold back the partial token
            return Ok(StreamResult::Incomplete);
        }

        // Check if we have the start marker
        if !self.has_tool_markers(&state.buffer) {
175
176
177
178
179
180
181
182
183
184
185
186
            // No tool markers detected - return all buffered content as normal text
            let normal_text = std::mem::take(&mut state.buffer);
            return Ok(StreamResult::NormalText(normal_text));
        }

        // Check for text before tool markers and extract it as normal text
        if let Some(marker_pos) = state.buffer.find("<tool_call>") {
            if marker_pos > 0 {
                // We have text before the tool marker - extract it as normal text
                let normal_text: String = state.buffer.drain(..marker_pos).collect();
                return Ok(StreamResult::NormalText(normal_text));
            }
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
        }

        // Find start and end positions
        if let Some(start_pos) = self.find_tool_start(&state.buffer) {
            // Check if we have the complete tool call
            if let Some(end_pos) = self.find_tool_end(&state.buffer, start_pos) {
                // Extract the JSON content
                let json_start = start_pos + "<tool_call>\n".len();
                let json_end = end_pos - "\n</tool_call>".len();
                let json_str = &state.buffer[json_start..json_end];

                // Parse the complete JSON
                match serde_json::from_str::<Value>(json_str.trim()) {
                    Ok(value) => {
                        if let Some(tool) = self.parse_single_object(&value, 0)? {
                            // Clear the consumed part from buffer using drain for efficiency
                            state.buffer.drain(..end_pos);
                            return Ok(StreamResult::ToolComplete(tool));
                        }
                    }
                    Err(_) => {
208
209
210
211
212
213
                        // JSON parsing failed, might be incomplete or malformed
                        // If we have what looks like a complete tool call block, treat as normal text
                        if state.buffer[start_pos..end_pos].contains("\n</tool_call>") {
                            let malformed_text: String = state.buffer.drain(..end_pos).collect();
                            return Ok(StreamResult::NormalText(malformed_text));
                        }
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
                    }
                }
            } else {
                // We have start but no end yet - try partial parsing
                let json_start = start_pos + "<tool_call>\n".len();
                let partial_json = &state.buffer[json_start..];

                // Remove trailing newline if present (might be start of end token)
                let partial_json = partial_json.trim_end();

                // Try to parse with partial JSON parser
                match self.partial_json.parse_value(partial_json) {
                    Ok((value, _consumed)) => {
                        // Extract tool name if available
                        if let Some(name) = value.get("name").and_then(|v| v.as_str()) {
                            // Check if we've already sent the name
                            if !state.in_string {
                                state.in_string = true; // Use as flag for "name sent"
                                return Ok(StreamResult::ToolName {
                                    index: 0,
                                    name: name.to_string(),
                                });
                            }

                            // Check for arguments
                            if let Some(args) = value.get("arguments") {
                                if let Ok(args_str) = serde_json::to_string(args) {
                                    return Ok(StreamResult::ToolArguments {
                                        index: 0,
                                        arguments: args_str,
                                    });
                                }
                            }
                        }
                    }
                    Err(_) => {
                        // Failed to parse even as partial JSON
                        // Keep buffering
                    }
                }
            }
        }

        Ok(StreamResult::Incomplete)
    }

    fn detect_format(&self, text: &str) -> bool {
261
        self.has_tool_markers(text)
262
263
    }
}