Unverified Commit e483c1ea authored by Simo Lin, committed by GitHub

[router] Fix UTF-8 Boundary Panic in Stop Sequence Decoder (#11766)

parent da681f35
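
For context on the failure mode: Rust strings are UTF-8, and slicing a String at a byte offset that falls inside a multi-byte character panics at runtime. The decoder's partial-match scan computed suffix offsets from byte lengths alone, so a stop sequence containing a character such as '×' could trigger exactly that slice. A minimal, self-contained repro sketch (illustrative only, not code from this commit):

fn main() {
    let jail_buffer = String::from(" ×"); // '×' is two bytes: [0xC3, 0x97]
    assert_eq!(jail_buffer.len(), 3); // 1 byte for ' ' plus 2 bytes for '×'

    // Byte index 2 falls between 0xC3 and 0x97, inside '×':
    assert!(!jail_buffer.is_char_boundary(2));

    // Slicing there panics ("byte index 2 is not a char boundary; it is
    // inside '×'"); catch_unwind just keeps the sketch runnable.
    let sliced = std::panic::catch_unwind(|| &jail_buffer[2..]);
    assert!(sliced.is_err());
}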
@@ -149,29 +149,45 @@ impl StopSequenceDecoder {
         // Check for partial matches: is the end of jail_buffer the start of any stop_seq?
         // This handles stop sequences split across tokens
-        let mut longest_partial = 0;
+        let buffer_len = self.jail_buffer.len();
+        let mut best_split_pos: Option<usize> = None;
         for stop_seq in self
             .config
             .stop_sequences
             .iter()
             .chain(&self.config.visible_stop_sequences)
         {
-            // Check suffixes of jail_buffer that match prefixes of stop_seq
-            // We check up to stop_seq.len() - 1 to avoid rechecking exact matches
-            let max_len = self.jail_buffer.len().min(stop_seq.len() - 1);
-            for len in 1..=max_len {
-                let suffix = &self.jail_buffer[self.jail_buffer.len() - len..];
-                if stop_seq.starts_with(suffix) {
-                    longest_partial = longest_partial.max(len);
+            let stop_len = stop_seq.len();
+
+            if stop_len <= 1 || buffer_len == 0 {
+                continue;
+            }
+
+            let max_len = buffer_len.min(stop_len - 1);
+            for len in (1..=max_len).rev() {
+                let suffix_start = buffer_len - len;
+                if !self.jail_buffer.is_char_boundary(suffix_start) {
+                    continue;
+                }
+                let suffix = &self.jail_buffer[suffix_start..];
+                if stop_seq.starts_with(suffix)
+                    && best_split_pos.is_none_or(|current| suffix_start < current)
+                {
+                    best_split_pos = Some(suffix_start);
+                    break;
                 }
             }
         }
-        if longest_partial > 0 {
+        if let Some(split_pos) = best_split_pos {
             // Hold the partial match, flush the rest
-            let split_pos = self.jail_buffer.len() - longest_partial;
-            let to_output = self.jail_buffer[..split_pos].to_string();
-            self.jail_buffer = self.jail_buffer[split_pos..].to_string();
+            // Drain [0..split_pos] as output, keep [split_pos..] in jail_buffer
+            let to_output = self.jail_buffer.drain(..split_pos).collect::<String>();
             if to_output.is_empty() {
                 Ok(SequenceDecoderOutput::Held)
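
The shape of the fix lifts cleanly out of the decoder. Below is a standalone sketch of the same scan as a free function (a hypothetical helper for illustration, not the router's actual API): it walks suffix lengths from longest to shortest, skips any offset that is not a char boundary, and keeps the earliest valid split position across all stop sequences. Note that Option::is_none_or, which the commit itself uses, requires Rust 1.82 or newer.

fn best_split_pos(buffer: &str, stop_sequences: &[&str]) -> Option<usize> {
    let buffer_len = buffer.len();
    let mut best: Option<usize> = None;

    for stop_seq in stop_sequences {
        let stop_len = stop_seq.len();
        if stop_len <= 1 || buffer_len == 0 {
            continue;
        }

        let max_len = buffer_len.min(stop_len - 1);
        // Longest suffix first; stop at the first valid match per stop_seq.
        for len in (1..=max_len).rev() {
            let suffix_start = buffer_len - len;
            // The crux of the fix: never slice inside a multi-byte character.
            if !buffer.is_char_boundary(suffix_start) {
                continue;
            }
            if stop_seq.starts_with(&buffer[suffix_start..])
                && best.is_none_or(|current| suffix_start < current)
            {
                best = Some(suffix_start);
                break;
            }
        }
    }
    best
}

fn main() {
    // "42 ×" ends with " ×", a prefix of the stop sequence " × ", so the
    // text should be held from byte 2 (the char boundary before the space).
    assert_eq!(best_split_pos("42 ×", &[" × "]), Some(2));
    // No suffix of "hello" begins any stop sequence.
    assert_eq!(best_split_pos("hello", &["<eos>"]), None);
}

A side benefit of the rewrite: draining [0..split_pos] out of the buffer keeps the held suffix in place, replacing the two fresh to_string() allocations of the old code with a single drain(..split_pos) collect.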
@@ -457,4 +473,134 @@ mod tests {
             ));
         }
     }
+
+    #[test]
+    fn test_utf8_multibyte_character_boundaries() {
+        // This test verifies the fix for the UTF-8 boundary panic.
+        // The panic occurred when trying to slice jail_buffer at a byte index
+        // that was in the middle of a multi-byte UTF-8 character (e.g., '×').
+        use crate::tokenizer::mock::MockTokenizer;
+        let tokenizer = Arc::new(MockTokenizer::new());
+
+        // Configure stop sequence with a multi-byte character
+        let config = StopSequenceConfig::default().with_stop_sequence(" ×");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Simulate the scenario: jail_buffer will contain " ×" (space + multiplication sign).
+        // The '×' character is UTF-8 encoded as bytes [0xC3, 0x97] (2 bytes).
+        // When checking for partial matches, we must not slice in the middle of these bytes.
+
+        // This should not panic - the fix ensures we only slice at char boundaries
+        let result = decoder.process_token(1); // Will add some text to jail_buffer
+        assert!(result.is_ok());
+
+        // Even with multi-byte UTF-8 characters in the buffer, processing should work
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_delta_character() {
+        // Test for: byte index 1 is not a char boundary; it is inside 'Δ' (bytes 0..2) of `Δ`
+        // 'Δ' (U+0394 GREEK CAPITAL LETTER DELTA) is encoded as [0xCE, 0x94] (2 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence("Δ");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_degree_character() {
+        // Test for: byte index 1 is not a char boundary; it is inside '°' (bytes 0..2) of `°`
+        // '°' (U+00B0 DEGREE SIGN) is encoded as [0xC2, 0xB0] (2 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence("°");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_triangle_character() {
+        // Test for: byte index 4 is not a char boundary; it is inside '∆' (bytes 2..5) of ` (∆`
+        // '∆' (U+2206 INCREMENT) is encoded as [0xE2, 0x88, 0x86] (3 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence(" (∆");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+        let result = decoder.process_token(3);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_en_dash_character() {
+        // Test for: byte index 3 is not a char boundary; it is inside '–' (bytes 1..4) of ` –`
+        // '–' (U+2013 EN DASH) is encoded as [0xE2, 0x80, 0x93] (3 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence(" –");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+        let result = decoder.process_token(3);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_various_characters() {
+        // Comprehensive test with multiple multi-byte UTF-8 characters:
+        // 2-byte, 3-byte, and 4-byte UTF-8 sequences
+        let test_cases = vec![
+            ("×", "multiplication sign - 2 bytes"),
+            ("Δ", "Greek Delta - 2 bytes"),
+            ("°", "degree sign - 2 bytes"),
+            ("∆", "increment - 3 bytes"),
+            ("–", "en dash - 3 bytes"),
+            ("€", "euro sign - 3 bytes"),
+            ("中", "Chinese character - 3 bytes"),
+            ("🚀", "rocket emoji - 4 bytes"),
+            ("💡", "lightbulb emoji - 4 bytes"),
+        ];
+
+        for (stop_char, description) in test_cases {
+            let tokenizer = Arc::new(MockTokenizer::new());
+            let config = StopSequenceConfig::default().with_stop_sequence(stop_char);
+            let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+            // Process multiple tokens - should not panic
+            for token_id in 1..=5 {
+                let result = decoder.process_token(token_id);
+                assert!(
+                    result.is_ok(),
+                    "Failed on {} with token {}",
+                    description,
+                    token_id
+                );
+            }
+        }
+    }
 }
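
The byte widths the test comments cite are easy to spot-check, since str::len counts bytes rather than characters. A quick standalone sketch (not part of this commit):

fn main() {
    assert_eq!("×".len(), 2); // U+00D7 MULTIPLICATION SIGN -> [0xC3, 0x97]
    assert_eq!("Δ".len(), 2); // U+0394 GREEK CAPITAL LETTER DELTA -> [0xCE, 0x94]
    assert_eq!("°".len(), 2); // U+00B0 DEGREE SIGN -> [0xC2, 0xB0]
    assert_eq!("∆".len(), 3); // U+2206 INCREMENT -> [0xE2, 0x88, 0x86]
    assert_eq!("–".len(), 3); // U+2013 EN DASH -> [0xE2, 0x80, 0x93]
    assert_eq!("€".len(), 3); // U+20AC EURO SIGN
    assert_eq!("中".len(), 3); // U+4E2D
    assert_eq!("🚀".len(), 4); // U+1F680 ROCKET
    assert_eq!("💡".len(), 4); // U+1F4A1 ELECTRIC LIGHT BULB
}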