sglang, commit e483c1ea (unverified)
Authored by Simo Lin on Oct 17, 2025; committed via GitHub on Oct 17, 2025
Parent: da681f35

[router] Fix UTF-8 Boundary Panic in Stop Sequence Decoder (#11766)

1 changed file with 158 additions and 12 deletions:
sgl-router/src/tokenizer/stop.rs (+158, -12)
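For context: Rust panics at runtime when a str or String is sliced at a byte index that is not a character boundary, which is what the pre-fix partial-match scan could do. A minimal repro of that failure mode (illustrative input, not taken from the commit):

fn main() {
    let s = " ×"; // 3 bytes: a space, then '×' encoded as [0xC3, 0x97]
    // Byte index 2 falls inside '×', so this slice panics with
    // "byte index 2 is not a char boundary; it is inside '×' (bytes 1..3) of ` ×`"
    let _ = &s[2..];
}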
@@ -149,29 +149,45 @@ impl StopSequenceDecoder {
         // Check for partial matches: is the end of jail_buffer the start of any stop_seq?
         // This handles stop sequences split across tokens
-        let mut longest_partial = 0;
+        let buffer_len = self.jail_buffer.len();
+        let mut best_split_pos: Option<usize> = None;
         for stop_seq in self
             .config
             .stop_sequences
             .iter()
             .chain(&self.config.visible_stop_sequences)
         {
-            // Check suffixes of jail_buffer that match prefixes of stop_seq
-            // We check up to stop_seq.len() - 1 to avoid rechecking exact matches
-            let max_len = self.jail_buffer.len().min(stop_seq.len() - 1);
-            for len in 1..=max_len {
-                let suffix = &self.jail_buffer[self.jail_buffer.len() - len..];
-                if stop_seq.starts_with(suffix) {
-                    longest_partial = longest_partial.max(len);
+            let stop_len = stop_seq.len();
+            if stop_len <= 1 || buffer_len == 0 {
+                continue;
+            }
+            let max_len = buffer_len.min(stop_len - 1);
+            for len in (1..=max_len).rev() {
+                let suffix_start = buffer_len - len;
+                if !self.jail_buffer.is_char_boundary(suffix_start) {
+                    continue;
+                }
+                let suffix = &self.jail_buffer[suffix_start..];
+                if stop_seq.starts_with(suffix)
+                    && best_split_pos.is_none_or(|current| suffix_start < current)
+                {
+                    best_split_pos = Some(suffix_start);
+                    break;
                 }
             }
         }

-        if longest_partial > 0 {
+        if let Some(split_pos) = best_split_pos {
             // Hold the partial match, flush the rest
-            let split_pos = self.jail_buffer.len() - longest_partial;
-            let to_output = self.jail_buffer[..split_pos].to_string();
-            self.jail_buffer = self.jail_buffer[split_pos..].to_string();
+            // Drain [0..split_pos] as output, keep [split_pos..] in jail_buffer
+            let to_output = self.jail_buffer.drain(..split_pos).collect::<String>();

             if to_output.is_empty() {
                 Ok(SequenceDecoderOutput::Held)
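The heart of the fix is the str::is_char_boundary guard: suffix lengths here are byte counts, so buffer_len - len can land inside a multi-byte character, and the old code sliced there unconditionally. A standalone sketch of the patched scan (free-function form with illustrative names, not the router's actual API):

// Sketch of the patched partial-match scan for a single stop sequence:
// longest suffix first, skipping byte offsets that are not char boundaries.
fn partial_match_split(jail_buffer: &str, stop_seq: &str) -> Option<usize> {
    let buffer_len = jail_buffer.len();
    let stop_len = stop_seq.len();
    if stop_len <= 1 || buffer_len == 0 {
        return None;
    }
    let max_len = buffer_len.min(stop_len - 1);
    for len in (1..=max_len).rev() {
        let suffix_start = buffer_len - len;
        if !jail_buffer.is_char_boundary(suffix_start) {
            continue; // the old code sliced here and panicked
        }
        if stop_seq.starts_with(&jail_buffer[suffix_start..]) {
            return Some(suffix_start);
        }
    }
    None
}

fn main() {
    // "5×" is 3 bytes; checking a 1-byte suffix would slice at index 2,
    // inside '×'. The guard skips that offset instead of panicking.
    assert_eq!(partial_match_split("5×", " ×"), None);
    // A genuine partial match: the trailing space is a prefix of " ×".
    assert_eq!(partial_match_split("5 ", " ×"), Some(1));
}

Iterating (1..=max_len).rev() prefers the longest viable suffix for each stop sequence, and the commit keeps the smallest suffix_start across all stop sequences via Option::is_none_or, so the decoder holds back the largest possible partial match.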
@@ -457,4 +473,134 @@ mod tests {
         ));
     }

+    #[test]
+    fn test_utf8_multibyte_character_boundaries() {
+        // This test verifies the fix for the UTF-8 boundary panic
+        // The panic occurred when trying to slice jail_buffer at a byte index
+        // that was in the middle of a multi-byte UTF-8 character (e.g., '×')
+        use crate::tokenizer::mock::MockTokenizer;
+
+        let tokenizer = Arc::new(MockTokenizer::new());
+
+        // Configure stop sequence with a multi-byte character
+        let config = StopSequenceConfig::default().with_stop_sequence(" ×");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Simulate the scenario: jail_buffer will contain " ×" (space + multiplication sign)
+        // The '×' character is UTF-8 encoded as bytes [0xC3, 0x97] (2 bytes)
+        // When checking for partial matches, we must not slice in the middle of these bytes
+
+        // This should not panic - the fix ensures we only slice at char boundaries
+        let result = decoder.process_token(1); // Will add some text to jail_buffer
+        assert!(result.is_ok());
+
+        // Even with multi-byte UTF-8 characters in the buffer, processing should work
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_delta_character() {
+        // Test for: byte index 1 is not a char boundary; it is inside 'Δ' (bytes 0..2) of `Δ`
+        // 'Δ' (U+0394 GREEK CAPITAL LETTER DELTA) is encoded as [0xCE, 0x94] (2 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence("Δ");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_degree_character() {
+        // Test for: byte index 1 is not a char boundary; it is inside '°' (bytes 0..2) of `°`
+        // '°' (U+00B0 DEGREE SIGN) is encoded as [0xC2, 0xB0] (2 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence("°");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_triangle_character() {
+        // Test for: byte index 4 is not a char boundary; it is inside '∆' (bytes 2..5) of ` (∆`
+        // '∆' (U+2206 INCREMENT) is encoded as [0xE2, 0x88, 0x86] (3 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence(" (∆");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+        let result = decoder.process_token(3);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_en_dash_character() {
+        // Test for: byte index 3 is not a char boundary; it is inside '–' (bytes 1..4) of ` –`
+        // '–' (U+2013 EN DASH) is encoded as [0xE2, 0x80, 0x93] (3 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence(" –");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+        let result = decoder.process_token(3);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_various_characters() {
+        // Comprehensive test with multiple multi-byte UTF-8 characters
+        // Tests 2-byte, 3-byte, and 4-byte UTF-8 sequences
+        let test_cases = vec![
+            ("×", "multiplication sign - 2 bytes"),
+            ("Δ", "Greek Delta - 2 bytes"),
+            ("°", "degree sign - 2 bytes"),
+            ("∆", "increment - 3 bytes"),
+            ("–", "en dash - 3 bytes"),
+            ("€", "euro sign - 3 bytes"),
+            ("中", "Chinese character - 3 bytes"),
+            ("🚀", "rocket emoji - 4 bytes"),
+            ("💡", "lightbulb emoji - 4 bytes"),
+        ];
+
+        for (stop_char, description) in test_cases {
+            let tokenizer = Arc::new(MockTokenizer::new());
+            let config = StopSequenceConfig::default().with_stop_sequence(stop_char);
+            let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+            // Process multiple tokens - should not panic
+            for token_id in 1..=5 {
+                let result = decoder.process_token(token_id);
+                assert!(
+                    result.is_ok(),
+                    "Failed on {} with token {}",
+                    description,
+                    token_id
+                );
+            }
+        }
+    }
 }
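A side note on the flush path: the fix also replaces the old two-slice split ([..split_pos].to_string() followed by [split_pos..].to_string()) with a single drain. A hedged sketch of that behavior, with assumed buffer contents rather than real decoder state:

fn main() {
    let mut jail_buffer = String::from("hello –"); // '–' is 3 bytes in UTF-8
    let split_pos = "hello".len(); // suppose " –" is a stop-sequence prefix being held
    assert!(jail_buffer.is_char_boundary(split_pos));
    // drain(..split_pos) yields the flushed prefix and leaves the held
    // partial match in place, avoiding the second allocation of the old code.
    let to_output: String = jail_buffer.drain(..split_pos).collect();
    assert_eq!(to_output, "hello");
    assert_eq!(jail_buffer, " –"); // held until the match resolves
}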