".github/vscode:/vscode.git/clone" did not exist on "466b1993d8d8fd287f2119eecc663b3932a728ba"
Unverified Commit 9ad38639 authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

fix(frontend): Kimi tokenizer fix - use absolute token ID in tiktoken reserved...

fix(frontend): Kimi tokenizer fix - use absolute token ID in tiktoken reserved token fallback names (#7886)
parent d94c2ea2
...@@ -181,7 +181,7 @@ fn load_special_tokens(directory: &Path, num_base_tokens: usize) -> Result<FxHas ...@@ -181,7 +181,7 @@ fn load_special_tokens(directory: &Path, num_base_tokens: usize) -> Result<FxHas
// No tokenizer_config.json — generate default reserved tokens // No tokenizer_config.json — generate default reserved tokens
for i in 0..DEFAULT_NUM_RESERVED_SPECIAL_TOKENS { for i in 0..DEFAULT_NUM_RESERVED_SPECIAL_TOKENS {
let id = num_base_tokens as u32 + i; let id = num_base_tokens as u32 + i;
special_tokens.insert(format!("<|reserved_token_{i}|>"), id); special_tokens.insert(format!("<|reserved_token_{id}|>"), id);
} }
return Ok(special_tokens); return Ok(special_tokens);
} }
...@@ -222,14 +222,14 @@ fn load_special_tokens(directory: &Path, num_base_tokens: usize) -> Result<FxHas ...@@ -222,14 +222,14 @@ fn load_special_tokens(directory: &Path, num_base_tokens: usize) -> Result<FxHas
for i in 0..DEFAULT_NUM_RESERVED_SPECIAL_TOKENS { for i in 0..DEFAULT_NUM_RESERVED_SPECIAL_TOKENS {
let id = num_base_tokens as u32 + i; let id = num_base_tokens as u32 + i;
if !used_ids.contains(&id) { if !used_ids.contains(&id) {
special_tokens.insert(format!("<|reserved_token_{i}|>"), id); special_tokens.insert(format!("<|reserved_token_{id}|>"), id);
} }
} }
} else { } else {
// No added_tokens_decoder — generate default reserved tokens // No added_tokens_decoder — generate default reserved tokens
for i in 0..DEFAULT_NUM_RESERVED_SPECIAL_TOKENS { for i in 0..DEFAULT_NUM_RESERVED_SPECIAL_TOKENS {
let id = num_base_tokens as u32 + i; let id = num_base_tokens as u32 + i;
special_tokens.insert(format!("<|reserved_token_{i}|>"), id); special_tokens.insert(format!("<|reserved_token_{id}|>"), id);
} }
} }
...@@ -433,8 +433,8 @@ mod tests { ...@@ -433,8 +433,8 @@ mod tests {
let dir = tempfile::tempdir().unwrap(); let dir = tempfile::tempdir().unwrap();
let tokens = load_special_tokens(dir.path(), 100).unwrap(); let tokens = load_special_tokens(dir.path(), 100).unwrap();
assert_eq!(tokens.len(), 256); assert_eq!(tokens.len(), 256);
assert_eq!(tokens["<|reserved_token_0|>"], 100); assert_eq!(tokens["<|reserved_token_100|>"], 100);
assert_eq!(tokens["<|reserved_token_255|>"], 355); assert_eq!(tokens["<|reserved_token_355|>"], 355);
} }
#[test] #[test]
...@@ -660,4 +660,128 @@ mod tests { ...@@ -660,4 +660,128 @@ mod tests {
assert_eq!(decoded, *input); assert_eq!(decoded, *input);
} }
} }
/// Helper: create a tiktoken file containing all 256 single-byte tokens (ranks 0..255).
/// This gives a complete byte-level base vocabulary so any ASCII string can be encoded.
fn create_byte_level_tiktoken_file(dir: &Path) -> String {
let engine = base64::engine::general_purpose::STANDARD;
let mut content = String::new();
for byte_val in 0u16..256 {
let encoded = engine.encode([byte_val as u8]);
content.push_str(&format!("{encoded} {byte_val}\n"));
}
let file_path = dir.join("tiktoken.model");
std::fs::write(&file_path, &content).unwrap();
file_path.to_str().unwrap().to_string()
}
/// Regression test for Kimi K2.5 tokenizer inflation.
///
/// Python's tokenization_kimi.py names unnamed reserved tokens by absolute ID:
/// `<|reserved_token_{absolute_id}|>`
///
/// The Rust code previously used relative offsets (0..255) as the naming index,
/// so when a prompt contained `<|reserved_token_258|>` the Rust tokenizer did NOT
/// recognize it as a special token. Each occurrence was encoded as multiple BPE
/// tokens instead of 1, inflating an 8192-token prompt to 9038 tokens and causing
/// TRT-LLM to reject the request.
#[test]
fn test_reserved_token_absolute_id_naming_kimi_k25_regression() {
let dir = tempfile::tempdir().unwrap();
let file_path = create_byte_level_tiktoken_file(dir.path());
// config.json with kimi model type -> triggers KIMI_PATTERN
create_test_config(dir.path(), "kimi");
// tokenizer_config.json: BOS at 256, EOS at 257. Base vocab is IDs 0..255.
create_test_tokenizer_config(dir.path(), 256);
let tokenizer = TikTokenTokenizer::from_file_auto(&file_path).unwrap();
// ID 256 = [BOS], ID 257 = [EOS].
// ID 258 = first UNNAMED reserved token.
// With fix: named <|reserved_token_258|> (absolute ID)
// Before fix: named <|reserved_token_2|> (relative offset)
// Single unnamed reserved token should be recognized as 1 special token.
let single = "<|reserved_token_258|>";
let enc = tokenizer.encode(single).unwrap();
assert_eq!(
enc.token_ids().len(),
1,
"'{single}' should be 1 special token, got {} tokens: {:?}. \
This means fallback naming still uses relative offsets instead of absolute IDs.",
enc.token_ids().len(),
enc.token_ids()
);
assert_eq!(enc.token_ids()[0], 258);
// Multiple unnamed reserved tokens in sequence (mini version of the benchmark).
// IDs 258..268 are all unnamed; with the fix they're <|reserved_token_258|>..267.
let multi: String = (258u32..268)
.map(|id| format!("<|reserved_token_{id}|>"))
.collect();
let enc_multi = tokenizer.encode(&multi).unwrap();
assert_eq!(
enc_multi.token_ids().len(),
10,
"10 reserved token strings should produce exactly 10 tokens, got {}: {:?}",
enc_multi.token_ids().len(),
enc_multi.token_ids()
);
let expected_ids: Vec<u32> = (258..268).collect();
assert_eq!(enc_multi.token_ids(), &expected_ids);
}
/// Confirm that the old relative-offset naming would cause token inflation.
/// Manually builds a tokenizer whose special-token map uses the WRONG names
/// (relative offsets), then shows the same string encodes as many tokens.
#[test]
fn test_relative_offset_naming_causes_inflation() {
let dir = tempfile::tempdir().unwrap();
let file_path = create_byte_level_tiktoken_file(dir.path());
let _encoder = parse_tiktoken_file(&file_path).unwrap();
let num_base_tokens = 256usize;
// Build special tokens with the OLD (buggy) relative-offset naming
let mut bad_special_tokens: FxHashMap<String, u32> = FxHashMap::default();
bad_special_tokens.insert("[BOS]".to_string(), 256);
bad_special_tokens.insert("[EOS]".to_string(), 257);
for i in 0..DEFAULT_NUM_RESERVED_SPECIAL_TOKENS {
let id = num_base_tokens as u32 + i;
if id != 256 && id != 257 {
// OLD naming: relative offset i, not absolute id
bad_special_tokens.insert(format!("<|reserved_token_{i}|>"), id);
}
}
let bad_tokenizer =
TikTokenTokenizer::from_file(&file_path, KIMI_PATTERN, bad_special_tokens).unwrap();
// With the wrong naming, <|reserved_token_258|> is NOT recognized as special.
// It gets split into byte-level BPE tokens -> many more than 1.
let input = "<|reserved_token_258|>";
let enc = bad_tokenizer.encode(input).unwrap();
assert!(
enc.token_ids().len() > 1,
"With buggy relative-offset naming, '{}' should NOT be recognized as a \
single special token. Got {} token(s): {:?}",
input,
enc.token_ids().len(),
enc.token_ids()
);
// Show the inflation: 10 reserved tokens produce far more than 10 BPE tokens.
let multi: String = (258u32..268)
.map(|id| format!("<|reserved_token_{id}|>"))
.collect();
let enc_multi = bad_tokenizer.encode(&multi).unwrap();
assert!(
enc_multi.token_ids().len() > 10,
"With buggy naming, 10 reserved token strings should inflate beyond 10 tokens. \
Got {}",
enc_multi.token_ids().len(),
);
}
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment