fix(frontend): Kimi tokenizer fix - use absolute token ID in tiktoken reserved...

fix(frontend): Kimi tokenizer fix - use absolute token ID in tiktoken reserved token fallback names (#7886)

fix(frontend): Kimi tokenizer fix - use absolute token ID in tiktoken reserved...
fix(frontend): Kimi tokenizer fix - use absolute token ID in tiktoken reserved token fallback names (#7886)
9ad38639 · Biswa Panda · GitHub · d94c2ea2 · 9ad38639
Unverified Commit 9ad38639 authored Apr 03, 2026 by Biswa Panda Committed by GitHub Apr 03, 2026
Show whitespace changes
Inline Side-by-side

Showing with 129 additions and 5 deletions

lib/llm/src/tokenizers/tiktoken.rs lib/llm/src/tokenizers/tiktoken.rs +129 -5

No files found.
--- a/lib/llm/src/tokenizers/tiktoken.rs
+++ b/lib/llm/src/tokenizers/tiktoken.rs
@@ -181,7 +181,7 @@ fn load_special_tokens(directory: &Path, num_base_tokens: usize) -> Result<FxHas
        // No tokenizer_config.json — generate default reserved tokens
        for i in 0..DEFAULT_NUM_RESERVED_SPECIAL_TOKENS {
            let id = num_base_tokens as u32 + i;
-            special_tokens.insert(format!("<|reserved_token_{i}|>"), id);
+            special_tokens.insert(format!("<|reserved_token_{id}|>"), id);
        }
        return Ok(special_tokens);
    }
@@ -222,14 +222,14 @@ fn load_special_tokens(directory: &Path, num_base_tokens: usize) -> Result<FxHas
        for i in 0..DEFAULT_NUM_RESERVED_SPECIAL_TOKENS {
            let id = num_base_tokens as u32 + i;
            if !used_ids.contains(&id) {
-                special_tokens.insert(format!("<|reserved_token_{i}|>"), id);
+                special_tokens.insert(format!("<|reserved_token_{id}|>"), id);
            }
        }
    } else {
        // No added_tokens_decoder — generate default reserved tokens
        for i in 0..DEFAULT_NUM_RESERVED_SPECIAL_TOKENS {
            let id = num_base_tokens as u32 + i;
-            special_tokens.insert(format!("<|reserved_token_{i}|>"), id);
+            special_tokens.insert(format!("<|reserved_token_{id}|>"), id);
        }
    }
@@ -433,8 +433,8 @@ mod tests {
        let dir = tempfile::tempdir().unwrap();
        let tokens = load_special_tokens(dir.path(), 100).unwrap();
        assert_eq!(tokens.len(), 256);
-        assert_eq!(tokens["<|reserved_token_0|>"], 100);
+        assert_eq!(tokens["<|reserved_token_100|>"], 100);
-        assert_eq!(tokens["<|reserved_token_255|>"], 355);
+        assert_eq!(tokens["<|reserved_token_355|>"], 355);
    }
    #[test]
@@ -660,4 +660,128 @@ mod tests {
            assert_eq!(decoded, *input);
        }
    }
+    /// Helper: create a tiktoken file containing all 256 single-byte tokens (ranks 0..255).
+    /// This gives a complete byte-level base vocabulary so any ASCII string can be encoded.
+    fn create_byte_level_tiktoken_file(dir: &Path) -> String {
+        let engine = base64::engine::general_purpose::STANDARD;
+        let mut content = String::new();
+        for byte_val in 0u16..256 {
+            let encoded = engine.encode([byte_val as u8]);
+            content.push_str(&format!("{encoded} {byte_val}\n"));
+        }
+        let file_path = dir.join("tiktoken.model");
+        std::fs::write(&file_path, &content).unwrap();
+        file_path.to_str().unwrap().to_string()
+    }
+    /// Regression test for Kimi K2.5 tokenizer inflation.
+    ///
+    /// Python's tokenization_kimi.py names unnamed reserved tokens by absolute ID:
+    ///   `<|reserved_token_{absolute_id}|>`
+    ///
+    /// The Rust code previously used relative offsets (0..255) as the naming index,
+    /// so when a prompt contained `<|reserved_token_258|>` the Rust tokenizer did NOT
+    /// recognize it as a special token. Each occurrence was encoded as multiple BPE
+    /// tokens instead of 1, inflating an 8192-token prompt to 9038 tokens and causing
+    /// TRT-LLM to reject the request.
+    #[test]
+    fn test_reserved_token_absolute_id_naming_kimi_k25_regression() {
+        let dir = tempfile::tempdir().unwrap();
+        let file_path = create_byte_level_tiktoken_file(dir.path());
+        // config.json with kimi model type -> triggers KIMI_PATTERN
+        create_test_config(dir.path(), "kimi");
+        // tokenizer_config.json: BOS at 256, EOS at 257. Base vocab is IDs 0..255.
+        create_test_tokenizer_config(dir.path(), 256);
+        let tokenizer = TikTokenTokenizer::from_file_auto(&file_path).unwrap();
+        // ID 256 = [BOS], ID 257 = [EOS].
+        // ID 258 = first UNNAMED reserved token.
+        //   With fix:   named <|reserved_token_258|>  (absolute ID)
+        //   Before fix: named <|reserved_token_2|>    (relative offset)
+        // Single unnamed reserved token should be recognized as 1 special token.
+        let single = "<|reserved_token_258|>";
+        let enc = tokenizer.encode(single).unwrap();
+        assert_eq!(
+            enc.token_ids().len(),
+            1,
+            "'{single}' should be 1 special token, got {} tokens: {:?}. \
+             This means fallback naming still uses relative offsets instead of absolute IDs.",
+            enc.token_ids().len(),
+            enc.token_ids()
+        );
+        assert_eq!(enc.token_ids()[0], 258);
+        // Multiple unnamed reserved tokens in sequence (mini version of the benchmark).
+        // IDs 258..268 are all unnamed; with the fix they're <|reserved_token_258|>..267.
+        let multi: String = (258u32..268)
+            .map(|id| format!("<|reserved_token_{id}|>"))
+            .collect();
+        let enc_multi = tokenizer.encode(&multi).unwrap();
+        assert_eq!(
+            enc_multi.token_ids().len(),
+            10,
+            "10 reserved token strings should produce exactly 10 tokens, got {}: {:?}",
+            enc_multi.token_ids().len(),
+            enc_multi.token_ids()
+        );
+        let expected_ids: Vec<u32> = (258..268).collect();
+        assert_eq!(enc_multi.token_ids(), &expected_ids);
+    }
+    /// Confirm that the old relative-offset naming would cause token inflation.
+    /// Manually builds a tokenizer whose special-token map uses the WRONG names
+    /// (relative offsets), then shows the same string encodes as many tokens.
+    #[test]
+    fn test_relative_offset_naming_causes_inflation() {
+        let dir = tempfile::tempdir().unwrap();
+        let file_path = create_byte_level_tiktoken_file(dir.path());
+        let _encoder = parse_tiktoken_file(&file_path).unwrap();
+        let num_base_tokens = 256usize;
+        // Build special tokens with the OLD (buggy) relative-offset naming
+        let mut bad_special_tokens: FxHashMap<String, u32> = FxHashMap::default();
+        bad_special_tokens.insert("[BOS]".to_string(), 256);
+        bad_special_tokens.insert("[EOS]".to_string(), 257);
+        for i in 0..DEFAULT_NUM_RESERVED_SPECIAL_TOKENS {
+            let id = num_base_tokens as u32 + i;
+            if id != 256 && id != 257 {
+                // OLD naming: relative offset i, not absolute id
+                bad_special_tokens.insert(format!("<|reserved_token_{i}|>"), id);
+            }
+        }
+        let bad_tokenizer =
+            TikTokenTokenizer::from_file(&file_path, KIMI_PATTERN, bad_special_tokens).unwrap();
+        // With the wrong naming, <|reserved_token_258|> is NOT recognized as special.
+        // It gets split into byte-level BPE tokens -> many more than 1.
+        let input = "<|reserved_token_258|>";
+        let enc = bad_tokenizer.encode(input).unwrap();
+        assert!(
+            enc.token_ids().len() > 1,
+            "With buggy relative-offset naming, '{}' should NOT be recognized as a \
+             single special token. Got {} token(s): {:?}",
+            input,
+            enc.token_ids().len(),
+            enc.token_ids()
+        );
+        // Show the inflation: 10 reserved tokens produce far more than 10 BPE tokens.
+        let multi: String = (258u32..268)
+            .map(|id| format!("<|reserved_token_{id}|>"))
+            .collect();
+        let enc_multi = bad_tokenizer.encode(&multi).unwrap();
+        assert!(
+            enc_multi.token_ids().len() > 10,
+            "With buggy naming, 10 reserved token strings should inflate beyond 10 tokens. \
+             Got {}",
+            enc_multi.token_ids().len(),
+        );
+    }
 }