feat: split tokenizer code into dynamo-tokenizers crate (#8185)

0d635418 · ishandhanani · GitHub · 159d9e06 · 0d635418 · 0d635418
Unverified Commit 0d635418 authored Apr 23, 2026 by ishandhanani Committed by GitHub Apr 23, 2026
12 changed files
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2419,9 +2419,9 @@ dependencies = [
 "dynamo-parsers",
 "dynamo-protocols",
 "dynamo-runtime",
+ "dynamo-tokenizers",
 "dynamo-tokens",
 "either",
- "fastokens",
 "ffmpeg-next",
 "flate2",
 "futures",
@@ -2453,7 +2453,6 @@ dependencies = [
 "reqwest 0.12.24",
 "rmp-serde",
 "rstest 0.18.2",
- "rustc-hash 1.1.0",
 "rustls 0.23.37",
 "serde",
 "serde_json",
@@ -2463,7 +2462,6 @@ dependencies = [
 "temp-env",
 "tempfile",
 "thiserror 2.0.18",
- "tiktoken-rs",
 "tmq",
 "tokenizers",
 "tokio",
@@ -2642,6 +2640,24 @@ dependencies = [
 "xxhash-rust",
 ]
+[[package]]
+name = "dynamo-tokenizers"
+version = "1.1.0"
+dependencies = [
+ "anyhow",
+ "base64 0.22.1",
+ "fastokens",
+ "rayon",
+ "rustc-hash 1.1.0",
+ "serde",
+ "serde_json",
+ "strum",
+ "tempfile",
+ "tiktoken-rs",
+ "tokenizers",
+ "tracing",
+]
 [[package]]
 name = "dynamo-tokens"
 version = "1.1.0"

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ members = [
    "lib/llm",
    "lib/runtime",
    "lib/config",
+    "lib/tokenizers",
    "lib/tokens",
    "lib/mocker",
    "lib/kv-router",
@@ -39,6 +40,7 @@ keywords = ["llm", "genai", "inference", "nvidia", "distributed"]
 dynamo-runtime = { path = "lib/runtime", version = "1.1.0" }
 dynamo-llm = { path = "lib/llm", version = "1.1.0" }
 dynamo-config = { path = "lib/config", version = "1.1.0" }
+dynamo-tokenizers = { path = "lib/tokenizers", version = "1.1.0" }
 dynamo-tokens = { path = "lib/tokens", version = "1.1.0" }
 dynamo-memory = { path = "lib/memory", version = "1.1.0" }
 dynamo-mocker = { path = "lib/mocker", version = "1.1.0" }

--- a/lib/bindings/kvbm/Cargo.lock
+++ b/lib/bindings/kvbm/Cargo.lock
@@ -1554,9 +1554,9 @@ dependencies = [
 "dynamo-parsers",
 "dynamo-protocols",
 "dynamo-runtime",
+ "dynamo-tokenizers",
 "dynamo-tokens",
 "either",
- "fastokens",
 "flate2",
 "futures",
 "futures-util",
@@ -1581,7 +1581,6 @@ dependencies = [
 "rayon",
 "reqwest",
 "rmp-serde",
- "rustc-hash 1.1.0",
 "rustls",
 "serde",
 "serde_json",
@@ -1589,7 +1588,6 @@ dependencies = [
 "strum",
 "tempfile",
 "thiserror 2.0.18",
- "tiktoken-rs",
 "tmq",
 "tokenizers",
 "tokio",
@@ -1752,6 +1750,23 @@ dependencies = [
 "xxhash-rust",
 ]
+[[package]]
+name = "dynamo-tokenizers"
+version = "1.1.0"
+dependencies = [
+ "anyhow",
+ "base64 0.22.1",
+ "fastokens",
+ "rayon",
+ "rustc-hash 1.1.0",
+ "serde",
+ "serde_json",
+ "strum",
+ "tiktoken-rs",
+ "tokenizers",
+ "tracing",
+]
 [[package]]
 name = "dynamo-tokens"
 version = "1.1.0"

--- a/lib/bindings/python/Cargo.lock
+++ b/lib/bindings/python/Cargo.lock
@@ -1566,9 +1566,9 @@ dependencies = [
 "dynamo-parsers",
 "dynamo-protocols",
 "dynamo-runtime",
+ "dynamo-tokenizers",
 "dynamo-tokens",
 "either",
- "fastokens",
 "ffmpeg-next",
 "flate2",
 "futures",
@@ -1595,7 +1595,6 @@ dependencies = [
 "rayon",
 "reqwest",
 "rmp-serde",
- "rustc-hash 1.1.0",
 "rustls",
 "serde",
 "serde_json",
@@ -1603,7 +1602,6 @@ dependencies = [
 "strum",
 "tempfile",
 "thiserror 2.0.18",
- "tiktoken-rs",
 "tmq",
 "tokenizers",
 "tokio",
@@ -1799,6 +1797,23 @@ dependencies = [
 "xxhash-rust",
 ]
+[[package]]
+name = "dynamo-tokenizers"
+version = "1.1.0"
+dependencies = [
+ "anyhow",
+ "base64 0.22.1",
+ "fastokens",
+ "rayon",
+ "rustc-hash 1.1.0",
+ "serde",
+ "serde_json",
+ "strum",
+ "tiktoken-rs",
+ "tokenizers",
+ "tracing",
+]
 [[package]]
 name = "dynamo-tokens"
 version = "1.1.0"

--- a/lib/llm/Cargo.toml
+++ b/lib/llm/Cargo.toml
@@ -57,6 +57,7 @@ dynamo-kv-router = { workspace = true, features = ["metrics", "runtime-protocols
 dynamo-memory = { workspace = true }
 dynamo-mocker = { workspace = true }
 dynamo-runtime = { workspace = true }
+dynamo-tokenizers = { workspace = true }
 dynamo-tokens = { workspace = true }
 # workspace
@@ -150,9 +151,6 @@ tokenizers = { version = "0.21.4", default-features = false, features = [
  "esaxx_fast",
  "rustls-tls",
 ] }
-tiktoken-rs = { version = "0.9", default-features = false }
-rustc-hash = "1.1"
-fastokens = { workspace = true }
 # backend
 galil-seiferas = { version = "0.1" }

--- a/lib/llm/src/lib.rs
+++ b/lib/llm/src/lib.rs
@@ -6,10 +6,6 @@
 //! The `dynamo.llm` crate is a Rust library that provides a set of traits and types for building
 //! distributed LLM inference solutions.
-use std::{fs::File, io::BufReader, path::Path};
-use anyhow::Context as _;
 pub mod backend;
 pub mod common;
 pub mod discovery;
@@ -35,7 +31,8 @@ pub mod preprocessor;
 pub mod protocols;
 pub mod recorder;
 pub mod request_template;
-pub mod tokenizers;
+pub use dynamo_tokenizers as tokenizers;
+pub use dynamo_tokenizers::{file_json_field, log_json_err};
 pub mod tokens;
 pub mod types;
 pub mod utils;
@@ -46,111 +43,6 @@ pub mod block_manager;
 #[cfg(feature = "cuda")]
 pub mod cuda;
-/// Reads a JSON file, extracts a specific field, and deserializes it into type T.
-///
-/// # Arguments
-///
-/// * `json_file_path`: Path to the JSON file.
-/// * `field_name`: The name of the field to extract from the JSON map.
-///
-/// # Returns
-///
-/// A `Result` containing the deserialized value of type `T` if successful,
-/// or an `anyhow::Error` if any step fails (file I/O, JSON parsing, field not found,
-/// or deserialization to `T` fails).
-///
-/// # Type Parameters
-///
-/// * `T`: The expected type of the field's value. `T` must implement `serde::de::DeserializeOwned`.
-pub fn file_json_field<T: serde::de::DeserializeOwned>(
-    json_file_path: &Path,
-    field_name: &str,
-) -> anyhow::Result<T> {
-    // 1. Open the file
-    let file = File::open(json_file_path)
-        .with_context(|| format!("Failed to open file: {:?}", json_file_path))?;
-    let reader = BufReader::new(file);
-    // 2. Parse the JSON file into a generic serde_json::Value
-    // We parse into `serde_json::Value` first because we need to look up a specific field.
-    // If we tried to deserialize directly into `T`, `T` would need to represent the whole JSON structure.
-    let json_data: serde_json::Value = serde_json::from_reader(reader)
-        .with_context(|| format!("Failed to parse JSON from file: {:?}", json_file_path))?;
-    // 3. Ensure the root of the JSON is an object (map)
-    let map = json_data.as_object().ok_or_else(|| {
-        anyhow::anyhow!("JSON root is not an object in file: {:?}", json_file_path)
-    })?;
-    // 4. Get the specific field's value
-    let field_value = map.get(field_name).ok_or_else(|| {
-        anyhow::anyhow!(
-            "Field '{}' not found in JSON file: {:?}",
-            field_name,
-            json_file_path
-        )
-    })?;
-    // 5. Deserialize the field's value into the target type T
-    // We need to clone `field_value` because `from_value` consumes its input.
-    serde_json::from_value(field_value.clone()).with_context(|| {
-        format!(
-            "Failed to deserialize field '{}' (value: {:?}) to the expected type from file: {:?}",
-            field_name, field_value, json_file_path
-        )
-    })
-}
-/// Pretty-print the part of JSON that has an error.
-pub fn log_json_err(filename: &str, json: &str, err: &serde_json::Error) {
-    const ERROR_PREFIX: &str = ">>     ";
-    // Only log errors that relate to the content of the JSON file
-    if !(err.is_syntax() || err.is_data()) {
-        return;
-    }
-    // These are 1 based for humans so subtract
-    let line = err.line().saturating_sub(1);
-    let column = err.column().saturating_sub(1);
-    let json_lines: Vec<&str> = json.lines().collect();
-    if json_lines.is_empty() {
-        tracing::error!("JSON parsing error in {filename}: File is empty.");
-        return;
-    }
-    // Two lines before
-    let start_index = (line - 2).max(0);
-    // The problem line and two lines after
-    let end_index = (line + 3).min(json_lines.len());
-    // Collect the context
-    let mut context_lines: Vec<String> = (start_index..end_index)
-        .map(|i| {
-            if i == line {
-                format!("{ERROR_PREFIX}{}", json_lines[i])
-            } else {
-                // Six places because tokenizer.json is very long
-                format!("{:06} {}", i + 1, json_lines[i])
-            }
-        })
-        .collect();
-    // Insert the column indicator
-    let col_indicator = "_".to_string().repeat(column + ERROR_PREFIX.len()) + "^";
-    let error_in_context_idx = line - start_index;
-    if error_in_context_idx < context_lines.len() {
-        context_lines.insert(error_in_context_idx + 1, col_indicator);
-    }
-    tracing::error!(
-        "JSON parsing error in {filename}: Line {}, column {}:\n{}",
-        err.line(),
-        err.column(),
-        context_lines.join("\n")
-    );
-}
 #[cfg(test)]
 mod file_json_field_tests {
    use super::file_json_field;

--- a/lib/tokenizers/Cargo.toml
+++ b/lib/tokenizers/Cargo.toml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+[package]
+name = "dynamo-tokenizers"
+version.workspace = true
+edition.workspace = true
+description = "Standalone tokenizer implementations for Dynamo"
+authors.workspace = true
+license.workspace = true
+homepage.workspace = true
+repository.workspace = true
+keywords.workspace = true
+[dependencies]
+anyhow = { workspace = true }
+fastokens = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+strum = { workspace = true }
+tracing = { workspace = true }
+base64 = { version = "0.22" }
+rayon = "1"
+rustc-hash = "1.1"
+tiktoken-rs = { version = "0.9", default-features = false }
+tokenizers = { version = "0.21.4", default-features = false, features = [
+  "onig",
+  "esaxx_fast",
+  "rustls-tls",
+] }
+[dev-dependencies]
+tempfile = { workspace = true }
--- a/lib/llm/src/tokenizers/README.md
+++ b/lib/llm/src/tokenizers/README.md
 # Tokenizers
 ## Introduction
-`tokenizers` is designed for efficient and versatile tokenization in natural language processing. It supports both HuggingFace models, offering a streamlined API for text encoding and decoding.
+`dynamo-tokenizers` provides efficient, versatile tokenization for NLP workloads. It supports HuggingFace and TikToken tokenizers (plus a FastTokenizer hybrid mode) through a streamlined encoding/decoding API.
 ## Features
 - **Hash Verification**: Ensures tokenization consistency and accuracy across different models.
@@ -12,7 +12,7 @@
 #### HuggingFace Tokenizer
 ```rust
-use dynamo_llm::tokenizers::hf::HuggingFaceTokenizer;
+use dynamo_tokenizers::hf::HuggingFaceTokenizer;
 let hf_tokenizer = HuggingFaceTokenizer::from_file("tests/data/sample-models/TinyLlama_v1.1/tokenizer.json")
    .expect("Failed to load HuggingFace tokenizer");
@@ -21,7 +21,7 @@ let hf_tokenizer = HuggingFaceTokenizer::from_file("tests/data/sample-models/Tin
 ### Encoding and Decoding Text
 ```rust
-use dynamo_llm::tokenizers::{HuggingFaceTokenizer, traits::{Encoder, Decoder}};
+use dynamo_tokenizers::{HuggingFaceTokenizer, traits::{Encoder, Decoder}};
 let tokenizer = HuggingFaceTokenizer::from_file("tests/data/sample-models/TinyLlama_v1.1/tokenizer.json")
    .expect("Failed to load HuggingFace tokenizer");
@@ -39,7 +39,7 @@ assert_eq!(text, decoded_text);
 // Using the Sequence object for encoding and decoding
-use dynamo_llm::tokenizers::{Sequence, Tokenizer};
+use dynamo_tokenizers::{Sequence, Tokenizer};
 use std::sync::{Arc, RwLock};
 let tokenizer = Tokenizer::from(Arc::new(tokenizer));

--- a/lib/llm/src/tokenizers/fastokens.rs
+++ b/lib/llm/src/tokenizers/fastokens.rs
@@ -62,13 +62,13 @@ impl Tokenizer for FastTokenizer {}
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tokenizers::HuggingFaceTokenizer;
+    use crate::HuggingFaceTokenizer;
    // Minimal synthetic BPE tokenizer with no normalizer or post-processor --
    // compatible with fastokens. Vocab covers: H,T,a,d,e,h,i,l,o,r,s,t,w + punctuation.
    const TOKENIZER_PATH: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
-        "/tests/data/sample-models/minimal-bpe/tokenizer.json"
+        "/../llm/tests/data/sample-models/minimal-bpe/tokenizer.json"
    );
    #[test]
@@ -124,7 +124,7 @@ mod tests {
    #[test]
    fn test_fast_with_decode_stream() {
-        use crate::tokenizers::Tokenizer as TokenizerWrapper;
+        use crate::Tokenizer as TokenizerWrapper;
        use std::sync::Arc;
        let tokenizer = Arc::new(FastTokenizer::from_file(TOKENIZER_PATH).unwrap());

--- a/lib/llm/src/tokenizers/hf.rs
+++ b/lib/llm/src/tokenizers/hf.rs
--- a/lib/llm/src/tokenizers.rs
+++ b/lib/llm/src/tokenizers.rs
@@ -11,9 +11,9 @@ pub mod tiktoken;
 use std::hash::{DefaultHasher, Hash, Hasher};
 use std::sync::Arc;
-use std::{ops::Deref, path::Path};
+use std::{fs::File, io::BufReader, ops::Deref, path::Path};
-use crate::protocols::TokenIdType;
+use anyhow::Context as _;
 pub use anyhow::{Error, Result};
 pub use fastokens::FastTokenizer;
@@ -21,6 +21,8 @@ pub use hf::HuggingFaceTokenizer;
 pub use tiktoken::TikTokenTokenizer;
 pub use traits::DecodeResult;
+pub type TokenIdType = u32;
 /// Represents the type of tokenizer being used
 #[derive(Debug)]
 pub enum TokenizerType {
@@ -131,6 +133,80 @@ pub mod traits {
    }
 }
+/// Reads a JSON file, extracts one top-level field, and deserializes it into `T`.
+///
+/// # Arguments
+///
+/// * `json_file_path`: Path to the JSON file.
+/// * `field_name`: Name of the field to look up in the root JSON object.
+///
+/// # Errors
+///
+/// Returns an `anyhow::Error` (with the file path in its context) when the file
+/// cannot be opened, its content is not valid JSON, the JSON root is not an
+/// object, the field is absent, or the field's value cannot be deserialized
+/// into `T`.
+///
+/// # Type Parameters
+///
+/// * `T`: The expected type of the field's value; must implement
+///   `serde::de::DeserializeOwned`.
+pub fn file_json_field<T: serde::de::DeserializeOwned>(
+    json_file_path: &Path,
+    field_name: &str,
+) -> anyhow::Result<T> {
+    // 1. Open the file
+    let file = File::open(json_file_path)
+        .with_context(|| format!("Failed to open file: {:?}", json_file_path))?;
+    let reader = BufReader::new(file);
+    // 2. Parse into a generic `serde_json::Value` first, because only one field is
+    //    wanted; deserializing the file directly into `T` would require `T` to
+    //    describe the whole document.
+    let json_data: serde_json::Value = serde_json::from_reader(reader)
+        .with_context(|| format!("Failed to parse JSON from file: {:?}", json_file_path))?;
+    // 3. Ensure the root of the JSON is an object (map)
+    let map = json_data.as_object().ok_or_else(|| {
+        anyhow::anyhow!("JSON root is not an object in file: {:?}", json_file_path)
+    })?;
+    // 4. Get the specific field's value
+    let field_value = map.get(field_name).ok_or_else(|| {
+        anyhow::anyhow!(
+            "Field '{}' not found in JSON file: {:?}",
+            field_name,
+            json_file_path
+        )
+    })?;
+    // 5. Deserialize straight from the borrowed value. `&serde_json::Value`
+    //    implements `Deserializer`, so no clone of the (possibly large) subtree is
+    //    needed, and `field_value` stays available for the error message below.
+    T::deserialize(field_value).with_context(|| {
+        format!(
+            "Failed to deserialize field '{}' (value: {:?}) to the expected type from file: {:?}",
+            field_name, field_value, json_file_path
+        )
+    })
+}
+/// Logs an excerpt of `json` around the position reported by `err`.
+///
+/// Only syntax and data errors are reported; any other error category returns
+/// without logging. The excerpt holds up to two lines before the failing line,
+/// the failing line itself (marked with a `>>` prefix), a caret line pointing at
+/// the failing column, and up to two lines after. Non-failing lines are prefixed
+/// with their zero-padded, 1-based line number.
+///
+/// # Arguments
+///
+/// * `filename`: Name shown in the log message; it is not opened or read here.
+/// * `json`: The JSON text that failed to parse.
+/// * `err`: The error whose line and column select the excerpt.
+pub fn log_json_err(filename: &str, json: &str, err: &serde_json::Error) {
+    const ERROR_PREFIX: &str = ">>     ";
+    // Only log errors that relate to the content of the JSON file
+    if !err.is_syntax() && !err.is_data() {
+        return;
+    }
+    let source_lines: Vec<&str> = json.lines().collect();
+    if source_lines.is_empty() {
+        tracing::error!("JSON parsing error in {filename}: File is empty.");
+        return;
+    }
+    // Reported positions are 1-based for humans; shift them to 0-based indices.
+    let err_row = err.line().saturating_sub(1);
+    let err_col = err.column().saturating_sub(1);
+    // Window: two lines before the failing line through two lines after it,
+    // clamped to the lines actually present.
+    let window_start = err_row.saturating_sub(2);
+    let window_end = err_row.saturating_add(3).min(source_lines.len());
+    let mut excerpt: Vec<String> = Vec::new();
+    for (row, text) in source_lines.iter().enumerate().take(window_end).skip(window_start) {
+        if row == err_row {
+            excerpt.push(format!("{ERROR_PREFIX}{text}"));
+            // Column indicator goes directly under the failing line, offset past the prefix.
+            excerpt.push("_".repeat(err_col + ERROR_PREFIX.len()) + "^");
+        } else {
+            // Six places because tokenizer.json is very long
+            excerpt.push(format!("{:06} {}", row + 1, text));
+        }
+    }
+    tracing::error!(
+        "JSON parsing error in {filename}: Line {}, column {}:\n{}",
+        err.line(),
+        err.column(),
+        excerpt.join("\n")
+    );
+}
 impl Encoding {
    pub fn get_hash(&self) -> u64 {
        let mut hasher = DefaultHasher::new();

--- a/lib/llm/src/tokenizers/tiktoken.rs
+++ b/lib/llm/src/tokenizers/tiktoken.rs
@@ -172,7 +172,7 @@ fn detect_bpe_pattern(directory: &Path) -> Result<&'static str> {
        _ => Err(Error::msg(format!(
            "Unsupported tiktoken model_type '{model_type}'. \
             Currently supported: kimi, kimi_k2, kimi_k25, deepseek_v3. \
-             To add a new model type, extend detect_bpe_pattern() in tokenizers/tiktoken.rs \
+             To add a new model type, extend detect_bpe_pattern() in lib/tokenizers/src/tiktoken.rs \
             with the appropriate BPE regex pattern. \
             Alternatively, provide a tokenizer.json (HuggingFace format) instead."
        ))),
@@ -249,7 +249,7 @@ fn load_special_tokens(directory: &Path, num_base_tokens: usize) -> Result<FxHas
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tokenizers::DecodeStream;
+    use crate::DecodeStream;
    use std::io::Write;
    use std::sync::Arc;
@@ -643,7 +643,7 @@ mod tests {
    fn test_decode_stream_incremental_multibyte_reassembly() {
        let dir = tempfile::tempdir().unwrap();
        let tokenizer = create_byte_token_tokenizer(dir.path());
-        let tokenizer_arc: Arc<dyn crate::tokenizers::traits::Tokenizer> = Arc::new(tokenizer);
+        let tokenizer_arc: Arc<dyn crate::traits::Tokenizer> = Arc::new(tokenizer);
        let mut stream = DecodeStream::new(tokenizer_arc, &[5], false);
@@ -663,7 +663,7 @@ mod tests {
    fn test_decode_stream_incremental_emoji_reassembly() {
        let dir = tempfile::tempdir().unwrap();
        let tokenizer = create_byte_token_tokenizer(dir.path());
-        let tokenizer_arc: Arc<dyn crate::tokenizers::traits::Tokenizer> = Arc::new(tokenizer);
+        let tokenizer_arc: Arc<dyn crate::traits::Tokenizer> = Arc::new(tokenizer);
        let mut stream = DecodeStream::new(tokenizer_arc, &[5], false);