"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "e8ee2a78dbc08d398d5e798a149657b8aa821850"
Unverified Commit 0d635418 authored by ishandhanani's avatar ishandhanani Committed by GitHub
Browse files

feat: split tokenizer code into dynamo-tokenizers crate (#8185)

parent 159d9e06
...@@ -2419,9 +2419,9 @@ dependencies = [ ...@@ -2419,9 +2419,9 @@ dependencies = [
"dynamo-parsers", "dynamo-parsers",
"dynamo-protocols", "dynamo-protocols",
"dynamo-runtime", "dynamo-runtime",
"dynamo-tokenizers",
"dynamo-tokens", "dynamo-tokens",
"either", "either",
"fastokens",
"ffmpeg-next", "ffmpeg-next",
"flate2", "flate2",
"futures", "futures",
...@@ -2453,7 +2453,6 @@ dependencies = [ ...@@ -2453,7 +2453,6 @@ dependencies = [
"reqwest 0.12.24", "reqwest 0.12.24",
"rmp-serde", "rmp-serde",
"rstest 0.18.2", "rstest 0.18.2",
"rustc-hash 1.1.0",
"rustls 0.23.37", "rustls 0.23.37",
"serde", "serde",
"serde_json", "serde_json",
...@@ -2463,7 +2462,6 @@ dependencies = [ ...@@ -2463,7 +2462,6 @@ dependencies = [
"temp-env", "temp-env",
"tempfile", "tempfile",
"thiserror 2.0.18", "thiserror 2.0.18",
"tiktoken-rs",
"tmq", "tmq",
"tokenizers", "tokenizers",
"tokio", "tokio",
...@@ -2642,6 +2640,24 @@ dependencies = [ ...@@ -2642,6 +2640,24 @@ dependencies = [
"xxhash-rust", "xxhash-rust",
] ]
[[package]]
name = "dynamo-tokenizers"
version = "1.1.0"
dependencies = [
"anyhow",
"base64 0.22.1",
"fastokens",
"rayon",
"rustc-hash 1.1.0",
"serde",
"serde_json",
"strum",
"tempfile",
"tiktoken-rs",
"tokenizers",
"tracing",
]
[[package]] [[package]]
name = "dynamo-tokens" name = "dynamo-tokens"
version = "1.1.0" version = "1.1.0"
......
...@@ -6,6 +6,7 @@ members = [ ...@@ -6,6 +6,7 @@ members = [
"lib/llm", "lib/llm",
"lib/runtime", "lib/runtime",
"lib/config", "lib/config",
"lib/tokenizers",
"lib/tokens", "lib/tokens",
"lib/mocker", "lib/mocker",
"lib/kv-router", "lib/kv-router",
...@@ -39,6 +40,7 @@ keywords = ["llm", "genai", "inference", "nvidia", "distributed"] ...@@ -39,6 +40,7 @@ keywords = ["llm", "genai", "inference", "nvidia", "distributed"]
dynamo-runtime = { path = "lib/runtime", version = "1.1.0" } dynamo-runtime = { path = "lib/runtime", version = "1.1.0" }
dynamo-llm = { path = "lib/llm", version = "1.1.0" } dynamo-llm = { path = "lib/llm", version = "1.1.0" }
dynamo-config = { path = "lib/config", version = "1.1.0" } dynamo-config = { path = "lib/config", version = "1.1.0" }
dynamo-tokenizers = { path = "lib/tokenizers", version = "1.1.0" }
dynamo-tokens = { path = "lib/tokens", version = "1.1.0" } dynamo-tokens = { path = "lib/tokens", version = "1.1.0" }
dynamo-memory = { path = "lib/memory", version = "1.1.0" } dynamo-memory = { path = "lib/memory", version = "1.1.0" }
dynamo-mocker = { path = "lib/mocker", version = "1.1.0" } dynamo-mocker = { path = "lib/mocker", version = "1.1.0" }
......
...@@ -1554,9 +1554,9 @@ dependencies = [ ...@@ -1554,9 +1554,9 @@ dependencies = [
"dynamo-parsers", "dynamo-parsers",
"dynamo-protocols", "dynamo-protocols",
"dynamo-runtime", "dynamo-runtime",
"dynamo-tokenizers",
"dynamo-tokens", "dynamo-tokens",
"either", "either",
"fastokens",
"flate2", "flate2",
"futures", "futures",
"futures-util", "futures-util",
...@@ -1581,7 +1581,6 @@ dependencies = [ ...@@ -1581,7 +1581,6 @@ dependencies = [
"rayon", "rayon",
"reqwest", "reqwest",
"rmp-serde", "rmp-serde",
"rustc-hash 1.1.0",
"rustls", "rustls",
"serde", "serde",
"serde_json", "serde_json",
...@@ -1589,7 +1588,6 @@ dependencies = [ ...@@ -1589,7 +1588,6 @@ dependencies = [
"strum", "strum",
"tempfile", "tempfile",
"thiserror 2.0.18", "thiserror 2.0.18",
"tiktoken-rs",
"tmq", "tmq",
"tokenizers", "tokenizers",
"tokio", "tokio",
...@@ -1752,6 +1750,23 @@ dependencies = [ ...@@ -1752,6 +1750,23 @@ dependencies = [
"xxhash-rust", "xxhash-rust",
] ]
[[package]]
name = "dynamo-tokenizers"
version = "1.1.0"
dependencies = [
"anyhow",
"base64 0.22.1",
"fastokens",
"rayon",
"rustc-hash 1.1.0",
"serde",
"serde_json",
"strum",
"tiktoken-rs",
"tokenizers",
"tracing",
]
[[package]] [[package]]
name = "dynamo-tokens" name = "dynamo-tokens"
version = "1.1.0" version = "1.1.0"
......
...@@ -1566,9 +1566,9 @@ dependencies = [ ...@@ -1566,9 +1566,9 @@ dependencies = [
"dynamo-parsers", "dynamo-parsers",
"dynamo-protocols", "dynamo-protocols",
"dynamo-runtime", "dynamo-runtime",
"dynamo-tokenizers",
"dynamo-tokens", "dynamo-tokens",
"either", "either",
"fastokens",
"ffmpeg-next", "ffmpeg-next",
"flate2", "flate2",
"futures", "futures",
...@@ -1595,7 +1595,6 @@ dependencies = [ ...@@ -1595,7 +1595,6 @@ dependencies = [
"rayon", "rayon",
"reqwest", "reqwest",
"rmp-serde", "rmp-serde",
"rustc-hash 1.1.0",
"rustls", "rustls",
"serde", "serde",
"serde_json", "serde_json",
...@@ -1603,7 +1602,6 @@ dependencies = [ ...@@ -1603,7 +1602,6 @@ dependencies = [
"strum", "strum",
"tempfile", "tempfile",
"thiserror 2.0.18", "thiserror 2.0.18",
"tiktoken-rs",
"tmq", "tmq",
"tokenizers", "tokenizers",
"tokio", "tokio",
...@@ -1799,6 +1797,23 @@ dependencies = [ ...@@ -1799,6 +1797,23 @@ dependencies = [
"xxhash-rust", "xxhash-rust",
] ]
[[package]]
name = "dynamo-tokenizers"
version = "1.1.0"
dependencies = [
"anyhow",
"base64 0.22.1",
"fastokens",
"rayon",
"rustc-hash 1.1.0",
"serde",
"serde_json",
"strum",
"tiktoken-rs",
"tokenizers",
"tracing",
]
[[package]] [[package]]
name = "dynamo-tokens" name = "dynamo-tokens"
version = "1.1.0" version = "1.1.0"
......
...@@ -57,6 +57,7 @@ dynamo-kv-router = { workspace = true, features = ["metrics", "runtime-protocols ...@@ -57,6 +57,7 @@ dynamo-kv-router = { workspace = true, features = ["metrics", "runtime-protocols
dynamo-memory = { workspace = true } dynamo-memory = { workspace = true }
dynamo-mocker = { workspace = true } dynamo-mocker = { workspace = true }
dynamo-runtime = { workspace = true } dynamo-runtime = { workspace = true }
dynamo-tokenizers = { workspace = true }
dynamo-tokens = { workspace = true } dynamo-tokens = { workspace = true }
# workspace # workspace
...@@ -150,9 +151,6 @@ tokenizers = { version = "0.21.4", default-features = false, features = [ ...@@ -150,9 +151,6 @@ tokenizers = { version = "0.21.4", default-features = false, features = [
"esaxx_fast", "esaxx_fast",
"rustls-tls", "rustls-tls",
] } ] }
tiktoken-rs = { version = "0.9", default-features = false }
rustc-hash = "1.1"
fastokens = { workspace = true }
# backend # backend
galil-seiferas = { version = "0.1" } galil-seiferas = { version = "0.1" }
......
...@@ -6,10 +6,6 @@ ...@@ -6,10 +6,6 @@
//! The `dynamo.llm` crate is a Rust library that provides a set of traits and types for building //! The `dynamo.llm` crate is a Rust library that provides a set of traits and types for building
//! distributed LLM inference solutions. //! distributed LLM inference solutions.
use std::{fs::File, io::BufReader, path::Path};
use anyhow::Context as _;
pub mod backend; pub mod backend;
pub mod common; pub mod common;
pub mod discovery; pub mod discovery;
...@@ -35,7 +31,8 @@ pub mod preprocessor; ...@@ -35,7 +31,8 @@ pub mod preprocessor;
pub mod protocols; pub mod protocols;
pub mod recorder; pub mod recorder;
pub mod request_template; pub mod request_template;
pub mod tokenizers; pub use dynamo_tokenizers as tokenizers;
pub use dynamo_tokenizers::{file_json_field, log_json_err};
pub mod tokens; pub mod tokens;
pub mod types; pub mod types;
pub mod utils; pub mod utils;
...@@ -46,111 +43,6 @@ pub mod block_manager; ...@@ -46,111 +43,6 @@ pub mod block_manager;
#[cfg(feature = "cuda")] #[cfg(feature = "cuda")]
pub mod cuda; pub mod cuda;
/// Loads a single top-level field from a JSON file and deserializes it as `T`.
///
/// # Arguments
///
/// * `json_file_path`: Location of the JSON document on disk.
/// * `field_name`: Key to look up in the document's root object.
///
/// # Errors
///
/// Returns an `anyhow::Error` when the file cannot be opened, its contents are
/// not valid JSON, the root value is not an object, the field is absent, or the
/// field's value cannot be deserialized into `T`.
///
/// # Type Parameters
///
/// * `T`: Target type for the field's value; must implement `serde::de::DeserializeOwned`.
pub fn file_json_field<T: serde::de::DeserializeOwned>(
    json_file_path: &Path,
    field_name: &str,
) -> anyhow::Result<T> {
    // Open the file behind a buffered reader.
    let reader = File::open(json_file_path)
        .map(BufReader::new)
        .with_context(|| format!("Failed to open file: {:?}", json_file_path))?;

    // Parse into a generic `Value`: only one field is wanted, so `T` does not
    // have to describe the whole document.
    let document: serde_json::Value = serde_json::from_reader(reader)
        .with_context(|| format!("Failed to parse JSON from file: {:?}", json_file_path))?;

    // The root must be an object and must contain the requested key.
    let entry = match &document {
        serde_json::Value::Object(fields) => match fields.get(field_name) {
            Some(value) => value,
            None => {
                return Err(anyhow::anyhow!(
                    "Field '{}' not found in JSON file: {:?}",
                    field_name,
                    json_file_path
                ));
            }
        },
        _ => {
            return Err(anyhow::anyhow!(
                "JSON root is not an object in file: {:?}",
                json_file_path
            ));
        }
    };

    // `from_value` takes ownership, so hand it a copy and keep the borrowed
    // value around for the error message.
    serde_json::from_value(entry.clone()).with_context(|| {
        format!(
            "Failed to deserialize field '{}' (value: {:?}) to the expected type from file: {:?}",
            field_name, entry, json_file_path
        )
    })
}
/// Pretty-print the part of JSON that has an error.
///
/// Logs (via `tracing::error!`) the offending line, up to two lines of context
/// on either side, and a caret line pointing at the reported column.
///
/// # Arguments
///
/// * `filename`: Name used in the log message to identify the JSON source.
/// * `json`: The full JSON text that failed to parse.
/// * `err`: The `serde_json` error whose line/column locate the problem.
///
/// Errors that are neither syntax nor data errors are ignored.
pub fn log_json_err(filename: &str, json: &str, err: &serde_json::Error) {
    const ERROR_PREFIX: &str = ">> ";

    // Only log errors that relate to the content of the JSON file
    if !(err.is_syntax() || err.is_data()) {
        return;
    }
    // These are 1 based for humans so subtract
    let line = err.line().saturating_sub(1);
    let column = err.column().saturating_sub(1);

    let json_lines: Vec<&str> = json.lines().collect();
    if json_lines.is_empty() {
        tracing::error!("JSON parsing error in {filename}: File is empty.");
        return;
    }

    // Two lines before. `line` is unsigned, so a plain `line - 2` underflows
    // when the error is on one of the first two lines; saturate at zero instead.
    let start_index = line.saturating_sub(2);
    // The problem line and two lines after, clamped to the end of the input.
    let end_index = line.saturating_add(3).min(json_lines.len());

    // Collect the context
    let mut context_lines: Vec<String> = (start_index..end_index)
        .map(|i| {
            if i == line {
                format!("{ERROR_PREFIX}{}", json_lines[i])
            } else {
                // Six places because tokenizer.json is very long
                format!("{:06} {}", i + 1, json_lines[i])
            }
        })
        .collect();

    // Insert the column indicator directly below the problem line.
    let col_indicator = "_".repeat(column + ERROR_PREFIX.len()) + "^";
    // Cannot underflow: `start_index <= line` by construction.
    let error_in_context_idx = line - start_index;
    if error_in_context_idx < context_lines.len() {
        context_lines.insert(error_in_context_idx + 1, col_indicator);
    }

    tracing::error!(
        "JSON parsing error in {filename}: Line {}, column {}:\n{}",
        err.line(),
        err.column(),
        context_lines.join("\n")
    );
}
#[cfg(test)] #[cfg(test)]
mod file_json_field_tests { mod file_json_field_tests {
use super::file_json_field; use super::file_json_field;
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
[package]
name = "dynamo-tokenizers"
version.workspace = true
edition.workspace = true
description = "Standalone tokenizer implementations for Dynamo"
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
anyhow = { workspace = true }
fastokens = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
strum = { workspace = true }
tracing = { workspace = true }
base64 = { version = "0.22" }
rayon = "1"
rustc-hash = "1.1"
tiktoken-rs = { version = "0.9", default-features = false }
tokenizers = { version = "0.21.4", default-features = false, features = [
"onig",
"esaxx_fast",
"rustls-tls",
] }
[dev-dependencies]
tempfile = { workspace = true }
# Tokenizers # Tokenizers
## Introduction ## Introduction
`tokenizers` is designed for efficient and versatile tokenization in natural language processing. It supports both HuggingFace models, offering a streamlined API for text encoding and decoding. `dynamo-tokenizers` provides efficient, versatile tokenization for NLP workloads. It supports HuggingFace and TikToken tokenizers (plus a FastTokenizer hybrid mode) through a streamlined encoding/decoding API.
## Features ## Features
- **Hash Verification**: Ensures tokenization consistency and accuracy across different models. - **Hash Verification**: Ensures tokenization consistency and accuracy across different models.
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#### HuggingFace Tokenizer #### HuggingFace Tokenizer
```rust ```rust
use dynamo_llm::tokenizers::hf::HuggingFaceTokenizer; use dynamo_tokenizers::hf::HuggingFaceTokenizer;
let hf_tokenizer = HuggingFaceTokenizer::from_file("tests/data/sample-models/TinyLlama_v1.1/tokenizer.json") let hf_tokenizer = HuggingFaceTokenizer::from_file("tests/data/sample-models/TinyLlama_v1.1/tokenizer.json")
.expect("Failed to load HuggingFace tokenizer"); .expect("Failed to load HuggingFace tokenizer");
...@@ -21,7 +21,7 @@ let hf_tokenizer = HuggingFaceTokenizer::from_file("tests/data/sample-models/Tin ...@@ -21,7 +21,7 @@ let hf_tokenizer = HuggingFaceTokenizer::from_file("tests/data/sample-models/Tin
### Encoding and Decoding Text ### Encoding and Decoding Text
```rust ```rust
use dynamo_llm::tokenizers::{HuggingFaceTokenizer, traits::{Encoder, Decoder}}; use dynamo_tokenizers::{HuggingFaceTokenizer, traits::{Encoder, Decoder}};
let tokenizer = HuggingFaceTokenizer::from_file("tests/data/sample-models/TinyLlama_v1.1/tokenizer.json") let tokenizer = HuggingFaceTokenizer::from_file("tests/data/sample-models/TinyLlama_v1.1/tokenizer.json")
.expect("Failed to load HuggingFace tokenizer"); .expect("Failed to load HuggingFace tokenizer");
...@@ -39,7 +39,7 @@ assert_eq!(text, decoded_text); ...@@ -39,7 +39,7 @@ assert_eq!(text, decoded_text);
// Using the Sequence object for encoding and decoding // Using the Sequence object for encoding and decoding
use dynamo_llm::tokenizers::{Sequence, Tokenizer}; use dynamo_tokenizers::{Sequence, Tokenizer};
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
let tokenizer = Tokenizer::from(Arc::new(tokenizer)); let tokenizer = Tokenizer::from(Arc::new(tokenizer));
......
...@@ -62,13 +62,13 @@ impl Tokenizer for FastTokenizer {} ...@@ -62,13 +62,13 @@ impl Tokenizer for FastTokenizer {}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::tokenizers::HuggingFaceTokenizer; use crate::HuggingFaceTokenizer;
// Minimal synthetic BPE tokenizer with no normalizer or post-processor -- // Minimal synthetic BPE tokenizer with no normalizer or post-processor --
// compatible with fastokens. Vocab covers: H,T,a,d,e,h,i,l,o,r,s,t,w + punctuation. // compatible with fastokens. Vocab covers: H,T,a,d,e,h,i,l,o,r,s,t,w + punctuation.
const TOKENIZER_PATH: &str = concat!( const TOKENIZER_PATH: &str = concat!(
env!("CARGO_MANIFEST_DIR"), env!("CARGO_MANIFEST_DIR"),
"/tests/data/sample-models/minimal-bpe/tokenizer.json" "/../llm/tests/data/sample-models/minimal-bpe/tokenizer.json"
); );
#[test] #[test]
...@@ -124,7 +124,7 @@ mod tests { ...@@ -124,7 +124,7 @@ mod tests {
#[test] #[test]
fn test_fast_with_decode_stream() { fn test_fast_with_decode_stream() {
use crate::tokenizers::Tokenizer as TokenizerWrapper; use crate::Tokenizer as TokenizerWrapper;
use std::sync::Arc; use std::sync::Arc;
let tokenizer = Arc::new(FastTokenizer::from_file(TOKENIZER_PATH).unwrap()); let tokenizer = Arc::new(FastTokenizer::from_file(TOKENIZER_PATH).unwrap());
......
...@@ -11,9 +11,9 @@ pub mod tiktoken; ...@@ -11,9 +11,9 @@ pub mod tiktoken;
use std::hash::{DefaultHasher, Hash, Hasher}; use std::hash::{DefaultHasher, Hash, Hasher};
use std::sync::Arc; use std::sync::Arc;
use std::{ops::Deref, path::Path}; use std::{fs::File, io::BufReader, ops::Deref, path::Path};
use crate::protocols::TokenIdType; use anyhow::Context as _;
pub use anyhow::{Error, Result}; pub use anyhow::{Error, Result};
pub use fastokens::FastTokenizer; pub use fastokens::FastTokenizer;
...@@ -21,6 +21,8 @@ pub use hf::HuggingFaceTokenizer; ...@@ -21,6 +21,8 @@ pub use hf::HuggingFaceTokenizer;
pub use tiktoken::TikTokenTokenizer; pub use tiktoken::TikTokenTokenizer;
pub use traits::DecodeResult; pub use traits::DecodeResult;
pub type TokenIdType = u32;
/// Represents the type of tokenizer being used /// Represents the type of tokenizer being used
#[derive(Debug)] #[derive(Debug)]
pub enum TokenizerType { pub enum TokenizerType {
...@@ -131,6 +133,80 @@ pub mod traits { ...@@ -131,6 +133,80 @@ pub mod traits {
} }
} }
/// Reads a JSON file, extracts one top-level field, and deserializes it into `T`.
///
/// # Arguments
///
/// * `json_file_path`: Path to the JSON file.
/// * `field_name`: Name of the field to extract from the root JSON object.
///
/// # Errors
///
/// Returns an `anyhow::Error` if the file cannot be opened, the contents are not
/// valid JSON, the root is not an object, the field is missing, or the field's
/// value cannot be deserialized into `T`.
pub fn file_json_field<T: serde::de::DeserializeOwned>(
    json_file_path: &Path,
    field_name: &str,
) -> anyhow::Result<T> {
    let file = File::open(json_file_path)
        .with_context(|| format!("Failed to open file: {:?}", json_file_path))?;
    let reader = BufReader::new(file);

    // Parse into a generic `Value` first: only one field is wanted, so `T`
    // does not have to describe the whole document.
    let json_data: serde_json::Value = serde_json::from_reader(reader)
        .with_context(|| format!("Failed to parse JSON from file: {:?}", json_file_path))?;

    let map = json_data.as_object().ok_or_else(|| {
        anyhow::anyhow!("JSON root is not an object in file: {:?}", json_file_path)
    })?;

    let field_value = map.get(field_name).ok_or_else(|| {
        anyhow::anyhow!(
            "Field '{}' not found in JSON file: {:?}",
            field_name,
            json_file_path
        )
    })?;

    // `&serde_json::Value` is itself a `Deserializer`, so deserialize straight
    // from the borrowed value instead of cloning it for `from_value`. The value
    // stays available for the error message below.
    <T as serde::Deserialize>::deserialize(field_value).with_context(|| {
        format!(
            "Failed to deserialize field '{}' (value: {:?}) to the expected type from file: {:?}",
            field_name, field_value, json_file_path
        )
    })
}
/// Logs a JSON parse error together with the offending line and its neighbours.
///
/// The problem line is prefixed with `>> `, surrounding lines carry a
/// zero-padded line number, and a caret line marks the reported column.
/// Only syntax and data errors are reported; anything else returns silently.
pub fn log_json_err(filename: &str, json: &str, err: &serde_json::Error) {
    const ERROR_PREFIX: &str = ">> ";

    let content_related = err.is_syntax() || err.is_data();
    if !content_related {
        return;
    }

    // serde_json reports 1-based positions; convert to 0-based indices.
    let error_row = err.line().saturating_sub(1);
    let error_col = err.column().saturating_sub(1);

    let rows: Vec<&str> = json.lines().collect();
    if rows.is_empty() {
        tracing::error!("JSON parsing error in {filename}: File is empty.");
        return;
    }

    // Window: two rows before the error through two rows after it.
    let first_row = error_row.saturating_sub(2);
    let last_row = error_row.saturating_add(3).min(rows.len());
    let window_len = last_row.saturating_sub(first_row);

    let mut snippet: Vec<String> = rows
        .iter()
        .enumerate()
        .skip(first_row)
        .take(window_len)
        .map(|(row, text)| {
            if row == error_row {
                format!("{ERROR_PREFIX}{}", text)
            } else {
                format!("{:06} {}", row + 1, text)
            }
        })
        .collect();

    // Build the caret line and place it right under the error row, if that
    // row made it into the window.
    let mut marker = "_".repeat(error_col + ERROR_PREFIX.len());
    marker.push('^');
    let offset = error_row - first_row;
    if offset < snippet.len() {
        snippet.insert(offset + 1, marker);
    }

    tracing::error!(
        "JSON parsing error in {filename}: Line {}, column {}:\n{}",
        err.line(),
        err.column(),
        snippet.join("\n")
    );
}
impl Encoding { impl Encoding {
pub fn get_hash(&self) -> u64 { pub fn get_hash(&self) -> u64 {
let mut hasher = DefaultHasher::new(); let mut hasher = DefaultHasher::new();
......
...@@ -172,7 +172,7 @@ fn detect_bpe_pattern(directory: &Path) -> Result<&'static str> { ...@@ -172,7 +172,7 @@ fn detect_bpe_pattern(directory: &Path) -> Result<&'static str> {
_ => Err(Error::msg(format!( _ => Err(Error::msg(format!(
"Unsupported tiktoken model_type '{model_type}'. \ "Unsupported tiktoken model_type '{model_type}'. \
Currently supported: kimi, kimi_k2, kimi_k25, deepseek_v3. \ Currently supported: kimi, kimi_k2, kimi_k25, deepseek_v3. \
To add a new model type, extend detect_bpe_pattern() in tokenizers/tiktoken.rs \ To add a new model type, extend detect_bpe_pattern() in lib/tokenizers/src/tiktoken.rs \
with the appropriate BPE regex pattern. \ with the appropriate BPE regex pattern. \
Alternatively, provide a tokenizer.json (HuggingFace format) instead." Alternatively, provide a tokenizer.json (HuggingFace format) instead."
))), ))),
...@@ -249,7 +249,7 @@ fn load_special_tokens(directory: &Path, num_base_tokens: usize) -> Result<FxHas ...@@ -249,7 +249,7 @@ fn load_special_tokens(directory: &Path, num_base_tokens: usize) -> Result<FxHas
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::tokenizers::DecodeStream; use crate::DecodeStream;
use std::io::Write; use std::io::Write;
use std::sync::Arc; use std::sync::Arc;
...@@ -643,7 +643,7 @@ mod tests { ...@@ -643,7 +643,7 @@ mod tests {
fn test_decode_stream_incremental_multibyte_reassembly() { fn test_decode_stream_incremental_multibyte_reassembly() {
let dir = tempfile::tempdir().unwrap(); let dir = tempfile::tempdir().unwrap();
let tokenizer = create_byte_token_tokenizer(dir.path()); let tokenizer = create_byte_token_tokenizer(dir.path());
let tokenizer_arc: Arc<dyn crate::tokenizers::traits::Tokenizer> = Arc::new(tokenizer); let tokenizer_arc: Arc<dyn crate::traits::Tokenizer> = Arc::new(tokenizer);
let mut stream = DecodeStream::new(tokenizer_arc, &[5], false); let mut stream = DecodeStream::new(tokenizer_arc, &[5], false);
...@@ -663,7 +663,7 @@ mod tests { ...@@ -663,7 +663,7 @@ mod tests {
fn test_decode_stream_incremental_emoji_reassembly() { fn test_decode_stream_incremental_emoji_reassembly() {
let dir = tempfile::tempdir().unwrap(); let dir = tempfile::tempdir().unwrap();
let tokenizer = create_byte_token_tokenizer(dir.path()); let tokenizer = create_byte_token_tokenizer(dir.path());
let tokenizer_arc: Arc<dyn crate::tokenizers::traits::Tokenizer> = Arc::new(tokenizer); let tokenizer_arc: Arc<dyn crate::traits::Tokenizer> = Arc::new(tokenizer);
let mut stream = DecodeStream::new(tokenizer_arc, &[5], false); let mut stream = DecodeStream::new(tokenizer_arc, &[5], false);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment