use anyhow::Result;

/// Core encoding trait - separate from decoding for modularity
pub trait Encoder: Send + Sync {
    fn encode(&self, input: &str) -> Result<Encoding>;
    fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>>;
}

/// Core decoding trait - can be implemented independently
pub trait Decoder: Send + Sync {
    fn decode(&self, token_ids: &[u32], skip_special_tokens: bool) -> Result<String>;
}

/// Combined tokenizer trait
pub trait Tokenizer: Encoder + Decoder {
    fn vocab_size(&self) -> usize;
    fn get_special_tokens(&self) -> &SpecialTokens;
    fn token_to_id(&self, token: &str) -> Option<u32>;
    fn id_to_token(&self, id: u32) -> Option<String>;
}

/// The result of tokenizing text. The Hugging Face variant carries token IDs,
/// string tokens, and their spans; the SentencePiece and Tiktoken variants
/// carry raw token IDs only.
#[derive(Debug, Clone)]
pub enum Encoding {
    /// Hugging Face
    Hf(Box<tokenizers::Encoding>),
    /// SentencePiece
    Sp(Vec<u32>),
    /// Tiktoken (for GPT models)
    Tiktoken(Vec<usize>),
}

impl Encoding {
    /// Collects the token IDs into an owned `Vec<u32>`, converting as needed.
    pub fn token_ids(&self) -> Vec<u32> {
        match self {
            Encoding::Hf(inner) => inner.get_ids().to_vec(),
            Encoding::Sp(inner) => inner.clone(),
            Encoding::Tiktoken(inner) => inner.iter().map(|&id| id as u32).collect(),
        }
    }

    /// Borrows the token IDs without copying, where the backend allows it.
    pub fn token_ids_ref(&self) -> &[u32] {
        match self {
            Encoding::Hf(inner) => inner.get_ids(),
            Encoding::Sp(inner) => inner,
            Encoding::Tiktoken(_) => {
                // Tiktoken stores `usize`, so we cannot return a `&[u32]`
                // without converting. Callers should use `token_ids()` for
                // the Tiktoken variant instead.
                &[]
            }
        }
    }
}

#[derive(Debug, Clone)]
pub struct SpecialTokens {
    pub bos_token: Option<String>,
    pub eos_token: Option<String>,
    pub unk_token: Option<String>,
    pub sep_token: Option<String>,
    pub pad_token: Option<String>,
    pub cls_token: Option<String>,
    pub mask_token: Option<String>,
    pub additional_special_tokens: Vec<String>,
}
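
// A minimal sketch of a backend implementing the split traits, assuming the
// `Sp` variant is acceptable as a plain-ID carrier. `WhitespaceTokenizer`,
// its fixed vocab, and the id-0 `<unk>` convention below are illustrative
// test fixtures, not part of the real crate.
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// Toy tokenizer: splits on whitespace, maps unknown words to id 0.
    struct WhitespaceTokenizer {
        vocab: HashMap<String, u32>,
        reverse: HashMap<u32, String>,
        special: SpecialTokens,
    }

    impl WhitespaceTokenizer {
        fn new(words: &[&str]) -> Self {
            let mut vocab = HashMap::new();
            let mut reverse = HashMap::new();
            // Reserve id 0 for the unknown token.
            vocab.insert("<unk>".to_string(), 0);
            reverse.insert(0, "<unk>".to_string());
            for (i, w) in words.iter().enumerate() {
                let id = (i + 1) as u32;
                vocab.insert((*w).to_string(), id);
                reverse.insert(id, (*w).to_string());
            }
            let special = SpecialTokens {
                bos_token: None,
                eos_token: None,
                unk_token: Some("<unk>".to_string()),
                sep_token: None,
                pad_token: None,
                cls_token: None,
                mask_token: None,
                additional_special_tokens: Vec::new(),
            };
            Self { vocab, reverse, special }
        }
    }

    impl Encoder for WhitespaceTokenizer {
        fn encode(&self, input: &str) -> Result<Encoding> {
            let ids = input
                .split_whitespace()
                .map(|w| self.vocab.get(w).copied().unwrap_or(0))
                .collect();
            Ok(Encoding::Sp(ids))
        }

        fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>> {
            inputs.iter().map(|s| self.encode(s)).collect()
        }
    }

    impl Decoder for WhitespaceTokenizer {
        fn decode(&self, token_ids: &[u32], skip_special_tokens: bool) -> Result<String> {
            let words: Vec<&str> = token_ids
                .iter()
                .filter(|&&id| !(skip_special_tokens && id == 0))
                .filter_map(|id| self.reverse.get(id).map(String::as_str))
                .collect();
            Ok(words.join(" "))
        }
    }

    impl Tokenizer for WhitespaceTokenizer {
        fn vocab_size(&self) -> usize {
            self.vocab.len()
        }
        fn get_special_tokens(&self) -> &SpecialTokens {
            &self.special
        }
        fn token_to_id(&self, token: &str) -> Option<u32> {
            self.vocab.get(token).copied()
        }
        fn id_to_token(&self, id: u32) -> Option<String> {
            self.reverse.get(&id).cloned()
        }
    }

    #[test]
    fn round_trip() {
        let tok = WhitespaceTokenizer::new(&["hello", "world"]);
        let enc = tok.encode("hello world").unwrap();
        assert_eq!(enc.token_ids(), vec![1, 2]);
        // `token_ids_ref` avoids a copy for the `Sp` backend.
        assert_eq!(enc.token_ids_ref(), &[1, 2]);
        assert_eq!(tok.decode(&enc.token_ids(), true).unwrap(), "hello world");
    }
}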