// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //! Dataset-driven benchmarks //! //! Downloads a real dataset (LongBench-v2) from HuggingFace Hub and benchmarks //! per-sample encode throughput with correctness verification. //! //! This benchmark is opt-in: set RUN_BENCH=1 to run it. Without that variable //! it exits immediately so that `cargo test --all-targets` in CI is not affected. //! //! Run: //! RUN_BENCH=1 cargo bench --bench tokenizer_dataset //! //! Override tokenizer (default: Qwen/Qwen3-0.6B): //! RUN_BENCH=1 TOKENIZER_PATH=deepseek-ai/DeepSeek-V3 cargo bench --bench tokenizer_dataset //! //! Override dataset and sample count (default: 503): //! RUN_BENCH=1 DATASET=RyokoAI/ShareGPT52K MAX_SAMPLES=50 cargo bench --bench tokenizer_dataset //! //! Batch benchmark (default: sequential): //! RUN_BENCH=1 BATCH_SIZE=64 cargo bench --bench tokenizer_dataset use std::path::Path; use std::time::{Duration, Instant}; use dynamo_llm::tokenizers::{FastTokenizer, HuggingFaceTokenizer, traits::Encoder}; /// Default HuggingFace model for the tokenizer. const DEFAULT_HF_MODEL: &str = "Qwen/Qwen3-0.6B"; /// Default dataset on HuggingFace Hub. const DEFAULT_DATASET: &str = "zai-org/LongBench-v2"; /// Default number of samples when MAX_SAMPLES is not set. const DEFAULT_MAX_SAMPLES: usize = 503; /// Resolve tokenizer path: local file, HF model name, or default. 
/// Returns the path of the `tokenizer.json` file to benchmark with, as a `String`.
///
/// Resolution order:
/// 1. If the `TOKENIZER_PATH` environment variable names an existing local
///    file, that value is returned unchanged.
/// 2. Otherwise `TOKENIZER_PATH` (or `DEFAULT_HF_MODEL` when it is unset) is
///    treated as a HuggingFace model name and `tokenizer.json` is requested
///    from that model repository through `hf_hub`.
///
/// # Panics
///
/// Panics if the HuggingFace API client cannot be built or `tokenizer.json`
/// cannot be obtained.
fn resolve_tokenizer_path() -> String {
    let input = std::env::var("TOKENIZER_PATH").ok();

    // A value that points at an existing file is used as-is.
    if let Some(local) = input.as_deref().filter(|p| Path::new(p).is_file()) {
        eprintln!("[setup] Using local tokenizer: {local}");
        return local.to_string();
    }

    // Anything else (including an unset variable) is looked up as a model name.
    let model_name = input.as_deref().unwrap_or(DEFAULT_HF_MODEL);
    eprintln!("[setup] Downloading tokenizer for {model_name}...");

    let cache = hf_hub::Cache::default();
    let api = hf_hub::api::sync::ApiBuilder::from_cache(cache)
        .with_progress(true)
        .build()
        .expect("Failed to create HuggingFace API client");
    let repo = api.model(model_name.to_string());
    let path = repo
        .get("tokenizer.json")
        .expect("Failed to download tokenizer.json");

    let path_str = path.display().to_string();
    eprintln!("[setup] Tokenizer: {path_str}");
    path_str
}

/// Return the JSON filename for a known HuggingFace Hub dataset.
///
/// # Panics
///
/// Panics when `dataset` is not one of the two supported dataset names.
fn dataset_json_file(dataset: &str) -> &'static str {
    match dataset {
        "RyokoAI/ShareGPT52K" => "sg_90k_part1.json",
        "zai-org/LongBench-v2" => "data.json",
        _ => panic!(
            "Unknown dataset: {dataset}. Supported: zai-org/LongBench-v2, RyokoAI/ShareGPT52K"
        ),
    }
}

/// Extract a text sample from a single JSON item.
///
/// * `RyokoAI/ShareGPT52K`: every entry of the item's `conversations` array
///   that has a string `from` and a non-empty string `value` is rendered as
///   `[from]: value`; the rendered entries are joined with a blank line.
/// * `zai-org/LongBench-v2`: the item's `context` string is returned.
///
/// Returns `None` when the expected fields are missing or of the wrong type,
/// when the resulting text would be empty, or when `dataset` is not one of
/// the names above.
fn extract_text(dataset: &str, item: &serde_json::Value) -> Option<String> {
    match dataset {
        "RyokoAI/ShareGPT52K" => {
            let messages = item.get("conversations")?.as_array()?;
            let parts: Vec<String> = messages
                .iter()
                .filter_map(|msg| {
                    let role = msg.get("from")?.as_str()?;
                    let value = msg.get("value")?.as_str()?;
                    // Turns without content are dropped from the sample.
                    if value.is_empty() {
                        return None;
                    }
                    Some(format!("[{role}]: {value}"))
                })
                .collect();
            if parts.is_empty() {
                return None;
            }
            Some(parts.join("\n\n"))
        }
        "zai-org/LongBench-v2" => {
            let context = item.get("context")?.as_str()?;
            if context.is_empty() {
                return None;
            }
            Some(context.to_string())
        }
        _ => None,
    }
}

/// Load text samples from a HuggingFace Hub dataset.
fn load_dataset(dataset: &str, max_items: usize) -> Vec { let json_file = dataset_json_file(dataset); eprintln!("[setup] Downloading dataset {dataset}..."); let api = hf_hub::api::sync::Api::new().expect("Failed to create HuggingFace API client"); let repo = api.dataset(dataset.to_string()); let json_path = repo.get(json_file).expect("Failed to download dataset"); let text = std::fs::read_to_string(&json_path).expect("Failed to read dataset JSON"); let data: Vec = serde_json::from_str(&text).expect("Failed to parse dataset JSON"); let samples: Vec = data .iter() .take(max_items) .filter_map(|item| extract_text(dataset, item)) .collect(); eprintln!("[setup] Loaded {} samples", samples.len()); samples } fn print_summary( label: &str, n: usize, total_chars: u64, total_tokens: u64, total_hf: Duration, total_ft: Duration, ) { let hf_ms = total_hf.as_secs_f64() * 1000.0; let ft_ms = total_ft.as_secs_f64() * 1000.0; let speedup = hf_ms / ft_ms; let nf = n as f64; println!(); println!("=== {label} ({n} samples) ==="); println!(" Total chars: {total_chars}"); println!(" Total tokens: {total_tokens}"); println!(" ---"); println!(" HF total: {hf_ms:>10.2} ms"); println!(" fastokens total: {ft_ms:>10.2} ms"); println!(" Speedup: {speedup:>10.2}x"); println!(" ---"); println!(" HF avg/sample: {:>10.3} ms", hf_ms / nf); println!(" ft avg/sample: {:>10.3} ms", ft_ms / nf); println!( " HF throughput: {:>10.2} MB/s", total_chars as f64 / total_hf.as_secs_f64() / 1_000_000.0 ); println!( " ft throughput: {:>10.2} MB/s", total_chars as f64 / total_ft.as_secs_f64() / 1_000_000.0 ); } fn bench_sequential(samples: &[String], hf: &HuggingFaceTokenizer, fast: &FastTokenizer) { let mut total_hf = Duration::ZERO; let mut total_ft = Duration::ZERO; let mut total_tokens: u64 = 0; let mut total_chars: u64 = 0; let mut mismatches = 0u64; for (i, text) in samples.iter().enumerate() { let t0 = Instant::now(); let hf_enc = hf.encode(text).expect("HF encode failed"); let t1 = Instant::now(); let 
ft_enc = fast.encode(text).expect("fastokens encode failed"); let t2 = Instant::now(); let dt_hf = t1 - t0; let dt_ft = t2 - t1; if hf_enc.token_ids() != ft_enc.token_ids() { mismatches += 1; if mismatches <= 3 { eprintln!( "[MISMATCH] sample {i}: hf={} tokens, ft={} tokens", hf_enc.token_ids().len(), ft_enc.token_ids().len() ); } } total_hf += dt_hf; total_ft += dt_ft; total_tokens += ft_enc.token_ids().len() as u64; total_chars += text.len() as u64; if (i + 1) % 20 == 0 { eprintln!("[progress] {}/{}", i + 1, samples.len()); } } if mismatches > 0 { eprintln!("[WARNING] {mismatches} samples had mismatched token IDs"); } else { eprintln!("[OK] All samples produced identical token IDs"); } print_summary( "Sequential Benchmark", samples.len(), total_chars, total_tokens, total_hf, total_ft, ); } fn bench_batched( samples: &[String], hf: &HuggingFaceTokenizer, fast: &FastTokenizer, batch_size: usize, ) { let mut total_hf = Duration::ZERO; let mut total_ft = Duration::ZERO; let mut total_tokens: u64 = 0; let mut total_chars: u64 = 0; let mut mismatches = 0u64; let num_batches = samples.len().div_ceil(batch_size); for (batch_idx, batch) in samples.chunks(batch_size).enumerate() { let batch_refs: Vec<&str> = batch.iter().map(|s| s.as_str()).collect(); let batch_chars: u64 = batch.iter().map(|s| s.len() as u64).sum(); let t0 = Instant::now(); let hf_results = hf .encode_batch(&batch_refs) .expect("HF encode_batch failed"); let t1 = Instant::now(); let ft_results = fast .encode_batch(&batch_refs) .expect("fastokens encode_batch failed"); let t2 = Instant::now(); // Verify correctness per sample within the batch if hf_results.len() != ft_results.len() { eprintln!( "[LENGTH MISMATCH] batch {batch_idx}: hf returned {} results, ft returned {} results (expected {})", hf_results.len(), ft_results.len(), batch.len() ); } let max_len = hf_results.len().max(ft_results.len()); for j in 0..max_len { let global_idx = batch_idx * batch_size + j; match (hf_results.get(j), 
ft_results.get(j)) { (Some(hf_enc), Some(ft_enc)) => { if hf_enc.token_ids() != ft_enc.token_ids() { mismatches += 1; if mismatches <= 3 { eprintln!( "[MISMATCH] sample {global_idx}: hf={} tokens, ft={} tokens", hf_enc.token_ids().len(), ft_enc.token_ids().len() ); } } } (Some(_), None) => { mismatches += 1; if mismatches <= 3 { eprintln!("[MISMATCH] sample {global_idx}: hf produced result, ft missing"); } } (None, Some(_)) => { mismatches += 1; if mismatches <= 3 { eprintln!("[MISMATCH] sample {global_idx}: ft produced result, hf missing"); } } (None, None) => unreachable!(), } } let batch_tokens: u64 = ft_results .iter() .map(|enc| enc.token_ids().len() as u64) .sum(); total_hf += t1 - t0; total_ft += t2 - t1; total_tokens += batch_tokens; total_chars += batch_chars; if (batch_idx + 1) % 5 == 0 { eprintln!("[progress] batch {}/{num_batches}", batch_idx + 1); } } if mismatches > 0 { eprintln!("[WARNING] {mismatches} samples had mismatched token IDs"); } else { eprintln!("[OK] All samples produced identical token IDs"); } print_summary( &format!("Batched Benchmark (batch_size={batch_size})"), samples.len(), total_chars, total_tokens, total_hf, total_ft, ); } fn main() { // This benchmark downloads a large dataset and takes several minutes. // It is opt-in to avoid blocking `cargo test --all-targets` in CI. if std::env::var("RUN_BENCH").is_err() { eprintln!("[skip] tokenizer_dataset benchmark skipped. 
Set RUN_BENCH=1 to run it."); eprintln!("[skip] See lib/llm/benches/README.md for usage."); return; } let tokenizer_path = resolve_tokenizer_path(); let dataset = std::env::var("DATASET").unwrap_or_else(|_| DEFAULT_DATASET.to_string()); let max_samples: usize = std::env::var("MAX_SAMPLES") .ok() .and_then(|v| v.parse().ok()) .unwrap_or(DEFAULT_MAX_SAMPLES); let batch_size: Option = std::env::var("BATCH_SIZE") .ok() .and_then(|v| v.parse().ok()); let samples = load_dataset(&dataset, max_samples); let hf = HuggingFaceTokenizer::from_file(&tokenizer_path) .expect("Failed to load HuggingFace tokenizer"); let fast = FastTokenizer::from_file(&tokenizer_path).expect("Failed to load fastokens tokenizer"); // Warmup if let Some(s) = samples.first() { let _ = hf.encode(s.as_str()); let _ = fast.encode(s.as_str()); } if let Some(bs) = batch_size { bench_batched(&samples, &hf, &fast, bs); } else { bench_sequential(&samples, &hf, &fast); } }