// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use std::hint::black_box;
use std::sync::Arc;

7
use criterion::{Criterion, Throughput, criterion_group, criterion_main};
8
9
10

use dynamo_llm::backend::Decoder;
use dynamo_llm::protocols::common::StopConditions;
11
use dynamo_llm::tokenizers::DecodeStream;
12
use dynamo_llm::tokenizers::FastTokenizer;
13
use dynamo_llm::tokenizers::hf::HuggingFaceTokenizer;
Nikita's avatar
Nikita committed
14
use dynamo_llm::tokenizers::tiktoken::TikTokenTokenizer;
15
16
use dynamo_llm::tokenizers::traits::{Encoder, Tokenizer};
use dynamo_llm::types::TokenIdType;
17
use std::path::Path;
18
19
20
21
22
23

/// Path to the TinyLlama `tokenizer.json` test fixture, built at compile time
/// from this crate's `CARGO_MANIFEST_DIR`.
const TEST_TOKENIZER: &str = concat!(
    env!("CARGO_MANIFEST_DIR"),
    "/tests/data/sample-models/TinyLlama_v1.1/tokenizer.json"
);

Nikita's avatar
Nikita committed
24
25
26
27
28
/// Path to the mock tiktoken model test fixture, built at compile time from
/// this crate's `CARGO_MANIFEST_DIR`.
const TEST_TIKTOKEN: &str = concat!(
    env!("CARGO_MANIFEST_DIR"),
    "/tests/data/sample-models/mock-tiktoken/tiktoken.model"
);

29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
/// Target input sequence length (ISL) for the encode benchmarks, in bytes of
/// input text.
///
/// The benchmarks build their input by repeating `INPUT_STR`
/// `TARGET_ISL / INPUT_STR.len()` times (integer division), so the actual input
/// can be slightly shorter than this target.
const TARGET_ISL: usize = 8_000;

/// Sample sentence used as the unit of benchmark input.
///
/// It is exactly 128 bytes long (plain ASCII).
const INPUT_STR: &str = "The cat sat by the window, watching raindrops race down the glass. Far thunder rumbled. She purred softly, feeling safe at home.";

/// Benchmarks HuggingFace tokenizer encoding of a long prompt.
///
/// The input is `INPUT_STR` repeated `TARGET_ISL / INPUT_STR.len()` times and
/// throughput is reported in bytes of input text.
///
/// `cargo bench -- encode` to run it
///
/// # Panics
///
/// Panics if the tokenizer fixture cannot be loaded or encoding fails.
pub fn encode(c: &mut Criterion) {
    let test_str = INPUT_STR.repeat(TARGET_ISL / INPUT_STR.len());

    let encoder = HuggingFaceTokenizer::from_file(TEST_TOKENIZER).unwrap();
    let mut group = c.benchmark_group("encode-group");
    group.throughput(Throughput::Bytes(test_str.len() as u64));
    group.bench_function("tokenizer_encode", |b| {
        // Route the encoding through `black_box` instead of discarding it, so
        // the measured call cannot be treated as having an unused result.
        b.iter(|| black_box(encoder.encode(black_box(test_str.as_str())).unwrap()))
    });
    group.finish();
}

pub fn decode(c: &mut Criterion) {
    const TEST_TOKS: [TokenIdType; 34] = [
        450, 6635, 3290, 491, 278, 3474, 29892, 21217, 1153, 513, 307, 567, 8175, 1623, 278, 12917,
        29889, 8413, 266, 5062, 364, 25443, 29889, 2296, 3708, 1127, 4964, 368, 29892, 11223, 9109,
        472, 3271, 29889,
    ];
56

57
    let mut group = c.benchmark_group("decode-group");
58
    group.throughput(Throughput::Elements(TEST_TOKS.len() as u64));
59
    group.bench_function("tokenizer_decoder", |b| {
60
61
62
63
64
        b.iter_with_setup(
            || {
                let tokenizer: Arc<dyn Tokenizer> =
                    Arc::new(HuggingFaceTokenizer::from_file(TEST_TOKENIZER).unwrap());
                let ds = DecodeStream::new(tokenizer, &[], false);
65
                Decoder::new(ds, StopConditions::default(), false, None)
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
            },
            |mut decoder| {
                for tok in black_box(TEST_TOKS) {
                    let _ = decoder.step(tok).unwrap();
                }
            },
        )
    });
    group.finish();
}

pub fn decode_big(c: &mut Criterion) {
    const NUM_TOKENS: usize = 2048;

    const BIG_TEST_TOKS: [TokenIdType; NUM_TOKENS] = [450; NUM_TOKENS];
    let mut group = c.benchmark_group("decode-big-group");
    group.throughput(Throughput::Elements(NUM_TOKENS as u64));
    group.bench_function("tokenizer_decoder_big", |b| {
        b.iter_with_setup(
            || {
                let tokenizer: Arc<dyn Tokenizer> =
                    Arc::new(HuggingFaceTokenizer::from_file(TEST_TOKENIZER).unwrap());
                let ds = DecodeStream::new(tokenizer, &[], false);
89
                Decoder::new(ds, StopConditions::default(), false, None)
90
91
92
93
94
95
96
            },
            |mut decoder| {
                for tok in black_box(&BIG_TEST_TOKS) {
                    let _ = decoder.step(*tok).unwrap();
                }
            },
        )
97
98
99
100
    });
    group.finish();
}

Nikita's avatar
Nikita committed
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/// Benchmarks tiktoken tokenizer encoding of a long prompt.
///
/// The input is `INPUT_STR` repeated `TARGET_ISL / INPUT_STR.len()` times and
/// throughput is reported in bytes of input text.
///
/// # Panics
///
/// Panics if the tiktoken fixture cannot be loaded or encoding fails.
pub fn tiktoken_encode(c: &mut Criterion) {
    let test_str = INPUT_STR.repeat(TARGET_ISL / INPUT_STR.len());

    let encoder = TikTokenTokenizer::from_file_auto(TEST_TIKTOKEN).unwrap();
    let mut group = c.benchmark_group("tiktoken-encode-group");
    group.throughput(Throughput::Bytes(test_str.len() as u64));
    group.bench_function("tiktoken_encode", |b| {
        // Route the encoding through `black_box` instead of discarding it, so
        // the measured call cannot be treated as having an unused result.
        b.iter(|| black_box(encoder.encode(black_box(test_str.as_str())).unwrap()))
    });
    group.finish();
}

/// Benchmarks incremental decoding with the tiktoken tokenizer.
///
/// The token ids are obtained by encoding `INPUT_STR` with the same tokenizer,
/// so they are valid for its vocabulary. Each iteration builds a fresh
/// `Decoder` over a `DecodeStream` (not timed) and then feeds it the tokens one
/// at a time via `step` (timed). Throughput is reported in tokens (elements).
///
/// # Panics
///
/// Panics if the tiktoken fixture cannot be loaded, or if encoding or a decode
/// step fails.
pub fn tiktoken_decode(c: &mut Criterion) {
    // Load the tokenizer once: it produces the token ids below and is then
    // shared by every decoder through `Arc`, instead of being re-read from disk
    // in each setup call.
    let tiktoken = Arc::new(TikTokenTokenizer::from_file_auto(TEST_TIKTOKEN).unwrap());

    // Encode a test string to get realistic token IDs for this tokenizer
    let encoding = tiktoken.encode(INPUT_STR).unwrap();
    let test_toks: Vec<TokenIdType> = encoding.token_ids().to_vec();

    let tokenizer: Arc<dyn Tokenizer> = tiktoken;

    let mut group = c.benchmark_group("tiktoken-decode-group");
    group.throughput(Throughput::Elements(test_toks.len() as u64));
    group.bench_function("tiktoken_decoder", |b| {
        b.iter_with_setup(
            || {
                let ds = DecodeStream::new(Arc::clone(&tokenizer), &[], false);
                Decoder::new(ds, StopConditions::default(), false, None)
            },
            |mut decoder| {
                for tok in black_box(&test_toks) {
                    let _ = black_box(decoder.step(*tok).unwrap());
                }
                // Hand the decoder back instead of dropping it here, so its
                // teardown is not part of the routine being measured.
                decoder
            },
        )
    });
    group.finish();
}

142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
// ---------------------------------------------------------------------------
// Tokenizer backend benchmarks
//
// By default these fetch `tokenizer.json` for the default HuggingFace model
// (Qwen/Qwen3-0.6B) through the HuggingFace Hub API. Set TOKENIZER_PATH to a
// local tokenizer.json file, or to a HuggingFace model name, to override:
//   TOKENIZER_PATH=/path/to/tokenizer.json cargo bench -- fastokens
//   TOKENIZER_PATH=Qwen/Qwen3-0.6B        cargo bench -- fastokens
// ---------------------------------------------------------------------------

/// HuggingFace model whose `tokenizer.json` is fetched when the
/// `TOKENIZER_PATH` environment variable is not set.
const DEFAULT_HF_MODEL: &str = "Qwen/Qwen3-0.6B";

/// Returns the path of the `tokenizer.json` used by the fastokens benchmarks.
///
/// If the `TOKENIZER_PATH` environment variable names an existing file, that
/// value is returned unchanged. Otherwise the variable's value — or
/// `DEFAULT_HF_MODEL` when it is unset or not valid Unicode — is treated as a
/// HuggingFace model name and that model's `tokenizer.json` is requested
/// through the HuggingFace Hub API.
///
/// # Panics
///
/// Panics if the Hub API client cannot be built or `tokenizer.json` cannot be
/// obtained.
fn resolve_tokenizer_path() -> String {
    let requested = std::env::var("TOKENIZER_PATH").ok();

    // A value pointing at a real file wins outright; anything else is a model name.
    let model_name = match requested.as_deref() {
        Some(candidate) if Path::new(candidate).is_file() => return candidate.to_owned(),
        Some(candidate) => candidate,
        None => DEFAULT_HF_MODEL,
    };

    let api = hf_hub::api::sync::ApiBuilder::from_cache(hf_hub::Cache::default())
        .with_progress(true)
        .build()
        .expect("Failed to create HuggingFace API client");

    let tokenizer_file = api
        .model(model_name.to_string())
        .get("tokenizer.json")
        .expect("Failed to download tokenizer.json from HuggingFace Hub");

    tokenizer_file.display().to_string()
}

/// Number of copies of `INPUT_STR` in the batch that `fastokens_batch_encode`
/// passes to `encode_batch`.
const FASTOKENS_BATCH_SIZE: usize = 64;

/// Compares single-input encoding of the HuggingFace and fastokens backends on
/// the same tokenizer file.
///
/// Before benchmarking, both backends encode `INPUT_STR` and their token ids
/// are asserted to be identical. The benchmark input is `INPUT_STR` repeated
/// `TARGET_ISL / INPUT_STR.len()` times; throughput is reported in bytes.
///
/// # Panics
///
/// Panics if the tokenizer cannot be resolved or loaded, if encoding fails, or
/// if the two backends disagree on the token ids for `INPUT_STR`.
pub fn fastokens_encode(c: &mut Criterion) {
    let tokenizer_path = resolve_tokenizer_path();
    let test_str = INPUT_STR.repeat(TARGET_ISL / INPUT_STR.len());

    let hf_encoder = HuggingFaceTokenizer::from_file(&tokenizer_path).unwrap();
    let fast_encoder = FastTokenizer::from_file(&tokenizer_path).unwrap();

    // Verify parity before benchmarking
    let hf_ids = hf_encoder.encode(INPUT_STR).unwrap();
    let fast_ids = fast_encoder.encode(INPUT_STR).unwrap();
    assert_eq!(
        hf_ids.token_ids(),
        fast_ids.token_ids(),
        "fastokens and HuggingFace must produce identical token IDs"
    );

    let mut group = c.benchmark_group("fastokens-encode");
    group.throughput(Throughput::Bytes(test_str.len() as u64));

    // In both benchmarks the encoding is routed through `black_box` instead of
    // being discarded, so the measured call cannot be treated as unused.
    group.bench_function("hf_encode", |b| {
        b.iter(|| black_box(hf_encoder.encode(black_box(test_str.as_str())).unwrap()))
    });

    group.bench_function("fastokens_encode", |b| {
        b.iter(|| black_box(fast_encoder.encode(black_box(test_str.as_str())).unwrap()))
    });

    group.finish();
}

/// Compares batch encoding of the HuggingFace and fastokens backends on the
/// same tokenizer file.
///
/// The batch is `FASTOKENS_BATCH_SIZE` copies of `INPUT_STR`. Before
/// benchmarking, both backends encode the batch and the results are asserted to
/// match in count and, item by item, in token ids. Throughput is reported in
/// bytes, summed over the whole batch.
///
/// # Panics
///
/// Panics if the tokenizer cannot be resolved or loaded, if batch encoding
/// fails, or if the two backends disagree on any batch item.
pub fn fastokens_batch_encode(c: &mut Criterion) {
    let tokenizer_path = resolve_tokenizer_path();
    let batch: Vec<&str> = vec![INPUT_STR; FASTOKENS_BATCH_SIZE];
    let total_bytes: u64 = batch.iter().map(|s| s.len() as u64).sum();

    let hf_encoder = HuggingFaceTokenizer::from_file(&tokenizer_path).unwrap();
    let fast_encoder = FastTokenizer::from_file(&tokenizer_path).unwrap();

    // Verify batch parity before benchmarking
    let hf_batch = hf_encoder.encode_batch(&batch).unwrap();
    let fast_batch = fast_encoder.encode_batch(&batch).unwrap();
    assert_eq!(
        hf_batch.len(),
        fast_batch.len(),
        "batch result count mismatch: hf={} vs ft={}",
        hf_batch.len(),
        fast_batch.len()
    );
    for (i, (hf_enc, ft_enc)) in hf_batch.iter().zip(fast_batch.iter()).enumerate() {
        assert_eq!(
            hf_enc.token_ids(),
            ft_enc.token_ids(),
            "batch item {i}: fastokens and HuggingFace must produce identical token IDs"
        );
    }

    let mut group = c.benchmark_group("fastokens-batch-encode");
    group.throughput(Throughput::Bytes(total_bytes));

    // In both benchmarks the encodings are routed through `black_box` instead
    // of being discarded, so the measured call cannot be treated as unused.
    group.bench_function("hf_batch_encode", |b| {
        b.iter(|| black_box(hf_encoder.encode_batch(black_box(&batch)).unwrap()))
    });

    group.bench_function("fastokens_batch_encode", |b| {
        b.iter(|| black_box(fast_encoder.encode_batch(black_box(&batch)).unwrap()))
    });

    group.finish();
}

Nikita's avatar
Nikita committed
258
259
260
261
262
263
criterion_group!(
    benches,
    encode,
    decode,
    decode_big,
    tiktoken_encode,
264
265
266
    tiktoken_decode,
    fastokens_encode,
    fastokens_batch_encode
Nikita's avatar
Nikita committed
267
);
268
criterion_main!(benches);