"tests/tokenizers_/test_detokenize.py" did not exist on "5ffc0d13a2d38050ba44c2efd848910d87ceb57e"
tokenizer.rs 5.18 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
4
5
6
// SPDX-License-Identifier: Apache-2.0

use std::hint::black_box;
use std::sync::Arc;

7
use criterion::{Criterion, Throughput, criterion_group, criterion_main};
8
9
10

use dynamo_llm::backend::Decoder;
use dynamo_llm::protocols::common::StopConditions;
11
use dynamo_llm::tokenizers::DecodeStream;
12
use dynamo_llm::tokenizers::hf::HuggingFaceTokenizer;
Nikita's avatar
Nikita committed
13
use dynamo_llm::tokenizers::tiktoken::TikTokenTokenizer;
14
15
16
17
18
19
20
21
use dynamo_llm::tokenizers::traits::{Encoder, Tokenizer};
use dynamo_llm::types::TokenIdType;

/// Path to the HuggingFace `tokenizer.json` fixture (TinyLlama v1.1) used by the
/// HuggingFace benchmarks. Built at compile time by prefixing the crate's
/// `CARGO_MANIFEST_DIR` to the fixture's location under `tests/data`.
const TEST_TOKENIZER: &str = concat!(
    env!("CARGO_MANIFEST_DIR"),
    "/tests/data/sample-models/TinyLlama_v1.1/tokenizer.json"
);

Nikita's avatar
Nikita committed
22
23
24
25
26
/// Path to the mock tiktoken model fixture used by the tiktoken benchmarks.
/// Built at compile time by prefixing the crate's `CARGO_MANIFEST_DIR` to the
/// fixture's location under `tests/data`.
const TEST_TIKTOKEN: &str = concat!(
    env!("CARGO_MANIFEST_DIR"),
    "/tests/data/sample-models/mock-tiktoken/tiktoken.model"
);

27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/// Target Input Sequence Length for the encode benchmarks, measured in bytes of
/// input text (not tokens).
///
/// The benchmarks repeat `INPUT_STR` `TARGET_ISL / INPUT_STR.len()` times using
/// integer division, so with a 128-byte `INPUT_STR` the actual input is
/// 62 repetitions = 7,936 bytes, slightly under this target.
const TARGET_ISL: usize = 8_000;

/// Sample English text used as the unit of benchmark input.
///
/// The string is ASCII-only and exactly 128 bytes long; keep that length in
/// mind when editing it, since the repetition count is derived from it.
const INPUT_STR: &str = "The cat sat by the window, watching raindrops race down the glass. Far thunder rumbled. She purred softly, feeling safe at home.";

/// Benchmarks HuggingFace tokenizer encoding of a long input string.
///
/// `cargo bench -- encode` to run it.
///
/// The input is `INPUT_STR` repeated `TARGET_ISL / INPUT_STR.len()` times, and
/// throughput is reported in bytes of input text.
///
/// # Panics
///
/// Panics if the tokenizer fixture at `TEST_TOKENIZER` cannot be loaded or if
/// encoding the input fails.
pub fn encode(c: &mut Criterion) {
    let test_str = INPUT_STR.repeat(TARGET_ISL / INPUT_STR.len());

    let encoder = HuggingFaceTokenizer::from_file(TEST_TOKENIZER).unwrap();
    let mut group = c.benchmark_group("encode-group");
    group.throughput(Throughput::Bytes(test_str.len() as u64));
    group.bench_function("tokenizer_encode", |b| {
        // Return the encoding instead of discarding it: `Bencher::iter` feeds the
        // closure's return value through `black_box`, which keeps the optimizer
        // from treating the encode call's result as dead.
        b.iter(|| encoder.encode(black_box(test_str.as_str())).unwrap())
    });
    group.finish();
}

pub fn decode(c: &mut Criterion) {
    const TEST_TOKS: [TokenIdType; 34] = [
        450, 6635, 3290, 491, 278, 3474, 29892, 21217, 1153, 513, 307, 567, 8175, 1623, 278, 12917,
        29889, 8413, 266, 5062, 364, 25443, 29889, 2296, 3708, 1127, 4964, 368, 29892, 11223, 9109,
        472, 3271, 29889,
    ];
54

55
    let mut group = c.benchmark_group("decode-group");
56
    group.throughput(Throughput::Elements(TEST_TOKS.len() as u64));
57
    group.bench_function("tokenizer_decoder", |b| {
58
59
60
61
62
        b.iter_with_setup(
            || {
                let tokenizer: Arc<dyn Tokenizer> =
                    Arc::new(HuggingFaceTokenizer::from_file(TEST_TOKENIZER).unwrap());
                let ds = DecodeStream::new(tokenizer, &[], false);
63
                Decoder::new(ds, StopConditions::default(), false, None)
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
            },
            |mut decoder| {
                for tok in black_box(TEST_TOKS) {
                    let _ = decoder.step(tok).unwrap();
                }
            },
        )
    });
    group.finish();
}

pub fn decode_big(c: &mut Criterion) {
    const NUM_TOKENS: usize = 2048;

    const BIG_TEST_TOKS: [TokenIdType; NUM_TOKENS] = [450; NUM_TOKENS];
    let mut group = c.benchmark_group("decode-big-group");
    group.throughput(Throughput::Elements(NUM_TOKENS as u64));
    group.bench_function("tokenizer_decoder_big", |b| {
        b.iter_with_setup(
            || {
                let tokenizer: Arc<dyn Tokenizer> =
                    Arc::new(HuggingFaceTokenizer::from_file(TEST_TOKENIZER).unwrap());
                let ds = DecodeStream::new(tokenizer, &[], false);
87
                Decoder::new(ds, StopConditions::default(), false, None)
88
89
90
91
92
93
94
            },
            |mut decoder| {
                for tok in black_box(&BIG_TEST_TOKS) {
                    let _ = decoder.step(*tok).unwrap();
                }
            },
        )
95
96
97
98
    });
    group.finish();
}

Nikita's avatar
Nikita committed
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/// Benchmarks tiktoken tokenizer encoding of a long input string.
///
/// The input is `INPUT_STR` repeated `TARGET_ISL / INPUT_STR.len()` times, and
/// throughput is reported in bytes of input text.
///
/// # Panics
///
/// Panics if the tiktoken fixture at `TEST_TIKTOKEN` cannot be loaded or if
/// encoding the input fails.
pub fn tiktoken_encode(c: &mut Criterion) {
    let test_str = INPUT_STR.repeat(TARGET_ISL / INPUT_STR.len());

    let encoder = TikTokenTokenizer::from_file_auto(TEST_TIKTOKEN).unwrap();
    let mut group = c.benchmark_group("tiktoken-encode-group");
    group.throughput(Throughput::Bytes(test_str.len() as u64));
    group.bench_function("tiktoken_encode", |b| {
        // Return the encoding instead of discarding it: `Bencher::iter` feeds the
        // closure's return value through `black_box`, which keeps the optimizer
        // from treating the encode call's result as dead.
        b.iter(|| encoder.encode(black_box(test_str.as_str())).unwrap())
    });
    group.finish();
}

/// Benchmarks incremental (token-by-token) decoding with the tiktoken tokenizer.
///
/// The token IDs are produced by encoding `INPUT_STR` with the same tokenizer, so
/// they are valid for this vocabulary. Each measured iteration gets a fresh
/// `Decoder` (built in the untimed setup closure) and feeds it those IDs one
/// `step` at a time. Throughput is reported in tokens (elements).
///
/// # Panics
///
/// Panics if the tiktoken fixture at `TEST_TIKTOKEN` cannot be loaded, if
/// encoding `INPUT_STR` fails, or if any decode step returns an error.
pub fn tiktoken_decode(c: &mut Criterion) {
    // Encode a test string to get realistic token IDs for this tokenizer.
    let encoder = TikTokenTokenizer::from_file_auto(TEST_TIKTOKEN).unwrap();
    let test_toks: Vec<TokenIdType> = encoder.encode(INPUT_STR).unwrap().token_ids().to_vec();

    // Reuse the already-loaded tokenizer for decoding and share it across
    // iterations. Building it inside the per-iteration setup would re-read the
    // fixture for every sample, and because the decoder is dropped inside the
    // timed routine, the last `Arc` reference (and with it the whole tokenizer)
    // would be freed on the clock.
    let tokenizer: Arc<dyn Tokenizer> = Arc::new(encoder);

    let mut group = c.benchmark_group("tiktoken-decode-group");
    group.throughput(Throughput::Elements(test_toks.len() as u64));
    group.bench_function("tiktoken_decoder", |b| {
        b.iter_with_setup(
            || {
                // Fresh decode state per iteration; only the `Arc` is cloned.
                let ds = DecodeStream::new(Arc::clone(&tokenizer), &[], false);
                Decoder::new(ds, StopConditions::default(), false, None)
            },
            |mut decoder| {
                // Borrow the token vector directly; no per-benchmark copy is needed.
                for tok in black_box(&test_toks) {
                    // Pass each step's output through `black_box` so the decode
                    // work cannot be optimized away as unused.
                    let _ = black_box(decoder.step(*tok).unwrap());
                }
            },
        )
    });
    group.finish();
}

// Registers every benchmark function in this file under the `benches` group.
// A new benchmark must be added to this list, otherwise it will not be run.
criterion_group!(
    benches,
    encode,
    decode,
    decode_big,
    tiktoken_encode,
    tiktoken_decode
);
148
// Generates the benchmark binary's `main` function, which runs the `benches` group.
criterion_main!(benches);