Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
da810a26
"docs/vscode:/vscode.git/clone" did not exist on "d9b674b8689ada6f56099715f7028da5809c26c9"
Unverified
Commit
da810a26
authored
Mar 15, 2026
by
Biswa Panda
Committed by
GitHub
Mar 15, 2026
Browse files
feat: integrate fastokens BPE tokenizer backend (#7387)
parent
cdf66b11
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
551 additions
and
12 deletions
+551
-12
.cargo/config.toml
.cargo/config.toml
+6
-0
Cargo.lock
Cargo.lock
+78
-4
Cargo.toml
Cargo.toml
+1
-0
components/src/dynamo/frontend/frontend_args.py
components/src/dynamo/frontend/frontend_args.py
+22
-0
components/src/dynamo/frontend/main.py
components/src/dynamo/frontend/main.py
+4
-0
lib/bindings/kvbm/Cargo.lock
lib/bindings/kvbm/Cargo.lock
+93
-4
lib/bindings/python/Cargo.lock
lib/bindings/python/Cargo.lock
+93
-4
lib/llm/Cargo.toml
lib/llm/Cargo.toml
+1
-0
lib/llm/src/model_card.rs
lib/llm/src/model_card.rs
+39
-0
lib/llm/src/tokenizers.rs
lib/llm/src/tokenizers.rs
+2
-0
lib/llm/src/tokenizers/fastokens.rs
lib/llm/src/tokenizers/fastokens.rs
+160
-0
lib/llm/tests/data/sample-models/minimal-bpe/tokenizer.json
lib/llm/tests/data/sample-models/minimal-bpe/tokenizer.json
+52
-0
No files found.
.cargo/config.toml
View file @
da810a26
...
...
@@ -13,3 +13,9 @@ rustflags = ["-C", "target-cpu=x86-64-v3", "--cfg", "tokio_unstable"]
[target.aarch64-unknown-linux-gnu]
rustflags
=
[
"-C"
,
"target-cpu=neoverse-n1"
,
"--cfg"
,
"tokio_unstable"
]
# Static-link pcre2 C library (used by the fastokens tokenizer crate).
# Without this, pcre2-sys tries to find a system libpcre2 via pkg-config,
# which breaks Docker builds and bundles a .so into the Python wheel.
[env]
PCRE2_SYS_STATIC
=
"1"
Cargo.lock
View file @
da810a26
...
...
@@ -1063,7 +1063,7 @@ version = "3.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34"
dependencies = [
"windows-sys 0.
61.2
",
"windows-sys 0.
48.0
",
]
[[package]]
...
...
@@ -1481,6 +1481,12 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "daachorse"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63b7ef7a4be509357f4804d0a22e830daddb48f19fd604e4ad32ddce04a94c36"
[[package]]
name = "darling"
version = "0.20.11"
...
...
@@ -1986,6 +1992,7 @@ dependencies = [
"dynamo-runtime",
"dynamo-tokens",
"either",
"fastokens",
"ffmpeg-next",
"flate2",
"futures",
...
...
@@ -2418,6 +2425,36 @@ dependencies = [
"regex-syntax",
]
[[package]]
name = "fancy-regex"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
dependencies = [
"bit-set 0.8.0",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "fastokens"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aca43986686f3dff724cc465e0afcf883c361112474072b1d825058852b25f9c"
dependencies = [
"daachorse",
"fancy-regex 0.17.0",
"hf-hub",
"icu_normalizer",
"memchr",
"pcre2",
"rayon",
"serde",
"serde_json",
"strum",
"thiserror 2.0.18",
]
[[package]]
name = "fastrand"
version = "2.3.0"
...
...
@@ -3275,6 +3312,9 @@ dependencies = [
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
...
...
@@ -5374,6 +5414,28 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
[[package]]
name = "pcre2"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e970b0fcce0c7ee6ef662744ff711f21ccd6f11b7cf03cd187a80e89797fc67"
dependencies = [
"libc",
"log",
"pcre2-sys",
]
[[package]]
name = "pcre2-sys"
version = "0.2.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18b9073c1a2549bd409bf4a32c94d903bb1a09bf845bc306ae148897fa0760a4"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]]
name = "pear"
version = "0.2.9"
...
...
@@ -5800,7 +5862,7 @@ version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf"
dependencies = [
"heck 0.
5.0
",
"heck 0.
4.1
",
"itertools 0.14.0",
"log",
"multimap",
...
...
@@ -5820,7 +5882,7 @@ version = "0.14.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7"
dependencies = [
"heck 0.
5.0
",
"heck 0.
4.1
",
"itertools 0.14.0",
"log",
"multimap",
...
...
@@ -8559,6 +8621,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8_iter"
version = "1.0.4"
...
...
@@ -9011,7 +9079,7 @@ version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [
"windows-sys 0.
61.2
",
"windows-sys 0.
48.0
",
]
[[package]]
...
...
@@ -9428,6 +9496,12 @@ dependencies = [
"wasmparser",
]
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]]
name = "writeable"
version = "0.6.2"
...
...
Cargo.toml
View file @
da810a26
...
...
@@ -46,6 +46,7 @@ dynamo-mocker = { path = "lib/mocker", version = "1.0.0" }
dynamo-kv-router
=
{
path
=
"lib/kv-router"
,
version
=
"1.0.0"
,
features
=
[
"metrics"
,
"runtime-protocols"
]
}
dynamo-async-openai
=
{
path
=
"lib/async-openai"
,
version
=
"1.0.0"
,
features
=
["byot"]
}
dynamo-parsers
=
{
path
=
"lib/parsers"
,
version
=
"1.0.0"
}
fastokens
=
{
version
=
"0.1.0"
}
# kvbm
kvbm-common
=
{
path
=
"lib/kvbm-common"
,
version
=
"0.1.0"
}
...
...
components/src/dynamo/frontend/frontend_args.py
View file @
da810a26
...
...
@@ -76,6 +76,9 @@ class FrontendConfig(KvRouterConfigBase):
enable_streaming_tool_dispatch
:
bool
enable_streaming_reasoning_dispatch
:
bool
preprocess_workers
:
int
tokenizer_backend
:
str
_VALID_TOKENIZER_BACKENDS
=
{
"default"
,
"fastokens"
}
def
validate
(
self
)
->
None
:
if
bool
(
self
.
tls_cert_path
)
^
bool
(
self
.
tls_key_path
):
# ^ is XOR
...
...
@@ -88,6 +91,11 @@ class FrontendConfig(KvRouterConfigBase):
)
if
self
.
router_enable_cache_control
and
self
.
router_mode
!=
"kv"
:
raise
ValueError
(
"--enable-cache-control requires --router-mode=kv"
)
if
self
.
tokenizer_backend
not
in
self
.
_VALID_TOKENIZER_BACKENDS
:
raise
ValueError
(
f
"--tokenizer: invalid value '
{
self
.
tokenizer_backend
}
' "
f
"(choose from
{
sorted
(
self
.
_VALID_TOKENIZER_BACKENDS
)
}
)"
)
@
register_encoder
(
FrontendConfig
)
...
...
@@ -424,3 +432,17 @@ class FrontendArgGroup(ArgGroup):
),
arg_type
=
int
,
)
add_argument
(
g
,
flag_name
=
"--tokenizer"
,
env_var
=
"DYN_TOKENIZER"
,
default
=
"default"
,
dest
=
"tokenizer_backend"
,
help
=
(
"Tokenizer backend for BPE models: 'default' (HuggingFace tokenizers library) "
"or 'fastokens' (fastokens crate for high-performance BPE encoding). "
"Decoding always uses HuggingFace. Has no effect on TikToken models."
),
choices
=
[
"default"
,
"fastokens"
],
)
components/src/dynamo/frontend/main.py
View file @
da810a26
...
...
@@ -165,6 +165,10 @@ async def async_main():
config
,
vllm_flags
,
sglang_flags
=
parse_args
()
dump_config
(
config
.
dump_config_to
,
config
)
os
.
environ
[
"DYN_EVENT_PLANE"
]
=
config
.
event_plane
if
config
.
tokenizer_backend
==
"fastokens"
:
os
.
environ
[
"DYN_TOKENIZER"
]
=
"fastokens"
else
:
os
.
environ
.
pop
(
"DYN_TOKENIZER"
,
None
)
logger
.
info
(
f
"Request migration
{
'enabled'
if
config
.
migration_limit
>
0
else
'disabled'
}
"
f
"(limit:
{
config
.
migration_limit
}
)"
...
...
lib/bindings/kvbm/Cargo.lock
View file @
da810a26
...
...
@@ -585,7 +585,16 @@ version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
dependencies = [
"bit-vec",
"bit-vec 0.6.3",
]
[[package]]
name = "bit-set"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
dependencies = [
"bit-vec 0.8.0",
]
[[package]]
...
...
@@ -594,6 +603,12 @@ version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
[[package]]
name = "bit-vec"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
[[package]]
name = "bit_field"
version = "0.10.3"
...
...
@@ -1139,6 +1154,12 @@ dependencies = [
"syn",
]
[[package]]
name = "daachorse"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63b7ef7a4be509357f4804d0a22e830daddb48f19fd604e4ad32ddce04a94c36"
[[package]]
name = "darling"
version = "0.20.11"
...
...
@@ -1608,6 +1629,7 @@ dependencies = [
"dynamo-runtime",
"dynamo-tokens",
"either",
"fastokens",
"flate2",
"futures",
"futures-util",
...
...
@@ -2004,11 +2026,41 @@ version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
dependencies = [
"bit-set",
"bit-set
0.5.3
",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "fancy-regex"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
dependencies = [
"bit-set 0.8.0",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "fastokens"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aca43986686f3dff724cc465e0afcf883c361112474072b1d825058852b25f9c"
dependencies = [
"daachorse",
"fancy-regex 0.17.0",
"hf-hub",
"icu_normalizer",
"memchr",
"pcre2",
"rayon",
"serde",
"serde_json",
"strum",
"thiserror 2.0.18",
]
[[package]]
name = "fastrand"
version = "2.3.0"
...
...
@@ -2705,6 +2757,9 @@ dependencies = [
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
...
...
@@ -4351,7 +4406,7 @@ dependencies = [
"base64 0.22.1",
"bstr",
"clap",
"fancy-regex",
"fancy-regex
0.13.0
",
"futures",
"image",
"regex",
...
...
@@ -4562,6 +4617,28 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
[[package]]
name = "pcre2"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e970b0fcce0c7ee6ef662744ff711f21ccd6f11b7cf03cd187a80e89797fc67"
dependencies = [
"libc",
"log",
"pcre2-sys",
]
[[package]]
name = "pcre2-sys"
version = "0.2.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18b9073c1a2549bd409bf4a32c94d903bb1a09bf845bc306ae148897fa0760a4"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]]
name = "pear"
version = "0.2.9"
...
...
@@ -6516,7 +6593,7 @@ dependencies = [
"anyhow",
"base64 0.22.1",
"bstr",
"fancy-regex",
"fancy-regex
0.13.0
",
"lazy_static",
"regex",
"rustc-hash 1.1.0",
...
...
@@ -7312,6 +7389,12 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8_iter"
version = "1.0.4"
...
...
@@ -8079,6 +8162,12 @@ dependencies = [
"wasmparser",
]
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]]
name = "writeable"
version = "0.6.2"
...
...
lib/bindings/python/Cargo.lock
View file @
da810a26
...
...
@@ -603,7 +603,16 @@ version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
dependencies = [
"bit-vec",
"bit-vec 0.6.3",
]
[[package]]
name = "bit-set"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
dependencies = [
"bit-vec 0.8.0",
]
[[package]]
...
...
@@ -612,6 +621,12 @@ version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
[[package]]
name = "bit-vec"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
[[package]]
name = "bit_field"
version = "0.10.3"
...
...
@@ -1157,6 +1172,12 @@ dependencies = [
"syn",
]
[[package]]
name = "daachorse"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63b7ef7a4be509357f4804d0a22e830daddb48f19fd604e4ad32ddce04a94c36"
[[package]]
name = "darling"
version = "0.20.11"
...
...
@@ -1621,6 +1642,7 @@ dependencies = [
"dynamo-runtime",
"dynamo-tokens",
"either",
"fastokens",
"ffmpeg-next",
"flate2",
"futures",
...
...
@@ -2048,11 +2070,41 @@ version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
dependencies = [
"bit-set",
"bit-set
0.5.3
",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "fancy-regex"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
dependencies = [
"bit-set 0.8.0",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "fastokens"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aca43986686f3dff724cc465e0afcf883c361112474072b1d825058852b25f9c"
dependencies = [
"daachorse",
"fancy-regex 0.17.0",
"hf-hub",
"icu_normalizer",
"memchr",
"pcre2",
"rayon",
"serde",
"serde_json",
"strum",
"thiserror 2.0.18",
]
[[package]]
name = "fastrand"
version = "2.3.0"
...
...
@@ -2774,6 +2826,9 @@ dependencies = [
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
...
...
@@ -4408,7 +4463,7 @@ dependencies = [
"base64 0.22.1",
"bstr",
"clap",
"fancy-regex",
"fancy-regex
0.13.0
",
"futures",
"image",
"regex",
...
...
@@ -4619,6 +4674,28 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
[[package]]
name = "pcre2"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e970b0fcce0c7ee6ef662744ff711f21ccd6f11b7cf03cd187a80e89797fc67"
dependencies = [
"libc",
"log",
"pcre2-sys",
]
[[package]]
name = "pcre2-sys"
version = "0.2.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18b9073c1a2549bd409bf4a32c94d903bb1a09bf845bc306ae148897fa0760a4"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]]
name = "pear"
version = "0.2.9"
...
...
@@ -6583,7 +6660,7 @@ dependencies = [
"anyhow",
"base64 0.22.1",
"bstr",
"fancy-regex",
"fancy-regex
0.13.0
",
"lazy_static",
"regex",
"rustc-hash 1.1.0",
...
...
@@ -7379,6 +7456,12 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8_iter"
version = "1.0.4"
...
...
@@ -8163,6 +8246,12 @@ dependencies = [
"wasmparser",
]
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]]
name = "writeable"
version = "0.6.2"
...
...
lib/llm/Cargo.toml
View file @
da810a26
...
...
@@ -143,6 +143,7 @@ tokenizers = { version = "0.21.4", default-features = false, features = [
]
}
tiktoken-rs
=
{
version
=
"0.9"
,
default-features
=
false
}
rustc-hash
=
"1.1"
fastokens
=
{
workspace
=
true
}
# backend
galil-seiferas
=
{
version
=
"0.1"
}
...
...
lib/llm/src/model_card.rs
View file @
da810a26
...
...
@@ -378,12 +378,51 @@ impl ModelDeploymentCard {
/// Load the tokenizer as a generic, backend-agnostic `Tokenizer` trait object.
/// This supports both HuggingFace `tokenizer.json` and tiktoken `.model`/`.tiktoken` files.
///
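/// When the `DYN_TOKENIZER` env var is set to `fastokens`, HuggingFace `tokenizer.json` models are encoded with `fastokens`; if `fastokens` cannot be loaded, the HuggingFace tokenizer is used instead.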
pub
fn
tokenizer
(
&
self
)
->
anyhow
::
Result
<
crate
::
tokenizers
::
Tokenizer
>
{
let
use_fast
=
match
std
::
env
::
var
(
"DYN_TOKENIZER"
)
{
Ok
(
v
)
if
v
==
"fastokens"
=>
true
,
Ok
(
v
)
if
v
==
"default"
||
v
.is_empty
()
=>
false
,
Ok
(
v
)
=>
{
tracing
::
warn!
(
value
=
%
v
,
"Unrecognized DYN_TOKENIZER value, expected 'fastokens' or 'default'; falling back to default"
);
false
}
Err
(
_
)
=>
false
,
};
match
&
self
.tokenizer
{
Some
(
TokenizerKind
::
HfTokenizerJson
(
checked_file
))
=>
{
let
p
=
checked_file
.path
()
.ok_or_else
(||
{
anyhow
::
anyhow!
(
"Tokenizer is URL-backed ({:?})"
,
checked_file
.url
())
})
?
;
// Try fastokens backend if requested
if
use_fast
{
if
let
Some
(
path_str
)
=
p
.to_str
()
{
match
crate
::
tokenizers
::
FastTokenizer
::
from_file
(
path_str
)
{
Ok
(
fast
)
=>
{
tracing
::
info!
(
"Using fastokens tokenizer backend"
);
return
Ok
(
crate
::
tokenizers
::
Tokenizer
::
from
(
Arc
::
new
(
fast
)));
}
Err
(
e
)
=>
{
tracing
::
warn!
(
%
e
,
"Failed to load fastokens, falling back to HuggingFace"
);
}
}
}
else
{
tracing
::
warn!
(
path
=
%
p
.display
(),
"Tokenizer path contains non-UTF-8 characters, skipping fastokens; falling back to HuggingFace"
);
}
}
let
hf
=
HfTokenizer
::
from_file
(
p
)
.inspect_err
(|
err
|
{
if
let
Some
(
serde_err
)
=
err
.downcast_ref
::
<
serde_json
::
Error
>
()
...
...
lib/llm/src/tokenizers.rs
View file @
da810a26
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
pub
mod
fastokens
;
pub
mod
hf
;
pub
mod
tiktoken
;
...
...
@@ -15,6 +16,7 @@ use std::{ops::Deref, path::Path};
use
crate
::
protocols
::
TokenIdType
;
pub
use
anyhow
::{
Error
,
Result
};
pub
use
fastokens
::
FastTokenizer
;
pub
use
hf
::
HuggingFaceTokenizer
;
pub
use
tiktoken
::
TikTokenTokenizer
;
...
...
lib/llm/src/tokenizers/fastokens.rs
0 → 100644
View file @
da810a26
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Fastokens backend using the `fastokens` crate for high-performance BPE encoding.
//!
//! `fastokens` only supports encoding, so this module provides a hybrid tokenizer that
//! uses `fastokens` for encoding and falls back to `HuggingFaceTokenizer` for decoding.
//! Both are loaded from the same `tokenizer.json` file.
use
std
::
path
::
Path
;
use
rayon
::
prelude
::
*
;
use
super
::{
Encoding
,
Error
,
Result
,
TokenIdType
,
hf
::
HuggingFaceTokenizer
,
traits
::{
Decoder
,
Encoder
,
Tokenizer
},
};
/// Tokenizer that pairs a `fastokens` encoder with a HuggingFace decoder.
///
/// `fastokens` only supports encoding, so decoding is delegated to
/// [`HuggingFaceTokenizer`]. Both halves are built from the same
/// `tokenizer.json` file.
pub struct FastTokenizer {
    /// Encoder from the `fastokens` crate; serves `encode` and `encode_batch`.
    fast_encoder: fastokens::Tokenizer,
    /// HuggingFace tokenizer; serves `decode`.
    hf_decoder: HuggingFaceTokenizer,
}
impl
FastTokenizer
{
pub
fn
from_file
(
path
:
&
str
)
->
Result
<
Self
>
{
let
fast_encoder
=
fastokens
::
Tokenizer
::
from_file
(
Path
::
new
(
path
))
.map_err
(|
e
|
Error
::
msg
(
format!
(
"Error loading fastokens tokenizer: {e}"
)))
?
;
let
hf_decoder
=
HuggingFaceTokenizer
::
from_file
(
path
)
?
;
Ok
(
Self
{
fast_encoder
,
hf_decoder
,
})
}
}
impl
Encoder
for
FastTokenizer
{
fn
encode
(
&
self
,
input
:
&
str
)
->
Result
<
Encoding
>
{
let
ids
=
self
.fast_encoder
.encode
(
input
)
.map_err
(|
e
|
Error
::
msg
(
format!
(
"Fastokens encode error: {e}"
)))
?
;
Ok
(
Encoding
::
Sp
(
ids
))
}
fn
encode_batch
(
&
self
,
inputs
:
&
[
&
str
])
->
Result
<
Vec
<
Encoding
>>
{
inputs
.par_iter
()
.map
(|
input
|
self
.encode
(
input
))
.collect
()
}
}
impl Decoder for FastTokenizer {
    /// Decodes `token_ids` back into text using the HuggingFace tokenizer.
    ///
    /// `fastokens` is not involved here; both arguments are forwarded unchanged
    /// to the wrapped [`HuggingFaceTokenizer`], and its result (including any
    /// error) is returned as-is.
    fn decode(&self, token_ids: &[TokenIdType], skip_special_tokens: bool) -> Result<String> {
        let Self { hf_decoder, .. } = self;
        hf_decoder.decode(token_ids, skip_special_tokens)
    }
}
// Opt `FastTokenizer` into the `Tokenizer` trait. The impl body is empty, so the
// trait requires nothing beyond what is already provided -- presumably it is a
// combined `Encoder` + `Decoder` marker; confirm against the `traits` module.
impl Tokenizer for FastTokenizer {}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizers::HuggingFaceTokenizer;

    // Minimal synthetic BPE fixture: it declares no normalizer and no
    // post-processor, which keeps it compatible with fastokens.
    // Vocab covers: H,T,a,d,e,h,i,l,o,r,s,t,w + punctuation.
    const TOKENIZER_PATH: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/tests/data/sample-models/minimal-bpe/tokenizer.json"
    );

    /// Loads the hybrid tokenizer from the shared fixture, panicking on failure.
    fn load_fast() -> FastTokenizer {
        FastTokenizer::from_file(TOKENIZER_PATH).unwrap()
    }

    /// Returns `s` with every whitespace character removed.
    fn without_whitespace(s: &str) -> String {
        s.chars().filter(|ch| !ch.is_whitespace()).collect()
    }

    #[test]
    fn test_fast_encode_decode_roundtrip() {
        let fast = load_fast();

        // Encode then decode: this exercises both backends end to end.
        // The fixture has a null decoder, so HF inserts spaces between tokens and
        // the decoded text is not byte-identical to the input. We therefore check
        // that both steps succeed, produce non-empty output, and agree once
        // whitespace is ignored.
        let original = "Hello, world!";

        let encoded = fast.encode(original).unwrap();
        assert!(!encoded.token_ids().is_empty());

        let round_tripped = fast.decode(encoded.token_ids(), true).unwrap();
        assert!(!round_tripped.is_empty());

        assert_eq!(
            without_whitespace(original),
            without_whitespace(&round_tripped),
            "non-space characters must be preserved"
        );
    }

    #[test]
    fn test_fast_matches_hf_encoding() {
        let fast = load_fast();
        let hf = HuggingFaceTokenizer::from_file(TOKENIZER_PATH).unwrap();

        let samples = ["Hello, world!", "Hello", " world", "He llo"];
        for text in &samples {
            let from_fast = fast.encode(text).unwrap();
            let from_hf = hf.encode(text).unwrap();
            assert_eq!(
                from_fast.token_ids(),
                from_hf.token_ids(),
                "fastokens and HuggingFace must produce identical token IDs for '{text}'"
            );
        }
    }

    #[test]
    fn test_fast_batch_encode() {
        let fast = load_fast();
        let inputs = &["Hello", " world", "Hello, world!"];

        let encodings = fast.encode_batch(inputs).unwrap();
        assert_eq!(encodings.len(), inputs.len());

        for (input, enc) in inputs.iter().zip(&encodings) {
            assert!(
                !enc.token_ids().is_empty(),
                "encoding for '{input}' must be non-empty"
            );
        }
    }

    #[test]
    fn test_fast_with_decode_stream() {
        use crate::tokenizers::Tokenizer as TokenizerWrapper;
        use std::sync::Arc;

        let wrapper = TokenizerWrapper::from(Arc::new(load_fast()));

        // Tokenize a prompt and its continuation separately.
        let continuation = ", world!";
        let prompt_ids = wrapper.encode("Hello").unwrap().token_ids().to_vec();
        let cont_ids = wrapper.encode(continuation).unwrap().token_ids().to_vec();

        // Feed the continuation one token at a time, gathering emitted chunks.
        let mut stream = wrapper.decode_stream(&prompt_ids, true);
        let mut streamed = String::new();
        for &id in &cont_ids {
            let Some(piece) = stream.step(id).unwrap() else {
                continue;
            };
            streamed.push_str(&piece);
        }

        // DecodeStream uses the prompt tokens as context, so the reference text is
        // decode(prompt + continuation) with decode(prompt) cut off the front --
        // a bare decode(continuation) would lack that context.
        let all_ids = [prompt_ids.as_slice(), cont_ids.as_slice()].concat();
        let full_text = wrapper.decode(&all_ids, true).unwrap();
        let prompt_text = wrapper.decode(&prompt_ids, true).unwrap();
        let (_, expected) = full_text.split_at(prompt_text.len());

        assert_eq!(
            streamed, expected,
            "streamed chunks must equal context-aware decoded continuation"
        );
    }
}
lib/llm/tests/data/sample-models/minimal-bpe/tokenizer.json
0 → 100644
View file @
da810a26
{
"version"
:
"1.0"
,
"truncation"
:
null
,
"padding"
:
null
,
"added_tokens"
:
[],
"normalizer"
:
null
,
"pre_tokenizer"
:
null
,
"post_processor"
:
null
,
"decoder"
:
null
,
"model"
:
{
"type"
:
"BPE"
,
"dropout"
:
null
,
"unk_token"
:
"<unk>"
,
"continuing_subword_prefix"
:
null
,
"end_of_word_suffix"
:
null
,
"fuse_unk"
:
false
,
"byte_fallback"
:
false
,
"ignore_merges"
:
false
,
"vocab"
:
{
"<unk>"
:
0
,
" "
:
1
,
"!"
:
2
,
","
:
3
,
"."
:
4
,
"H"
:
5
,
"T"
:
6
,
"a"
:
7
,
"d"
:
8
,
"e"
:
9
,
"h"
:
10
,
"i"
:
11
,
"l"
:
12
,
"o"
:
13
,
"r"
:
14
,
"s"
:
15
,
"t"
:
16
,
"w"
:
17
,
"He"
:
18
,
"ll"
:
19
,
"llo"
:
20
,
"or"
:
21
,
"ld"
:
22
},
"merges"
:
[
"H e"
,
"l l"
,
"ll o"
,
"o r"
,
"l d"
]
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment