Unverified Commit da810a26 authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

feat: integrate fastokens BPE tokenizer backend (#7387)

parent cdf66b11
...@@ -13,3 +13,9 @@ rustflags = ["-C", "target-cpu=x86-64-v3", "--cfg", "tokio_unstable"] ...@@ -13,3 +13,9 @@ rustflags = ["-C", "target-cpu=x86-64-v3", "--cfg", "tokio_unstable"]
[target.aarch64-unknown-linux-gnu] [target.aarch64-unknown-linux-gnu]
rustflags = ["-C", "target-cpu=neoverse-n1", "--cfg", "tokio_unstable"] rustflags = ["-C", "target-cpu=neoverse-n1", "--cfg", "tokio_unstable"]
# Static-link pcre2 C library (used by the fastokens tokenizer crate).
# Without this, pcre2-sys tries to find a system libpcre2 via pkg-config,
# which breaks Docker builds and bundles a .so into the Python wheel.
[env]
PCRE2_SYS_STATIC = "1"
...@@ -1063,7 +1063,7 @@ version = "3.1.1" ...@@ -1063,7 +1063,7 @@ version = "3.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34"
dependencies = [ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.48.0",
] ]
[[package]] [[package]]
...@@ -1481,6 +1481,12 @@ dependencies = [ ...@@ -1481,6 +1481,12 @@ dependencies = [
"syn 2.0.117", "syn 2.0.117",
] ]
[[package]]
name = "daachorse"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63b7ef7a4be509357f4804d0a22e830daddb48f19fd604e4ad32ddce04a94c36"
[[package]] [[package]]
name = "darling" name = "darling"
version = "0.20.11" version = "0.20.11"
...@@ -1986,6 +1992,7 @@ dependencies = [ ...@@ -1986,6 +1992,7 @@ dependencies = [
"dynamo-runtime", "dynamo-runtime",
"dynamo-tokens", "dynamo-tokens",
"either", "either",
"fastokens",
"ffmpeg-next", "ffmpeg-next",
"flate2", "flate2",
"futures", "futures",
...@@ -2418,6 +2425,36 @@ dependencies = [ ...@@ -2418,6 +2425,36 @@ dependencies = [
"regex-syntax", "regex-syntax",
] ]
[[package]]
name = "fancy-regex"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
dependencies = [
"bit-set 0.8.0",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "fastokens"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aca43986686f3dff724cc465e0afcf883c361112474072b1d825058852b25f9c"
dependencies = [
"daachorse",
"fancy-regex 0.17.0",
"hf-hub",
"icu_normalizer",
"memchr",
"pcre2",
"rayon",
"serde",
"serde_json",
"strum",
"thiserror 2.0.18",
]
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "2.3.0" version = "2.3.0"
...@@ -3275,6 +3312,9 @@ dependencies = [ ...@@ -3275,6 +3312,9 @@ dependencies = [
"icu_properties", "icu_properties",
"icu_provider", "icu_provider",
"smallvec", "smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec", "zerovec",
] ]
...@@ -5374,6 +5414,28 @@ version = "0.2.3" ...@@ -5374,6 +5414,28 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
[[package]]
name = "pcre2"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e970b0fcce0c7ee6ef662744ff711f21ccd6f11b7cf03cd187a80e89797fc67"
dependencies = [
"libc",
"log",
"pcre2-sys",
]
[[package]]
name = "pcre2-sys"
version = "0.2.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18b9073c1a2549bd409bf4a32c94d903bb1a09bf845bc306ae148897fa0760a4"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]] [[package]]
name = "pear" name = "pear"
version = "0.2.9" version = "0.2.9"
...@@ -5800,7 +5862,7 @@ version = "0.13.5" ...@@ -5800,7 +5862,7 @@ version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf"
dependencies = [ dependencies = [
"heck 0.5.0", "heck 0.4.1",
"itertools 0.14.0", "itertools 0.14.0",
"log", "log",
"multimap", "multimap",
...@@ -5820,7 +5882,7 @@ version = "0.14.3" ...@@ -5820,7 +5882,7 @@ version = "0.14.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7"
dependencies = [ dependencies = [
"heck 0.5.0", "heck 0.4.1",
"itertools 0.14.0", "itertools 0.14.0",
"log", "log",
"multimap", "multimap",
...@@ -8559,6 +8621,12 @@ version = "0.7.6" ...@@ -8559,6 +8621,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]] [[package]]
name = "utf8_iter" name = "utf8_iter"
version = "1.0.4" version = "1.0.4"
...@@ -9011,7 +9079,7 @@ version = "0.1.11" ...@@ -9011,7 +9079,7 @@ version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.48.0",
] ]
[[package]] [[package]]
...@@ -9428,6 +9496,12 @@ dependencies = [ ...@@ -9428,6 +9496,12 @@ dependencies = [
"wasmparser", "wasmparser",
] ]
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]] [[package]]
name = "writeable" name = "writeable"
version = "0.6.2" version = "0.6.2"
......
...@@ -46,6 +46,7 @@ dynamo-mocker = { path = "lib/mocker", version = "1.0.0" } ...@@ -46,6 +46,7 @@ dynamo-mocker = { path = "lib/mocker", version = "1.0.0" }
dynamo-kv-router = { path = "lib/kv-router", version = "1.0.0", features = ["metrics", "runtime-protocols"] } dynamo-kv-router = { path = "lib/kv-router", version = "1.0.0", features = ["metrics", "runtime-protocols"] }
dynamo-async-openai = { path = "lib/async-openai", version = "1.0.0", features = ["byot"] } dynamo-async-openai = { path = "lib/async-openai", version = "1.0.0", features = ["byot"] }
dynamo-parsers = { path = "lib/parsers", version = "1.0.0" } dynamo-parsers = { path = "lib/parsers", version = "1.0.0" }
fastokens = { version = "0.1.0" }
# kvbm # kvbm
kvbm-common = { path = "lib/kvbm-common", version = "0.1.0" } kvbm-common = { path = "lib/kvbm-common", version = "0.1.0" }
......
...@@ -76,6 +76,9 @@ class FrontendConfig(KvRouterConfigBase): ...@@ -76,6 +76,9 @@ class FrontendConfig(KvRouterConfigBase):
enable_streaming_tool_dispatch: bool enable_streaming_tool_dispatch: bool
enable_streaming_reasoning_dispatch: bool enable_streaming_reasoning_dispatch: bool
preprocess_workers: int preprocess_workers: int
tokenizer_backend: str
_VALID_TOKENIZER_BACKENDS = {"default", "fastokens"}
def validate(self) -> None: def validate(self) -> None:
if bool(self.tls_cert_path) ^ bool(self.tls_key_path): # ^ is XOR if bool(self.tls_cert_path) ^ bool(self.tls_key_path): # ^ is XOR
...@@ -88,6 +91,11 @@ class FrontendConfig(KvRouterConfigBase): ...@@ -88,6 +91,11 @@ class FrontendConfig(KvRouterConfigBase):
) )
if self.router_enable_cache_control and self.router_mode != "kv": if self.router_enable_cache_control and self.router_mode != "kv":
raise ValueError("--enable-cache-control requires --router-mode=kv") raise ValueError("--enable-cache-control requires --router-mode=kv")
if self.tokenizer_backend not in self._VALID_TOKENIZER_BACKENDS:
raise ValueError(
f"--tokenizer: invalid value '{self.tokenizer_backend}' "
f"(choose from {sorted(self._VALID_TOKENIZER_BACKENDS)})"
)
@register_encoder(FrontendConfig) @register_encoder(FrontendConfig)
...@@ -424,3 +432,17 @@ class FrontendArgGroup(ArgGroup): ...@@ -424,3 +432,17 @@ class FrontendArgGroup(ArgGroup):
), ),
arg_type=int, arg_type=int,
) )
add_argument(
g,
flag_name="--tokenizer",
env_var="DYN_TOKENIZER",
default="default",
dest="tokenizer_backend",
help=(
"Tokenizer backend for BPE models: 'default' (HuggingFace tokenizers library) "
"or 'fastokens' (fastokens crate for high-performance BPE encoding). "
"Decoding always uses HuggingFace. Has no effect on TikToken models."
),
choices=["default", "fastokens"],
)
...@@ -165,6 +165,10 @@ async def async_main(): ...@@ -165,6 +165,10 @@ async def async_main():
config, vllm_flags, sglang_flags = parse_args() config, vllm_flags, sglang_flags = parse_args()
dump_config(config.dump_config_to, config) dump_config(config.dump_config_to, config)
os.environ["DYN_EVENT_PLANE"] = config.event_plane os.environ["DYN_EVENT_PLANE"] = config.event_plane
if config.tokenizer_backend == "fastokens":
os.environ["DYN_TOKENIZER"] = "fastokens"
else:
os.environ.pop("DYN_TOKENIZER", None)
logger.info( logger.info(
f"Request migration {'enabled' if config.migration_limit > 0 else 'disabled'} " f"Request migration {'enabled' if config.migration_limit > 0 else 'disabled'} "
f"(limit: {config.migration_limit})" f"(limit: {config.migration_limit})"
......
...@@ -585,7 +585,16 @@ version = "0.5.3" ...@@ -585,7 +585,16 @@ version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
dependencies = [ dependencies = [
"bit-vec", "bit-vec 0.6.3",
]
[[package]]
name = "bit-set"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
dependencies = [
"bit-vec 0.8.0",
] ]
[[package]] [[package]]
...@@ -594,6 +603,12 @@ version = "0.6.3" ...@@ -594,6 +603,12 @@ version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
[[package]]
name = "bit-vec"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
[[package]] [[package]]
name = "bit_field" name = "bit_field"
version = "0.10.3" version = "0.10.3"
...@@ -1139,6 +1154,12 @@ dependencies = [ ...@@ -1139,6 +1154,12 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "daachorse"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63b7ef7a4be509357f4804d0a22e830daddb48f19fd604e4ad32ddce04a94c36"
[[package]] [[package]]
name = "darling" name = "darling"
version = "0.20.11" version = "0.20.11"
...@@ -1608,6 +1629,7 @@ dependencies = [ ...@@ -1608,6 +1629,7 @@ dependencies = [
"dynamo-runtime", "dynamo-runtime",
"dynamo-tokens", "dynamo-tokens",
"either", "either",
"fastokens",
"flate2", "flate2",
"futures", "futures",
"futures-util", "futures-util",
...@@ -2004,11 +2026,41 @@ version = "0.13.0" ...@@ -2004,11 +2026,41 @@ version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
dependencies = [ dependencies = [
"bit-set", "bit-set 0.5.3",
"regex-automata", "regex-automata",
"regex-syntax", "regex-syntax",
] ]
[[package]]
name = "fancy-regex"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
dependencies = [
"bit-set 0.8.0",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "fastokens"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aca43986686f3dff724cc465e0afcf883c361112474072b1d825058852b25f9c"
dependencies = [
"daachorse",
"fancy-regex 0.17.0",
"hf-hub",
"icu_normalizer",
"memchr",
"pcre2",
"rayon",
"serde",
"serde_json",
"strum",
"thiserror 2.0.18",
]
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "2.3.0" version = "2.3.0"
...@@ -2705,6 +2757,9 @@ dependencies = [ ...@@ -2705,6 +2757,9 @@ dependencies = [
"icu_properties", "icu_properties",
"icu_provider", "icu_provider",
"smallvec", "smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec", "zerovec",
] ]
...@@ -4351,7 +4406,7 @@ dependencies = [ ...@@ -4351,7 +4406,7 @@ dependencies = [
"base64 0.22.1", "base64 0.22.1",
"bstr", "bstr",
"clap", "clap",
"fancy-regex", "fancy-regex 0.13.0",
"futures", "futures",
"image", "image",
"regex", "regex",
...@@ -4562,6 +4617,28 @@ version = "0.2.3" ...@@ -4562,6 +4617,28 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
[[package]]
name = "pcre2"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e970b0fcce0c7ee6ef662744ff711f21ccd6f11b7cf03cd187a80e89797fc67"
dependencies = [
"libc",
"log",
"pcre2-sys",
]
[[package]]
name = "pcre2-sys"
version = "0.2.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18b9073c1a2549bd409bf4a32c94d903bb1a09bf845bc306ae148897fa0760a4"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]] [[package]]
name = "pear" name = "pear"
version = "0.2.9" version = "0.2.9"
...@@ -6516,7 +6593,7 @@ dependencies = [ ...@@ -6516,7 +6593,7 @@ dependencies = [
"anyhow", "anyhow",
"base64 0.22.1", "base64 0.22.1",
"bstr", "bstr",
"fancy-regex", "fancy-regex 0.13.0",
"lazy_static", "lazy_static",
"regex", "regex",
"rustc-hash 1.1.0", "rustc-hash 1.1.0",
...@@ -7312,6 +7389,12 @@ dependencies = [ ...@@ -7312,6 +7389,12 @@ dependencies = [
"serde_derive", "serde_derive",
] ]
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]] [[package]]
name = "utf8_iter" name = "utf8_iter"
version = "1.0.4" version = "1.0.4"
...@@ -8079,6 +8162,12 @@ dependencies = [ ...@@ -8079,6 +8162,12 @@ dependencies = [
"wasmparser", "wasmparser",
] ]
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]] [[package]]
name = "writeable" name = "writeable"
version = "0.6.2" version = "0.6.2"
......
...@@ -603,7 +603,16 @@ version = "0.5.3" ...@@ -603,7 +603,16 @@ version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
dependencies = [ dependencies = [
"bit-vec", "bit-vec 0.6.3",
]
[[package]]
name = "bit-set"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
dependencies = [
"bit-vec 0.8.0",
] ]
[[package]] [[package]]
...@@ -612,6 +621,12 @@ version = "0.6.3" ...@@ -612,6 +621,12 @@ version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
[[package]]
name = "bit-vec"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
[[package]] [[package]]
name = "bit_field" name = "bit_field"
version = "0.10.3" version = "0.10.3"
...@@ -1157,6 +1172,12 @@ dependencies = [ ...@@ -1157,6 +1172,12 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "daachorse"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63b7ef7a4be509357f4804d0a22e830daddb48f19fd604e4ad32ddce04a94c36"
[[package]] [[package]]
name = "darling" name = "darling"
version = "0.20.11" version = "0.20.11"
...@@ -1621,6 +1642,7 @@ dependencies = [ ...@@ -1621,6 +1642,7 @@ dependencies = [
"dynamo-runtime", "dynamo-runtime",
"dynamo-tokens", "dynamo-tokens",
"either", "either",
"fastokens",
"ffmpeg-next", "ffmpeg-next",
"flate2", "flate2",
"futures", "futures",
...@@ -2048,11 +2070,41 @@ version = "0.13.0" ...@@ -2048,11 +2070,41 @@ version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
dependencies = [ dependencies = [
"bit-set", "bit-set 0.5.3",
"regex-automata", "regex-automata",
"regex-syntax", "regex-syntax",
] ]
[[package]]
name = "fancy-regex"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72cf461f865c862bb7dc573f643dd6a2b6842f7c30b07882b56bd148cc2761b8"
dependencies = [
"bit-set 0.8.0",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "fastokens"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aca43986686f3dff724cc465e0afcf883c361112474072b1d825058852b25f9c"
dependencies = [
"daachorse",
"fancy-regex 0.17.0",
"hf-hub",
"icu_normalizer",
"memchr",
"pcre2",
"rayon",
"serde",
"serde_json",
"strum",
"thiserror 2.0.18",
]
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "2.3.0" version = "2.3.0"
...@@ -2774,6 +2826,9 @@ dependencies = [ ...@@ -2774,6 +2826,9 @@ dependencies = [
"icu_properties", "icu_properties",
"icu_provider", "icu_provider",
"smallvec", "smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec", "zerovec",
] ]
...@@ -4408,7 +4463,7 @@ dependencies = [ ...@@ -4408,7 +4463,7 @@ dependencies = [
"base64 0.22.1", "base64 0.22.1",
"bstr", "bstr",
"clap", "clap",
"fancy-regex", "fancy-regex 0.13.0",
"futures", "futures",
"image", "image",
"regex", "regex",
...@@ -4619,6 +4674,28 @@ version = "0.2.3" ...@@ -4619,6 +4674,28 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
[[package]]
name = "pcre2"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e970b0fcce0c7ee6ef662744ff711f21ccd6f11b7cf03cd187a80e89797fc67"
dependencies = [
"libc",
"log",
"pcre2-sys",
]
[[package]]
name = "pcre2-sys"
version = "0.2.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18b9073c1a2549bd409bf4a32c94d903bb1a09bf845bc306ae148897fa0760a4"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]] [[package]]
name = "pear" name = "pear"
version = "0.2.9" version = "0.2.9"
...@@ -6583,7 +6660,7 @@ dependencies = [ ...@@ -6583,7 +6660,7 @@ dependencies = [
"anyhow", "anyhow",
"base64 0.22.1", "base64 0.22.1",
"bstr", "bstr",
"fancy-regex", "fancy-regex 0.13.0",
"lazy_static", "lazy_static",
"regex", "regex",
"rustc-hash 1.1.0", "rustc-hash 1.1.0",
...@@ -7379,6 +7456,12 @@ dependencies = [ ...@@ -7379,6 +7456,12 @@ dependencies = [
"serde_derive", "serde_derive",
] ]
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]] [[package]]
name = "utf8_iter" name = "utf8_iter"
version = "1.0.4" version = "1.0.4"
...@@ -8163,6 +8246,12 @@ dependencies = [ ...@@ -8163,6 +8246,12 @@ dependencies = [
"wasmparser", "wasmparser",
] ]
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]] [[package]]
name = "writeable" name = "writeable"
version = "0.6.2" version = "0.6.2"
......
...@@ -143,6 +143,7 @@ tokenizers = { version = "0.21.4", default-features = false, features = [ ...@@ -143,6 +143,7 @@ tokenizers = { version = "0.21.4", default-features = false, features = [
] } ] }
tiktoken-rs = { version = "0.9", default-features = false } tiktoken-rs = { version = "0.9", default-features = false }
rustc-hash = "1.1" rustc-hash = "1.1"
fastokens = { workspace = true }
# backend # backend
galil-seiferas = { version = "0.1" } galil-seiferas = { version = "0.1" }
......
...@@ -378,12 +378,51 @@ impl ModelDeploymentCard { ...@@ -378,12 +378,51 @@ impl ModelDeploymentCard {
/// Load the tokenizer as a generic, backend-agnostic `Tokenizer` trait object. /// Load the tokenizer as a generic, backend-agnostic `Tokenizer` trait object.
/// This supports both HuggingFace `tokenizer.json` and tiktoken `.model`/`.tiktoken` files. /// This supports both HuggingFace `tokenizer.json` and tiktoken `.model`/`.tiktoken` files.
///
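/// When the `DYN_TOKENIZER` env var is set to `fastokens`, HuggingFace `tokenizer.json` models
/// are encoded with `fastokens`; if it cannot be loaded, the HuggingFace tokenizer is used instead.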
pub fn tokenizer(&self) -> anyhow::Result<crate::tokenizers::Tokenizer> { pub fn tokenizer(&self) -> anyhow::Result<crate::tokenizers::Tokenizer> {
let use_fast = match std::env::var("DYN_TOKENIZER") {
Ok(v) if v == "fastokens" => true,
Ok(v) if v == "default" || v.is_empty() => false,
Ok(v) => {
tracing::warn!(
value = %v,
"Unrecognized DYN_TOKENIZER value, expected 'fastokens' or 'default'; falling back to default"
);
false
}
Err(_) => false,
};
match &self.tokenizer { match &self.tokenizer {
Some(TokenizerKind::HfTokenizerJson(checked_file)) => { Some(TokenizerKind::HfTokenizerJson(checked_file)) => {
let p = checked_file.path().ok_or_else(|| { let p = checked_file.path().ok_or_else(|| {
anyhow::anyhow!("Tokenizer is URL-backed ({:?})", checked_file.url()) anyhow::anyhow!("Tokenizer is URL-backed ({:?})", checked_file.url())
})?; })?;
// Try fastokens backend if requested
if use_fast {
if let Some(path_str) = p.to_str() {
match crate::tokenizers::FastTokenizer::from_file(path_str) {
Ok(fast) => {
tracing::info!("Using fastokens tokenizer backend");
return Ok(crate::tokenizers::Tokenizer::from(Arc::new(fast)));
}
Err(e) => {
tracing::warn!(
%e,
"Failed to load fastokens, falling back to HuggingFace"
);
}
}
} else {
tracing::warn!(
path = %p.display(),
"Tokenizer path contains non-UTF-8 characters, skipping fastokens; falling back to HuggingFace"
);
}
}
let hf = HfTokenizer::from_file(p) let hf = HfTokenizer::from_file(p)
.inspect_err(|err| { .inspect_err(|err| {
if let Some(serde_err) = err.downcast_ref::<serde_json::Error>() if let Some(serde_err) = err.downcast_ref::<serde_json::Error>()
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
pub mod fastokens;
pub mod hf; pub mod hf;
pub mod tiktoken; pub mod tiktoken;
...@@ -15,6 +16,7 @@ use std::{ops::Deref, path::Path}; ...@@ -15,6 +16,7 @@ use std::{ops::Deref, path::Path};
use crate::protocols::TokenIdType; use crate::protocols::TokenIdType;
pub use anyhow::{Error, Result}; pub use anyhow::{Error, Result};
pub use fastokens::FastTokenizer;
pub use hf::HuggingFaceTokenizer; pub use hf::HuggingFaceTokenizer;
pub use tiktoken::TikTokenTokenizer; pub use tiktoken::TikTokenTokenizer;
......
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Fastokens backend using the `fastokens` crate for high-performance BPE encoding.
//!
//! `fastokens` only supports encoding, so this module provides a hybrid tokenizer that
//! uses `fastokens` for encoding and delegates all decoding to `HuggingFaceTokenizer`.
//! Both are loaded from the same `tokenizer.json` file.
use std::path::Path;
use rayon::prelude::*;
use super::{
Encoding, Error, Result, TokenIdType,
hf::HuggingFaceTokenizer,
traits::{Decoder, Encoder, Tokenizer},
};
/// Tokenizer that pairs a `fastokens` encoder with a HuggingFace decoder.
///
/// The `fastokens` half serves the encode path only; every decode call is
/// answered by the HuggingFace tokenizer. [`FastTokenizer::from_file`] builds
/// both halves from one `tokenizer.json` path.
pub struct FastTokenizer {
    /// Encoder backed by the `fastokens` crate.
    fast_encoder: fastokens::Tokenizer,
    /// HuggingFace tokenizer that handles decoding.
    hf_decoder: HuggingFaceTokenizer,
}
impl FastTokenizer {
pub fn from_file(path: &str) -> Result<Self> {
let fast_encoder = fastokens::Tokenizer::from_file(Path::new(path))
.map_err(|e| Error::msg(format!("Error loading fastokens tokenizer: {e}")))?;
let hf_decoder = HuggingFaceTokenizer::from_file(path)?;
Ok(Self {
fast_encoder,
hf_decoder,
})
}
}
impl Encoder for FastTokenizer {
fn encode(&self, input: &str) -> Result<Encoding> {
let ids = self
.fast_encoder
.encode(input)
.map_err(|e| Error::msg(format!("Fastokens encode error: {e}")))?;
Ok(Encoding::Sp(ids))
}
fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>> {
inputs.par_iter().map(|input| self.encode(input)).collect()
}
}
impl Decoder for FastTokenizer {
    /// Decodes `token_ids` by delegating to the HuggingFace decoder,
    /// forwarding `skip_special_tokens` unchanged.
    fn decode(&self, token_ids: &[TokenIdType], skip_special_tokens: bool) -> Result<String> {
        let Self { hf_decoder, .. } = self;
        hf_decoder.decode(token_ids, skip_special_tokens)
    }
}
// `FastTokenizer` opts into the `Tokenizer` trait with an empty impl body, so no
// items are overridden here; encoding and decoding come from the `Encoder` and
// `Decoder` impls in this file.
// NOTE(review): presumably `Tokenizer` is a marker over `Encoder` + `Decoder` -- confirm in `traits`.
impl Tokenizer for FastTokenizer {}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizers::HuggingFaceTokenizer;

    // Minimal synthetic BPE tokenizer with no normalizer or post-processor --
    // compatible with fastokens. Vocab covers: H,T,a,d,e,h,i,l,o,r,s,t,w + punctuation.
    const TOKENIZER_PATH: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/tests/data/sample-models/minimal-bpe/tokenizer.json"
    );

    /// Encoding then decoding must succeed and preserve every non-whitespace
    /// character of the input.
    #[test]
    fn test_fast_encode_decode_roundtrip() {
        let tokenizer = FastTokenizer::from_file(TOKENIZER_PATH).unwrap();

        // Encode then decode: verifies both paths execute without error.
        // With a null decoder, HF inserts spaces between tokens so exact equality
        // is not expected here -- we just verify the operations succeed and produce
        // non-empty results.
        let text = "Hello, world!";
        let encoding = tokenizer.encode(text).unwrap();
        assert!(!encoding.token_ids().is_empty());

        let decoded = tokenizer.decode(encoding.token_ids(), true).unwrap();
        assert!(!decoded.is_empty());

        // The decoded text should contain the same non-space characters
        let enc_chars: String = text.chars().filter(|c| !c.is_whitespace()).collect();
        let dec_chars: String = decoded.chars().filter(|c| !c.is_whitespace()).collect();
        assert_eq!(
            enc_chars, dec_chars,
            "non-space characters must be preserved"
        );
    }

    /// The fastokens encode path must yield the same token ids as the
    /// HuggingFace tokenizer loaded from the same file.
    #[test]
    fn test_fast_matches_hf_encoding() {
        let fast = FastTokenizer::from_file(TOKENIZER_PATH).unwrap();
        let hf = HuggingFaceTokenizer::from_file(TOKENIZER_PATH).unwrap();

        for text in &["Hello, world!", "Hello", " world", "He llo"] {
            let fast_ids = fast.encode(text).unwrap();
            let hf_ids = hf.encode(text).unwrap();
            assert_eq!(
                fast_ids.token_ids(),
                hf_ids.token_ids(),
                "fastokens and HuggingFace must produce identical token IDs for '{text}'"
            );
        }
    }

    /// Batch encoding must return one non-empty encoding per input.
    #[test]
    fn test_fast_batch_encode() {
        let tokenizer = FastTokenizer::from_file(TOKENIZER_PATH).unwrap();
        let inputs = &["Hello", " world", "Hello, world!"];

        let encodings = tokenizer.encode_batch(inputs).unwrap();
        assert_eq!(encodings.len(), inputs.len());

        for (enc, input) in encodings.iter().zip(inputs.iter()) {
            assert!(
                !enc.token_ids().is_empty(),
                "encoding for '{input}' must be non-empty"
            );
        }
    }

    /// Stepping a decode stream over the continuation tokens must reproduce the
    /// context-aware decoded continuation.
    #[test]
    fn test_fast_with_decode_stream() {
        use crate::tokenizers::Tokenizer as TokenizerWrapper;
        use std::sync::Arc;

        let tokenizer = Arc::new(FastTokenizer::from_file(TOKENIZER_PATH).unwrap());
        let wrapper = TokenizerWrapper::from(tokenizer);

        // Encode a prompt and a continuation, then step through the decode stream
        let prompt_ids = wrapper.encode("Hello").unwrap().token_ids().to_vec();
        let continuation = ", world!";
        let cont_ids = wrapper.encode(continuation).unwrap().token_ids().to_vec();

        let mut stream = wrapper.decode_stream(&prompt_ids, true);

        // Accumulate incremental chunks from decode_stream
        let mut accumulated = String::new();
        for id in &cont_ids {
            if let Some(chunk) = stream.step(*id).unwrap() {
                accumulated.push_str(&chunk);
            }
        }

        // DecodeStream uses prompt tokens as context, so the expected text is
        // decode(prompt + continuation) minus decode(prompt) -- not a bare
        // decode(continuation) which lacks the surrounding context.
        let mut all_ids = prompt_ids.clone();
        all_ids.extend_from_slice(&cont_ids);
        let full_text = wrapper.decode(&all_ids, true).unwrap();
        let prompt_text = wrapper.decode(&prompt_ids, true).unwrap();

        // Strip the prompt as a real prefix instead of slicing by byte length:
        // a raw `[len..]` slice would silently compare the wrong span if the
        // prompt text were not a prefix, and panic opaquely on a bad boundary.
        let expected = full_text
            .strip_prefix(prompt_text.as_str())
            .expect("decode(prompt + continuation) must start with decode(prompt)");

        assert_eq!(
            accumulated, expected,
            "streamed chunks must equal context-aware decoded continuation"
        );
    }
}
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [],
"normalizer": null,
"pre_tokenizer": null,
"post_processor": null,
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<unk>": 0,
" ": 1,
"!": 2,
",": 3,
".": 4,
"H": 5,
"T": 6,
"a": 7,
"d": 8,
"e": 9,
"h": 10,
"i": 11,
"l": 12,
"o": 13,
"r": 14,
"s": 15,
"t": 16,
"w": 17,
"He": 18,
"ll": 19,
"llo": 20,
"or": 21,
"ld": 22
},
"merges": [
"H e",
"l l",
"ll o",
"o r",
"l d"
]
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment