"...ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "f80ad6926a0f403c8e9486203dadc55e767f29b1"
Unverified commit 66b7d2c7, authored by Paul Hendricks and committed by GitHub
Browse files

fix: updates versions and adds ahashmap to BPE (#2072)

parent f9b1757f
......@@ -36,6 +36,7 @@ dependencies = [
"cfg-if 1.0.0",
"getrandom 0.3.2",
"once_cell",
"serde",
"version_check",
"zerocopy",
]
......@@ -897,6 +898,15 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "castaway"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a"
dependencies = [
"rustversion",
]
[[package]]
name = "cbindgen"
version = "0.27.0"
......@@ -1077,6 +1087,21 @@ version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "compact_str"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a"
dependencies = [
"castaway",
"cfg-if 1.0.0",
"itoa",
"rustversion",
"ryu",
"serde",
"static_assertions",
]
[[package]]
name = "console"
version = "0.15.11"
......@@ -1488,6 +1513,15 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "dary_heap"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728"
dependencies = [
"serde",
]
[[package]]
name = "dashmap"
version = "5.5.3"
......@@ -1824,6 +1858,7 @@ dependencies = [
name = "dynamo-llm"
version = "0.3.2"
dependencies = [
"ahash",
"akin",
"aligned-vec",
"anyhow",
......@@ -1885,8 +1920,8 @@ dependencies = [
"tokio",
"tokio-stream",
"tokio-util",
"toktrie 0.6.31",
"toktrie_hf_tokenizers 0.6.31",
"toktrie 1.1.0",
"toktrie_hf_tokenizers 1.1.0",
"tracing",
"unicode-segmentation",
"url",
......@@ -3541,15 +3576,6 @@ dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.12.1"
......@@ -3777,8 +3803,8 @@ dependencies = [
[[package]]
name = "llguidance"
version = "0.7.29"
source = "git+https://github.com/guidance-ai/llguidance.git?rev=2ce5ab8#2ce5ab8196f16dd8beba5a3d874eb1ab74e0268c"
version = "1.0.0"
source = "git+https://github.com/guidance-ai/llguidance.git?rev=c432092#c432092d37b8ccd1afeeff3e7f9c9a29aae0a87e"
dependencies = [
"anyhow",
"derivre",
......@@ -3786,7 +3812,7 @@ dependencies = [
"regex-syntax 0.8.5",
"serde",
"serde_json",
"toktrie 0.7.29",
"toktrie 1.0.0",
]
[[package]]
......@@ -4104,7 +4130,7 @@ dependencies = [
[[package]]
name = "mistralrs"
version = "0.6.0"
source = "git+https://github.com/EricLBuehler/mistral.rs.git#d38a7e198469eb88e883e36d153437c1fb326315"
source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=8a4faf3#8a4faf312069cf87b215c6c79e48a44e17b71062"
dependencies = [
"anyhow",
"candle-core 0.8.0",
......@@ -4126,7 +4152,7 @@ dependencies = [
[[package]]
name = "mistralrs-audio"
version = "0.6.0"
source = "git+https://github.com/EricLBuehler/mistral.rs.git#d38a7e198469eb88e883e36d153437c1fb326315"
source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=8a4faf3#8a4faf312069cf87b215c6c79e48a44e17b71062"
dependencies = [
"anyhow",
"apodize",
......@@ -4137,7 +4163,7 @@ dependencies = [
[[package]]
name = "mistralrs-core"
version = "0.6.0"
source = "git+https://github.com/EricLBuehler/mistral.rs.git#d38a7e198469eb88e883e36d153437c1fb326315"
source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=8a4faf3#8a4faf312069cf87b215c6c79e48a44e17b71062"
dependencies = [
"ahash",
"akin",
......@@ -4217,7 +4243,7 @@ dependencies = [
"tokio",
"tokio-rayon",
"tokio-tungstenite",
"toktrie_hf_tokenizers 0.7.29",
"toktrie_hf_tokenizers 1.0.0",
"toml",
"tqdm",
"tracing",
......@@ -4231,7 +4257,7 @@ dependencies = [
[[package]]
name = "mistralrs-mcp"
version = "0.6.0"
source = "git+https://github.com/EricLBuehler/mistral.rs.git#d38a7e198469eb88e883e36d153437c1fb326315"
source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=8a4faf3#8a4faf312069cf87b215c6c79e48a44e17b71062"
dependencies = [
"anyhow",
"async-trait",
......@@ -4251,7 +4277,7 @@ dependencies = [
[[package]]
name = "mistralrs-paged-attn"
version = "0.6.0"
source = "git+https://github.com/EricLBuehler/mistral.rs.git#d38a7e198469eb88e883e36d153437c1fb326315"
source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=8a4faf3#8a4faf312069cf87b215c6c79e48a44e17b71062"
dependencies = [
"anyhow",
"bindgen_cuda 0.1.6",
......@@ -4266,7 +4292,7 @@ dependencies = [
[[package]]
name = "mistralrs-quant"
version = "0.6.0"
source = "git+https://github.com/EricLBuehler/mistral.rs.git#d38a7e198469eb88e883e36d153437c1fb326315"
source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=8a4faf3#8a4faf312069cf87b215c6c79e48a44e17b71062"
dependencies = [
"bindgen_cuda 0.1.6",
"byteorder",
......@@ -4294,7 +4320,7 @@ dependencies = [
[[package]]
name = "mistralrs-vision"
version = "0.6.0"
source = "git+https://github.com/EricLBuehler/mistral.rs.git#d38a7e198469eb88e883e36d153437c1fb326315"
source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=8a4faf3#8a4faf312069cf87b215c6c79e48a44e17b71062"
dependencies = [
"candle-core 0.8.0",
"image",
......@@ -4695,11 +4721,11 @@ checksum = "b4ce411919553d3f9fa53a0880544cda985a112117a0444d5ff1e870a893d6ea"
[[package]]
name = "onig"
version = "6.4.0"
version = "6.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0"
dependencies = [
"bitflags 1.3.2",
"bitflags 2.9.0",
"libc",
"once_cell",
"onig_sys",
......@@ -4707,9 +4733,9 @@ dependencies = [
[[package]]
name = "onig_sys"
version = "69.8.1"
version = "69.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7"
checksum = "c7f86c6eef3d6df15f23bcfb6af487cbd2fed4e5581d58d5bf1f5f8b7f6727dc"
dependencies = [
"cc",
"pkg-config",
......@@ -5519,12 +5545,12 @@ dependencies = [
[[package]]
name = "rayon-cond"
version = "0.3.0"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f"
dependencies = [
"either",
"itertools 0.11.0",
"itertools 0.14.0",
"rayon",
]
......@@ -6541,6 +6567,12 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "stop-words"
version = "0.8.1"
......@@ -7071,25 +7103,26 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokenizers"
version = "0.21.1"
version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3169b3195f925496c895caee7978a335d49218488ef22375267fba5a46a40bd7"
checksum = "4c3846d8588abed0daba25a0e47edd58ea15e450a6088b2575f5116fdb0b27ca"
dependencies = [
"ahash",
"aho-corasick",
"compact_str",
"dary_heap",
"derive_builder",
"esaxx-rs",
"fancy-regex",
"getrandom 0.2.16",
"getrandom 0.3.2",
"hf-hub",
"indicatif",
"itertools 0.13.0",
"lazy_static",
"itertools 0.14.0",
"log",
"macro_rules_attribute",
"monostate",
"onig",
"paste",
"rand 0.8.5",
"rand 0.9.1",
"rayon",
"rayon-cond",
"regex",
......@@ -7215,9 +7248,8 @@ dependencies = [
[[package]]
name = "toktrie"
version = "0.6.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32dfaa37eed7e440ee93b236b736600ccbbfa47e4a0b15f0b74a4d3f4cf9b5a3"
version = "1.0.0"
source = "git+https://github.com/guidance-ai/llguidance.git?rev=c432092#c432092d37b8ccd1afeeff3e7f9c9a29aae0a87e"
dependencies = [
"anyhow",
"bytemuck",
......@@ -7228,8 +7260,9 @@ dependencies = [
[[package]]
name = "toktrie"
version = "0.7.29"
source = "git+https://github.com/guidance-ai/llguidance.git?rev=2ce5ab8#2ce5ab8196f16dd8beba5a3d874eb1ab74e0268c"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "747b19d4f97f841cc720aaffb1fa3dbf08bc72abd9199dcf34b0fad7b1a3691c"
dependencies = [
"anyhow",
"bytemuck",
......@@ -7240,29 +7273,29 @@ dependencies = [
[[package]]
name = "toktrie_hf_tokenizers"
version = "0.6.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a5e426e5cfb8237a7244bc53e6136022987a16d66d07050ff81c0a56c5a8a25"
version = "1.0.0"
source = "git+https://github.com/guidance-ai/llguidance.git?rev=c432092#c432092d37b8ccd1afeeff3e7f9c9a29aae0a87e"
dependencies = [
"anyhow",
"log",
"serde",
"serde_json",
"tokenizers",
"toktrie 0.6.31",
"toktrie 1.0.0",
]
[[package]]
name = "toktrie_hf_tokenizers"
version = "0.7.29"
source = "git+https://github.com/guidance-ai/llguidance.git?rev=2ce5ab8#2ce5ab8196f16dd8beba5a3d874eb1ab74e0268c"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d77f942aa9bcd67f39dfeec0d5b80a40ae32e5ae38c0c58777b7d47fa393177f"
dependencies = [
"anyhow",
"log",
"serde",
"serde_json",
"tokenizers",
"toktrie 0.7.29",
"toktrie 1.1.0",
]
[[package]]
......
......@@ -45,7 +45,7 @@ derive-getters = { version = "0.5" }
either = { version = "1.13", features = ["serde"] }
etcd-client = { version = "0.14", features = ["tls"] }
futures = { version = "0.3" }
hf-hub = { version = "0.4.2", default-features = false, features = ["tokio", "rustls-tls"] }
hf-hub = { version = "0.4.2", default-features = false, features = ["tokio", "rustls-tls", "ureq"] }
humantime = { version = "2.2.0" }
libc = { version = "0.2" }
oneshot = { version = "0.1.11", features = ["std", "async"] }
......
......@@ -17,6 +17,20 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]]
name = "ahash"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [
"cfg-if 1.0.0",
"getrandom 0.3.2",
"once_cell",
"serde",
"version_check",
"zerocopy",
]
[[package]]
name = "aho-corasick"
version = "1.1.3"
......@@ -593,6 +607,15 @@ dependencies = [
"zip",
]
[[package]]
name = "castaway"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a"
dependencies = [
"rustversion",
]
[[package]]
name = "cc"
version = "1.2.24"
......@@ -698,6 +721,21 @@ version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "compact_str"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a"
dependencies = [
"castaway",
"cfg-if 1.0.0",
"itoa",
"rustversion",
"ryu",
"serde",
"static_assertions",
]
[[package]]
name = "concurrent-queue"
version = "2.5.0"
......@@ -918,6 +956,15 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "dary_heap"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728"
dependencies = [
"serde",
]
[[package]]
name = "dashmap"
version = "5.5.3"
......@@ -1124,6 +1171,7 @@ dependencies = [
name = "dynamo-llm"
version = "0.3.2"
dependencies = [
"ahash",
"akin",
"anyhow",
"async-nats",
......@@ -2425,15 +2473,6 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itertools"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.13.0"
......@@ -3097,11 +3136,11 @@ checksum = "b4ce411919553d3f9fa53a0880544cda985a112117a0444d5ff1e870a893d6ea"
[[package]]
name = "onig"
version = "6.4.0"
version = "6.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0"
dependencies = [
"bitflags 1.3.2",
"bitflags 2.9.0",
"libc",
"once_cell",
"onig_sys",
......@@ -3109,9 +3148,9 @@ dependencies = [
[[package]]
name = "onig_sys"
version = "69.8.1"
version = "69.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7"
checksum = "c7f86c6eef3d6df15f23bcfb6af487cbd2fed4e5581d58d5bf1f5f8b7f6727dc"
dependencies = [
"cc",
"pkg-config",
......@@ -3758,12 +3797,12 @@ dependencies = [
[[package]]
name = "rayon-cond"
version = "0.3.0"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f"
dependencies = [
"either",
"itertools 0.11.0",
"itertools 0.14.0",
"rayon",
]
......@@ -4378,6 +4417,12 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "strsim"
version = "0.11.1"
......@@ -4645,24 +4690,26 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokenizers"
version = "0.21.1"
version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3169b3195f925496c895caee7978a335d49218488ef22375267fba5a46a40bd7"
checksum = "4c3846d8588abed0daba25a0e47edd58ea15e450a6088b2575f5116fdb0b27ca"
dependencies = [
"ahash",
"aho-corasick",
"compact_str",
"dary_heap",
"derive_builder",
"esaxx-rs",
"getrandom 0.2.16",
"fancy-regex",
"getrandom 0.3.2",
"hf-hub",
"indicatif",
"itertools 0.13.0",
"lazy_static",
"itertools 0.14.0",
"log",
"macro_rules_attribute",
"monostate",
"onig",
"paste",
"rand 0.8.5",
"rand 0.9.1",
"rayon",
"rayon-cond",
"regex",
......@@ -4765,9 +4812,9 @@ dependencies = [
[[package]]
name = "toktrie"
version = "0.6.31"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32dfaa37eed7e440ee93b236b736600ccbbfa47e4a0b15f0b74a4d3f4cf9b5a3"
checksum = "747b19d4f97f841cc720aaffb1fa3dbf08bc72abd9199dcf34b0fad7b1a3691c"
dependencies = [
"anyhow",
"bytemuck",
......@@ -4778,9 +4825,9 @@ dependencies = [
[[package]]
name = "toktrie_hf_tokenizers"
version = "0.6.31"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a5e426e5cfb8237a7244bc53e6136022987a16d66d07050ff81c0a56c5a8a25"
checksum = "d77f942aa9bcd67f39dfeec0d5b80a40ae32e5ae38c0c58777b7d47fa393177f"
dependencies = [
"anyhow",
"log",
......
......@@ -39,7 +39,7 @@ async-stream = { workspace = true }
async-trait = { workspace = true }
either = { workspace = true }
indexmap = { version = "2.9.0", features = ["serde"] }
mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", version = "0.6.0" }
mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", version = "0.6.0", rev = "8a4faf3" }
serde_json = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
......@@ -101,7 +101,7 @@ unicode-segmentation = "1.12"
axum = { workspace = true }
# tokenizers
tokenizers = { version = "0.21.1", default-features = false, features = [
tokenizers = { version = "0.21.2", default-features = false, features = [
"onig",
"esaxx_fast",
"rustls-tls",
......@@ -110,8 +110,8 @@ sentencepiece = { version = "0.11.2", optional = true }
# backend
galil-seiferas = { version = "0.1" }
toktrie = { version = "0.6.28" }
toktrie_hf_tokenizers = { version = "0.6.28" }
toktrie = { version = "1.1" }
toktrie_hf_tokenizers = { version = "1.1" }
# preprocessor
bs62 = { version = "0.1" }
......@@ -127,6 +127,7 @@ memmap2 = "0.9.5"
# Publishers
zeromq = "0.4.1"
rmp-serde = "1.3"
ahash = "0.8.12"
[dev-dependencies]
approx = "0.5"
......
......@@ -27,8 +27,7 @@
// https://github.com/huggingface/transformers/blob/8685b3c5d2dd2550527773d2a02499495a759e31/src/transformers/convert_slow_tokenizer.py
use std::collections::HashMap;
use ahash::AHashMap;
use anyhow::Result;
use itertools::Itertools;
use tokenizers::{
......@@ -236,7 +235,8 @@ fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokens
})
.collect::<Vec<_>>();
let mut vocab = HashMap::new();
// Use ahash::AHashMap so that we satisfy Into<AHashMap<_>> bounds
let mut vocab: AHashMap<String, u32> = AHashMap::new();
for (i, token) in p.tokens.iter().enumerate() {
#[allow(clippy::cast_possible_truncation)]
vocab.insert(token.clone(), i as u32);
......@@ -266,7 +266,10 @@ fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokens
false, true, true,
)));
if add_bos_token.is_some_and(|x| x) {
let mut special_toks = HashMap::new();
// Use ahash::AHashMap so that we satisfy Into<AHashMap<_>> bounds
let mut special_toks: AHashMap<String, processors::template::SpecialToken> =
AHashMap::new();
special_toks.insert(
p.tokens[bos as usize].clone(),
template::SpecialToken::new(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment