Commit c7067fc2 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

feat: Build pre-processor from GGUF (#344)

This lets us do:
```
dynamo-run out=llamacpp <gguf_file>
```

Previously a `--model-config <hf-repo>` was also required, to configure our tokenizer.
parent d29f7fcc
...@@ -695,11 +695,11 @@ version = "0.8.0" ...@@ -695,11 +695,11 @@ version = "0.8.0"
source = "git+https://github.com/EricLBuehler/candle.git?rev=496a8d2b#496a8d2bf8f88e3be4ea27332a209d66e8b404f4" source = "git+https://github.com/EricLBuehler/candle.git?rev=496a8d2b#496a8d2bf8f88e3be4ea27332a209d66e8b404f4"
dependencies = [ dependencies = [
"byteorder", "byteorder",
"candle-kernels", "candle-kernels 0.8.0",
"candle-metal-kernels", "candle-metal-kernels",
"cudarc 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)", "cudarc 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)",
"float8", "float8",
"gemm", "gemm 0.17.1",
"half", "half",
"memmap2", "memmap2",
"metal", "metal",
...@@ -714,6 +714,31 @@ dependencies = [ ...@@ -714,6 +714,31 @@ dependencies = [
"zip", "zip",
] ]
[[package]]
name = "candle-core"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06ccf5ee3532e66868516d9b315f73aec9f34ea1a37ae98514534d458915dbf1"
dependencies = [
"byteorder",
"candle-kernels 0.8.4",
"cudarc 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)",
"gemm 0.17.1",
"half",
"memmap2",
"num-traits",
"num_cpus",
"rand 0.9.0",
"rand_distr",
"rayon",
"safetensors",
"thiserror 1.0.69",
"ug",
"ug-cuda",
"yoke",
"zip",
]
[[package]] [[package]]
name = "candle-kernels" name = "candle-kernels"
version = "0.8.0" version = "0.8.0"
...@@ -722,6 +747,15 @@ dependencies = [ ...@@ -722,6 +747,15 @@ dependencies = [
"bindgen_cuda 0.1.5", "bindgen_cuda 0.1.5",
] ]
[[package]]
name = "candle-kernels"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a10885bd902fad1b8518ba2b22369aaed88a3d94e123533ad3ca73db33b1c8ca"
dependencies = [
"bindgen_cuda 0.1.5",
]
[[package]] [[package]]
name = "candle-metal-kernels" name = "candle-metal-kernels"
version = "0.8.0" version = "0.8.0"
...@@ -738,7 +772,7 @@ name = "candle-nn" ...@@ -738,7 +772,7 @@ name = "candle-nn"
version = "0.8.0" version = "0.8.0"
source = "git+https://github.com/EricLBuehler/candle.git?rev=496a8d2b#496a8d2bf8f88e3be4ea27332a209d66e8b404f4" source = "git+https://github.com/EricLBuehler/candle.git?rev=496a8d2b#496a8d2bf8f88e3be4ea27332a209d66e8b404f4"
dependencies = [ dependencies = [
"candle-core", "candle-core 0.8.0",
"candle-metal-kernels", "candle-metal-kernels",
"half", "half",
"metal", "metal",
...@@ -1383,6 +1417,15 @@ dependencies = [ ...@@ -1383,6 +1417,15 @@ dependencies = [
"crypto-common", "crypto-common",
] ]
[[package]]
name = "digit-layout"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09157630eece4139f6cc5a457556d308c3465ecd5af492f0e5aadc043997e2ce"
dependencies = [
"half",
]
[[package]] [[package]]
name = "dircpy" name = "dircpy"
version = "0.3.19" version = "0.3.19"
...@@ -1448,10 +1491,20 @@ dependencies = [ ...@@ -1448,10 +1491,20 @@ dependencies = [
"reborrow", "reborrow",
] ]
[[package]]
name = "dyn-stack"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490bd48eb68fffcfed519b4edbfd82c69cbe741d175b84f0e0cbe8c57cbe0bdd"
dependencies = [
"bytemuck",
]
[[package]] [[package]]
name = "dynamo-llm" name = "dynamo-llm"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"akin",
"anyhow", "anyhow",
"async-openai", "async-openai",
"async-stream", "async-stream",
...@@ -1463,6 +1516,7 @@ dependencies = [ ...@@ -1463,6 +1516,7 @@ dependencies = [
"bs62", "bs62",
"bytemuck", "bytemuck",
"bytes", "bytes",
"candle-core 0.8.4",
"chrono", "chrono",
"cmake", "cmake",
"cudarc 0.13.9 (git+https://github.com/coreylowman/cudarc.git?rev=8c52e735b55bf8e979e1a16bd85e3dfe4f87c9fe)", "cudarc 0.13.9 (git+https://github.com/coreylowman/cudarc.git?rev=8c52e735b55bf8e979e1a16bd85e3dfe4f87c9fe)",
...@@ -1473,12 +1527,14 @@ dependencies = [ ...@@ -1473,12 +1527,14 @@ dependencies = [
"erased-serde", "erased-serde",
"futures", "futures",
"galil-seiferas", "galil-seiferas",
"ggus",
"hf-hub", "hf-hub",
"indexmap 2.8.0", "indexmap 2.8.0",
"insta", "insta",
"itertools 0.14.0", "itertools 0.14.0",
"libc", "libc",
"llama-cpp-2", "llama-cpp-2",
"memmap2",
"minijinja", "minijinja",
"minijinja-contrib", "minijinja-contrib",
"mistralrs", "mistralrs",
...@@ -1827,6 +1883,17 @@ dependencies = [ ...@@ -1827,6 +1883,17 @@ dependencies = [
"zune-inflate", "zune-inflate",
] ]
[[package]]
name = "fancy-regex"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
dependencies = [
"bit-set",
"regex-automata 0.4.9",
"regex-syntax 0.8.5",
]
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "2.3.0" version = "2.3.0"
...@@ -2072,17 +2139,37 @@ version = "0.17.1" ...@@ -2072,17 +2139,37 @@ version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ab24cc62135b40090e31a76a9b2766a501979f3070fa27f689c27ec04377d32" checksum = "6ab24cc62135b40090e31a76a9b2766a501979f3070fa27f689c27ec04377d32"
dependencies = [ dependencies = [
"dyn-stack", "dyn-stack 0.10.0",
"gemm-c32", "gemm-c32 0.17.1",
"gemm-c64", "gemm-c64 0.17.1",
"gemm-common", "gemm-common 0.17.1",
"gemm-f16", "gemm-f16 0.17.1",
"gemm-f32", "gemm-f32 0.17.1",
"gemm-f64", "gemm-f64 0.17.1",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 10.7.0",
"seq-macro",
]
[[package]]
name = "gemm"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab96b703d31950f1aeddded248bc95543c9efc7ac9c4a21fda8703a83ee35451"
dependencies = [
"dyn-stack 0.13.0",
"gemm-c32 0.18.2",
"gemm-c64 0.18.2",
"gemm-common 0.18.2",
"gemm-f16 0.18.2",
"gemm-f32 0.18.2",
"gemm-f64 0.18.2",
"num-complex", "num-complex",
"num-traits", "num-traits",
"paste", "paste",
"raw-cpuid", "raw-cpuid 11.5.0",
"seq-macro", "seq-macro",
] ]
...@@ -2092,12 +2179,27 @@ version = "0.17.1" ...@@ -2092,12 +2179,27 @@ version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9c030d0b983d1e34a546b86e08f600c11696fde16199f971cd46c12e67512c0" checksum = "b9c030d0b983d1e34a546b86e08f600c11696fde16199f971cd46c12e67512c0"
dependencies = [ dependencies = [
"dyn-stack", "dyn-stack 0.10.0",
"gemm-common", "gemm-common 0.17.1",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 10.7.0",
"seq-macro",
]
[[package]]
name = "gemm-c32"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6db9fd9f40421d00eea9dd0770045a5603b8d684654816637732463f4073847"
dependencies = [
"dyn-stack 0.13.0",
"gemm-common 0.18.2",
"num-complex", "num-complex",
"num-traits", "num-traits",
"paste", "paste",
"raw-cpuid", "raw-cpuid 11.5.0",
"seq-macro", "seq-macro",
] ]
...@@ -2107,12 +2209,27 @@ version = "0.17.1" ...@@ -2107,12 +2209,27 @@ version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbb5f2e79fefb9693d18e1066a557b4546cd334b226beadc68b11a8f9431852a" checksum = "fbb5f2e79fefb9693d18e1066a557b4546cd334b226beadc68b11a8f9431852a"
dependencies = [ dependencies = [
"dyn-stack", "dyn-stack 0.10.0",
"gemm-common", "gemm-common 0.17.1",
"num-complex", "num-complex",
"num-traits", "num-traits",
"paste", "paste",
"raw-cpuid", "raw-cpuid 10.7.0",
"seq-macro",
]
[[package]]
name = "gemm-c64"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfcad8a3d35a43758330b635d02edad980c1e143dc2f21e6fd25f9e4eada8edf"
dependencies = [
"dyn-stack 0.13.0",
"gemm-common 0.18.2",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 11.5.0",
"seq-macro", "seq-macro",
] ]
...@@ -2123,17 +2240,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -2123,17 +2240,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2e7ea062c987abcd8db95db917b4ffb4ecdfd0668471d8dc54734fdff2354e8" checksum = "a2e7ea062c987abcd8db95db917b4ffb4ecdfd0668471d8dc54734fdff2354e8"
dependencies = [ dependencies = [
"bytemuck", "bytemuck",
"dyn-stack", "dyn-stack 0.10.0",
"half",
"num-complex",
"num-traits",
"once_cell",
"paste",
"pulp 0.18.22",
"raw-cpuid 10.7.0",
"rayon",
"seq-macro",
"sysctl 0.5.5",
]
[[package]]
name = "gemm-common"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a352d4a69cbe938b9e2a9cb7a3a63b7e72f9349174a2752a558a8a563510d0f3"
dependencies = [
"bytemuck",
"dyn-stack 0.13.0",
"half", "half",
"libm",
"num-complex", "num-complex",
"num-traits", "num-traits",
"once_cell", "once_cell",
"paste", "paste",
"pulp", "pulp 0.21.4",
"raw-cpuid", "raw-cpuid 11.5.0",
"rayon", "rayon",
"seq-macro", "seq-macro",
"sysctl", "sysctl 0.6.0",
] ]
[[package]] [[package]]
...@@ -2142,14 +2280,32 @@ version = "0.17.1" ...@@ -2142,14 +2280,32 @@ version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ca4c06b9b11952071d317604acb332e924e817bd891bec8dfb494168c7cedd4" checksum = "7ca4c06b9b11952071d317604acb332e924e817bd891bec8dfb494168c7cedd4"
dependencies = [ dependencies = [
"dyn-stack", "dyn-stack 0.10.0",
"gemm-common", "gemm-common 0.17.1",
"gemm-f32", "gemm-f32 0.17.1",
"half",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 10.7.0",
"rayon",
"seq-macro",
]
[[package]]
name = "gemm-f16"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cff95ae3259432f3c3410eaa919033cd03791d81cebd18018393dc147952e109"
dependencies = [
"dyn-stack 0.13.0",
"gemm-common 0.18.2",
"gemm-f32 0.18.2",
"half", "half",
"num-complex", "num-complex",
"num-traits", "num-traits",
"paste", "paste",
"raw-cpuid", "raw-cpuid 11.5.0",
"rayon", "rayon",
"seq-macro", "seq-macro",
] ]
...@@ -2160,12 +2316,27 @@ version = "0.17.1" ...@@ -2160,12 +2316,27 @@ version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9a69f51aaefbd9cf12d18faf273d3e982d9d711f60775645ed5c8047b4ae113" checksum = "e9a69f51aaefbd9cf12d18faf273d3e982d9d711f60775645ed5c8047b4ae113"
dependencies = [ dependencies = [
"dyn-stack", "dyn-stack 0.10.0",
"gemm-common", "gemm-common 0.17.1",
"num-complex", "num-complex",
"num-traits", "num-traits",
"paste", "paste",
"raw-cpuid", "raw-cpuid 10.7.0",
"seq-macro",
]
[[package]]
name = "gemm-f32"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc8d3d4385393304f407392f754cd2dc4b315d05063f62cf09f47b58de276864"
dependencies = [
"dyn-stack 0.13.0",
"gemm-common 0.18.2",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 11.5.0",
"seq-macro", "seq-macro",
] ]
...@@ -2175,12 +2346,27 @@ version = "0.17.1" ...@@ -2175,12 +2346,27 @@ version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa397a48544fadf0b81ec8741e5c0fba0043008113f71f2034def1935645d2b0" checksum = "aa397a48544fadf0b81ec8741e5c0fba0043008113f71f2034def1935645d2b0"
dependencies = [ dependencies = [
"dyn-stack", "dyn-stack 0.10.0",
"gemm-common", "gemm-common 0.17.1",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 10.7.0",
"seq-macro",
]
[[package]]
name = "gemm-f64"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35b2a4f76ce4b8b16eadc11ccf2e083252d8237c1b589558a49b0183545015bd"
dependencies = [
"dyn-stack 0.13.0",
"gemm-common 0.18.2",
"num-complex", "num-complex",
"num-traits", "num-traits",
"paste", "paste",
"raw-cpuid", "raw-cpuid 11.5.0",
"seq-macro", "seq-macro",
] ]
...@@ -2221,6 +2407,30 @@ dependencies = [ ...@@ -2221,6 +2407,30 @@ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]]
name = "ggml-quants"
version = "0.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a27693512784e0786212eb0bef841779a6337d2d04520ed475b4d5a864f98366"
dependencies = [
"digit-layout",
"half",
"rayon",
]
[[package]]
name = "ggus"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ac5654356c6f7f6116905aeaf92ab002c3d03414ada5dbe0bb2e32aa5fea173"
dependencies = [
"fancy-regex",
"ggml-quants",
"indexmap 2.8.0",
"log",
"num_enum",
]
[[package]] [[package]]
name = "gif" name = "gif"
version = "0.13.1" version = "0.13.1"
...@@ -3403,7 +3613,7 @@ version = "0.4.0" ...@@ -3403,7 +3613,7 @@ version = "0.4.0"
source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=aaafc2ef#aaafc2efc6305c1a79eee632b177d76586df1646" source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=aaafc2ef#aaafc2efc6305c1a79eee632b177d76586df1646"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"candle-core", "candle-core 0.8.0",
"candle-nn", "candle-nn",
"clap", "clap",
"either", "either",
...@@ -3431,7 +3641,7 @@ dependencies = [ ...@@ -3431,7 +3641,7 @@ dependencies = [
"bindgen_cuda 0.1.5", "bindgen_cuda 0.1.5",
"bytemuck", "bytemuck",
"bytemuck_derive", "bytemuck_derive",
"candle-core", "candle-core 0.8.0",
"candle-nn", "candle-nn",
"cfgrammar", "cfgrammar",
"chrono", "chrono",
...@@ -3499,7 +3709,7 @@ source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=aaafc2ef#aaafc2 ...@@ -3499,7 +3709,7 @@ source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=aaafc2ef#aaafc2
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bindgen_cuda 0.1.6", "bindgen_cuda 0.1.6",
"candle-core", "candle-core 0.8.0",
"float8", "float8",
"half", "half",
"metal", "metal",
...@@ -3514,7 +3724,7 @@ source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=aaafc2ef#aaafc2 ...@@ -3514,7 +3724,7 @@ source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=aaafc2ef#aaafc2
dependencies = [ dependencies = [
"bindgen_cuda 0.1.5", "bindgen_cuda 0.1.5",
"byteorder", "byteorder",
"candle-core", "candle-core 0.8.0",
"candle-nn", "candle-nn",
"float8", "float8",
"half", "half",
...@@ -3539,7 +3749,7 @@ name = "mistralrs-vision" ...@@ -3539,7 +3749,7 @@ name = "mistralrs-vision"
version = "0.4.0" version = "0.4.0"
source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=aaafc2ef#aaafc2efc6305c1a79eee632b177d76586df1646" source = "git+https://github.com/EricLBuehler/mistral.rs.git?rev=aaafc2ef#aaafc2efc6305c1a79eee632b177d76586df1646"
dependencies = [ dependencies = [
"candle-core", "candle-core 0.8.0",
"image", "image",
] ]
...@@ -3781,6 +3991,20 @@ dependencies = [ ...@@ -3781,6 +3991,20 @@ dependencies = [
"rand 0.8.5", "rand 0.8.5",
] ]
[[package]]
name = "num"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
dependencies = [
"num-bigint",
"num-complex",
"num-integer",
"num-iter",
"num-rational",
"num-traits",
]
[[package]] [[package]]
name = "num-bigint" name = "num-bigint"
version = "0.4.6" version = "0.4.6"
...@@ -3827,6 +4051,28 @@ dependencies = [ ...@@ -3827,6 +4051,28 @@ dependencies = [
"num-traits", "num-traits",
] ]
[[package]]
name = "num-iter"
version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
"num-bigint",
"num-integer",
"num-traits",
]
[[package]] [[package]]
name = "num-traits" name = "num-traits"
version = "0.2.19" version = "0.2.19"
...@@ -4456,6 +4702,20 @@ dependencies = [ ...@@ -4456,6 +4702,20 @@ dependencies = [
"reborrow", "reborrow",
] ]
[[package]]
name = "pulp"
version = "0.21.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95fb7a99b37aaef4c7dd2fd15a819eb8010bfc7a2c2155230d51f497316cad6d"
dependencies = [
"bytemuck",
"cfg-if 1.0.0",
"libm",
"num-complex",
"reborrow",
"version_check",
]
[[package]] [[package]]
name = "pyo3" name = "pyo3"
version = "0.23.5" version = "0.23.5"
...@@ -4754,6 +5014,15 @@ dependencies = [ ...@@ -4754,6 +5014,15 @@ dependencies = [
"bitflags 1.3.2", "bitflags 1.3.2",
] ]
[[package]]
name = "raw-cpuid"
version = "11.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6df7ab838ed27997ba19a4664507e6f82b41fe6e20be42929332156e5e85146"
dependencies = [
"bitflags 2.9.0",
]
[[package]] [[package]]
name = "rawpointer" name = "rawpointer"
version = "0.2.1" version = "0.2.1"
...@@ -5785,6 +6054,20 @@ dependencies = [ ...@@ -5785,6 +6054,20 @@ dependencies = [
"walkdir", "walkdir",
] ]
[[package]]
name = "sysctl"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc"
dependencies = [
"bitflags 2.9.0",
"byteorder",
"enum-as-inner",
"libc",
"thiserror 1.0.69",
"walkdir",
]
[[package]] [[package]]
name = "sysinfo" name = "sysinfo"
version = "0.30.13" version = "0.30.13"
...@@ -6441,6 +6724,40 @@ version = "0.1.7" ...@@ -6441,6 +6724,40 @@ version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971"
[[package]]
name = "ug"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03719c61a91b51541f076dfdba45caacf750b230cefaa4b32d6f5411c3f7f437"
dependencies = [
"gemm 0.18.2",
"half",
"libloading",
"memmap2",
"num",
"num-traits",
"num_cpus",
"rayon",
"safetensors",
"serde",
"thiserror 1.0.69",
"tracing",
"yoke",
]
[[package]]
name = "ug-cuda"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50758486d7941f8b0a636ba7e29455c07071f41590beac1fd307ec893e8db69a"
dependencies = [
"cudarc 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)",
"half",
"serde",
"thiserror 1.0.69",
"ug",
]
[[package]] [[package]]
name = "unarray" name = "unarray"
version = "0.1.4" version = "0.1.4"
......
...@@ -177,12 +177,7 @@ dynamo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llam ...@@ -177,12 +177,7 @@ dynamo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llam
- `cargo build --features llamacpp,cuda` - `cargo build --features llamacpp,cuda`
- `dynamo-run out=llama_cpp --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/` - `dynamo-run out=llama_cpp ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf`
The extra `--model-config` flag is because:
- llama_cpp only runs GGUF
- We send it tokens, meaning we do the tokenization ourselves, so we need a tokenizer
- We don't yet read it out of the GGUF (TODO), so we need an HF repo with `tokenizer.json` et al
If the build step also builds llama_cpp libraries into the same folder as the binary ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `dynamo-run` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `dynamo-run` binary. If the build step also builds llama_cpp libraries into the same folder as the binary ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `dynamo-run` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `dynamo-run` binary.
...@@ -215,7 +210,7 @@ Run (still inside that virtualenv) - HF repo: ...@@ -215,7 +210,7 @@ Run (still inside that virtualenv) - HF repo:
Run (still inside that virtualenv) - GGUF: Run (still inside that virtualenv) - GGUF:
``` ```
./dynamo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/ ./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf
``` ```
+ Multi-node: + Multi-node:
......
...@@ -74,7 +74,9 @@ pub async fn prepare_engine( ...@@ -74,7 +74,9 @@ pub async fn prepare_engine(
let preprocessor = OpenAIPreprocessor::new(*card.clone()) let preprocessor = OpenAIPreprocessor::new(*card.clone())
.await? .await?
.into_operator(); .into_operator();
let backend = Backend::from_mdc(*card.clone()).await?.into_operator(); let backend = Backend::from_tokenizer(card.tokenizer_hf()?)
.await?
.into_operator();
let engine = ServiceBackend::from_engine(inner_engine); let engine = ServiceBackend::from_engine(inner_engine);
let pipeline = frontend let pipeline = frontend
......
...@@ -126,46 +126,49 @@ pub async fn run( ...@@ -126,46 +126,49 @@ pub async fn run(
// Load the model deployment card, if any // Load the model deployment card, if any
// Only used by some engines, so without those feature flags it's unused. // Only used by some engines, so without those feature flags it's unused.
#[allow(unused_variables)] #[allow(unused_variables)]
let (maybe_card_path, maybe_card) = match (&model_path, &flags.model_config) { let maybe_card = match (&model_path, &flags.model_config) {
// --model-config takes precedence // --model-config takes precedence
(_, Some(model_config)) => { (_, Some(model_config)) => {
let card = match ModelDeploymentCard::from_local_path(model_config, model_name.as_deref()).await {
match ModelDeploymentCard::from_local_path(model_config, model_name.as_deref()) Ok(card) => Some(card),
.await Err(e) => {
{ tracing::error!(
Ok(card) => Some(card), "Failed to load model card from --model-config path {}: {e}",
Err(e) => { model_config.display(),
tracing::error!( );
"Failed to load model card from config path {}: {}", None
model_config.display(), }
e }
);
None
}
};
(Some(model_config.clone()), card)
} }
// If --model-path is an HF repo use that // If --model-path is an HF repo use that
(Some(model_path), _) if model_path.is_dir() => { (Some(model_path), _) if model_path.is_dir() => {
let card = match ModelDeploymentCard::from_local_path(model_path, model_name.as_deref()) match ModelDeploymentCard::from_local_path(model_path, model_name.as_deref()).await {
.await
{
Ok(card) => Some(card), Ok(card) => Some(card),
Err(e) => { Err(e) => {
tracing::error!( tracing::error!(
"Failed to load model card from model path {}: {}", "Failed to load model card from --model-path {}: {e}",
model_path.display(), model_path.display(),
e
); );
None None
} }
}; }
(Some(model_path.clone()), card) }
(Some(model_path), _) if model_path.is_file() => {
match ModelDeploymentCard::from_gguf(model_path, model_name.as_deref()).await {
Ok(card) => Some(card),
Err(e) => {
tracing::error!(
"Failed to load model card from GGUF {}: {e}",
model_path.display(),
);
None
}
}
} }
// Otherwise we don't have one, but we only need it if we're tokenizing // Otherwise we don't have one, but we only need it if we're tokenizing
_ => { _ => {
tracing::debug!("No model card path provided (neither --model-config nor a directory in --model-path)"); tracing::debug!("No model card path provided (neither --model-config nor a directory in --model-path)");
(None, None) None
} }
}; };
...@@ -276,17 +279,9 @@ pub async fn run( ...@@ -276,17 +279,9 @@ pub async fn run(
"out=vllm requires flag --model-path=<full-path-to-hf-repo-or-model-gguf>" "out=vllm requires flag --model-path=<full-path-to-hf-repo-or-model-gguf>"
); );
}; };
let Some(card_path) = maybe_card_path else {
// If we have a gguf we also need a model card because we don't currently parse
// tokenizer et al out of gguf.
anyhow::bail!(
"Running GGUF files also requires a `--model-config` for the tokenizer et al."
);
};
let Some(card) = maybe_card.clone() else { let Some(card) = maybe_card.clone() else {
anyhow::bail!( anyhow::bail!(
"Failed to load model card: either unsupported HuggingFace repo format \ "Unable to build tokenizer. out=vllm requires --model-path to be an HF repo with fast tokenizer (tokenizer.json) or a GGUF file"
or for GGUF files --model-config is missing."
); );
}; };
let Some(sock_prefix) = zmq_socket_prefix else { let Some(sock_prefix) = zmq_socket_prefix else {
...@@ -311,7 +306,6 @@ pub async fn run( ...@@ -311,7 +306,6 @@ pub async fn run(
// vllm multi-node only the leader runs vllm // vllm multi-node only the leader runs vllm
let (engine, vllm_future) = vllm::make_leader_engine( let (engine, vllm_future) = vllm::make_leader_engine(
cancel_token.clone(), cancel_token.clone(),
&card_path,
&model_path, &model_path,
&sock_prefix, &sock_prefix,
node_conf, node_conf,
......
...@@ -80,12 +80,6 @@ fn main() -> anyhow::Result<()> { ...@@ -80,12 +80,6 @@ fn main() -> anyhow::Result<()> {
let Some(model_path) = flags.model_path_flag else { let Some(model_path) = flags.model_path_flag else {
anyhow::bail!("vllm subprocess requires --model-path flag"); anyhow::bail!("vllm subprocess requires --model-path flag");
}; };
let Some(model_config) = flags.model_config else {
anyhow::bail!("vllm subprocess requires --model-config");
};
if !model_config.is_dir() {
anyhow::bail!("vllm subprocess requires model config path to be a directory containing tokenizer.json, config.json, etc");
}
if cfg!(feature = "vllm") { if cfg!(feature = "vllm") {
#[cfg(feature = "vllm")] #[cfg(feature = "vllm")]
{ {
...@@ -97,7 +91,6 @@ fn main() -> anyhow::Result<()> { ...@@ -97,7 +91,6 @@ fn main() -> anyhow::Result<()> {
}; };
return vllm::run_subprocess( return vllm::run_subprocess(
ZMQ_SOCKET_PREFIX, ZMQ_SOCKET_PREFIX,
&model_config,
&model_path, &model_path,
node_config, node_config,
flags.tensor_parallel_size, flags.tensor_parallel_size,
......
...@@ -26,6 +26,12 @@ dependencies = [ ...@@ -26,6 +26,12 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "akin"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1763692fc1416554cf051efc56a3de5595eca47299d731cc5c2b583adf8b4d2f"
[[package]] [[package]]
name = "android-tzdata" name = "android-tzdata"
version = "0.1.1" version = "0.1.1"
...@@ -97,6 +103,15 @@ version = "1.0.97" ...@@ -97,6 +103,15 @@ version = "1.0.97"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f"
[[package]]
name = "arbitrary"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223"
dependencies = [
"derive_arbitrary",
]
[[package]] [[package]]
name = "arrayref" name = "arrayref"
version = "0.3.9" version = "0.3.9"
...@@ -424,6 +439,21 @@ dependencies = [ ...@@ -424,6 +439,21 @@ dependencies = [
"syn 2.0.100", "syn 2.0.100",
] ]
[[package]]
name = "bit-set"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
dependencies = [
"bit-vec",
]
[[package]]
name = "bit-vec"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "1.3.2" version = "1.3.2"
...@@ -480,6 +510,9 @@ name = "bytemuck" ...@@ -480,6 +510,9 @@ name = "bytemuck"
version = "1.22.0" version = "1.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6b1fc10dbac614ebc03540c9dbd60e83887fda27794998c6528f1782047d540" checksum = "b6b1fc10dbac614ebc03540c9dbd60e83887fda27794998c6528f1782047d540"
dependencies = [
"bytemuck_derive",
]
[[package]] [[package]]
name = "bytemuck_derive" name = "bytemuck_derive"
...@@ -507,6 +540,28 @@ dependencies = [ ...@@ -507,6 +540,28 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "candle-core"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06ccf5ee3532e66868516d9b315f73aec9f34ea1a37ae98514534d458915dbf1"
dependencies = [
"byteorder",
"gemm 0.17.1",
"half",
"memmap2",
"num-traits",
"num_cpus",
"rand 0.9.0",
"rand_distr",
"rayon",
"safetensors",
"thiserror 1.0.69",
"ug",
"yoke",
"zip",
]
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.2.16" version = "1.2.16"
...@@ -755,6 +810,12 @@ version = "0.8.21" ...@@ -755,6 +810,12 @@ version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crunchy"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929"
[[package]] [[package]]
name = "crypto-common" name = "crypto-common"
version = "0.1.6" version = "0.1.6"
...@@ -870,6 +931,17 @@ dependencies = [ ...@@ -870,6 +931,17 @@ dependencies = [
"syn 2.0.100", "syn 2.0.100",
] ]
[[package]]
name = "derive_arbitrary"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]] [[package]]
name = "derive_builder" name = "derive_builder"
version = "0.20.2" version = "0.20.2"
...@@ -911,6 +983,15 @@ dependencies = [ ...@@ -911,6 +983,15 @@ dependencies = [
"crypto-common", "crypto-common",
] ]
[[package]]
name = "digit-layout"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09157630eece4139f6cc5a457556d308c3465ecd5af492f0e5aadc043997e2ce"
dependencies = [
"half",
]
[[package]] [[package]]
name = "dircpy" name = "dircpy"
version = "0.3.19" version = "0.3.19"
...@@ -954,10 +1035,30 @@ dependencies = [ ...@@ -954,10 +1035,30 @@ dependencies = [
"syn 2.0.100", "syn 2.0.100",
] ]
[[package]]
name = "dyn-stack"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e53799688f5632f364f8fb387488dd05db9fe45db7011be066fc20e7027f8b"
dependencies = [
"bytemuck",
"reborrow",
]
[[package]]
name = "dyn-stack"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490bd48eb68fffcfed519b4edbfd82c69cbe741d175b84f0e0cbe8c57cbe0bdd"
dependencies = [
"bytemuck",
]
[[package]] [[package]]
name = "dynamo-llm" name = "dynamo-llm"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"akin",
"anyhow", "anyhow",
"async-openai", "async-openai",
"async-stream", "async-stream",
...@@ -969,6 +1070,7 @@ dependencies = [ ...@@ -969,6 +1070,7 @@ dependencies = [
"bs62", "bs62",
"bytemuck", "bytemuck",
"bytes", "bytes",
"candle-core",
"chrono", "chrono",
"cmake", "cmake",
"derive-getters", "derive-getters",
...@@ -978,9 +1080,11 @@ dependencies = [ ...@@ -978,9 +1080,11 @@ dependencies = [
"erased-serde", "erased-serde",
"futures", "futures",
"galil-seiferas", "galil-seiferas",
"ggus",
"indexmap 2.8.0", "indexmap 2.8.0",
"itertools 0.14.0", "itertools 0.14.0",
"libc", "libc",
"memmap2",
"minijinja", "minijinja",
"minijinja-contrib", "minijinja-contrib",
"prometheus", "prometheus",
...@@ -1123,6 +1227,18 @@ version = "1.0.0" ...@@ -1123,6 +1227,18 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
[[package]]
name = "enum-as-inner"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]] [[package]]
name = "enum-ordinalize" name = "enum-ordinalize"
version = "4.3.0" version = "4.3.0"
...@@ -1226,6 +1342,17 @@ dependencies = [ ...@@ -1226,6 +1342,17 @@ dependencies = [
"pin-project-lite", "pin-project-lite",
] ]
[[package]]
name = "fancy-regex"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
dependencies = [
"bit-set",
"regex-automata 0.4.9",
"regex-syntax 0.8.5",
]
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "2.3.0" version = "2.3.0"
...@@ -1407,6 +1534,243 @@ dependencies = [ ...@@ -1407,6 +1534,243 @@ dependencies = [
"unchecked-index", "unchecked-index",
] ]
[[package]]
name = "gemm"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ab24cc62135b40090e31a76a9b2766a501979f3070fa27f689c27ec04377d32"
dependencies = [
"dyn-stack 0.10.0",
"gemm-c32 0.17.1",
"gemm-c64 0.17.1",
"gemm-common 0.17.1",
"gemm-f16 0.17.1",
"gemm-f32 0.17.1",
"gemm-f64 0.17.1",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 10.7.0",
"seq-macro",
]
[[package]]
name = "gemm"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab96b703d31950f1aeddded248bc95543c9efc7ac9c4a21fda8703a83ee35451"
dependencies = [
"dyn-stack 0.13.0",
"gemm-c32 0.18.2",
"gemm-c64 0.18.2",
"gemm-common 0.18.2",
"gemm-f16 0.18.2",
"gemm-f32 0.18.2",
"gemm-f64 0.18.2",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 11.5.0",
"seq-macro",
]
[[package]]
name = "gemm-c32"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9c030d0b983d1e34a546b86e08f600c11696fde16199f971cd46c12e67512c0"
dependencies = [
"dyn-stack 0.10.0",
"gemm-common 0.17.1",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 10.7.0",
"seq-macro",
]
[[package]]
name = "gemm-c32"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6db9fd9f40421d00eea9dd0770045a5603b8d684654816637732463f4073847"
dependencies = [
"dyn-stack 0.13.0",
"gemm-common 0.18.2",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 11.5.0",
"seq-macro",
]
[[package]]
name = "gemm-c64"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbb5f2e79fefb9693d18e1066a557b4546cd334b226beadc68b11a8f9431852a"
dependencies = [
"dyn-stack 0.10.0",
"gemm-common 0.17.1",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 10.7.0",
"seq-macro",
]
[[package]]
name = "gemm-c64"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfcad8a3d35a43758330b635d02edad980c1e143dc2f21e6fd25f9e4eada8edf"
dependencies = [
"dyn-stack 0.13.0",
"gemm-common 0.18.2",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 11.5.0",
"seq-macro",
]
[[package]]
name = "gemm-common"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2e7ea062c987abcd8db95db917b4ffb4ecdfd0668471d8dc54734fdff2354e8"
dependencies = [
"bytemuck",
"dyn-stack 0.10.0",
"half",
"num-complex",
"num-traits",
"once_cell",
"paste",
"pulp 0.18.22",
"raw-cpuid 10.7.0",
"rayon",
"seq-macro",
"sysctl 0.5.5",
]
[[package]]
name = "gemm-common"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a352d4a69cbe938b9e2a9cb7a3a63b7e72f9349174a2752a558a8a563510d0f3"
dependencies = [
"bytemuck",
"dyn-stack 0.13.0",
"half",
"libm",
"num-complex",
"num-traits",
"once_cell",
"paste",
"pulp 0.21.4",
"raw-cpuid 11.5.0",
"rayon",
"seq-macro",
"sysctl 0.6.0",
]
[[package]]
name = "gemm-f16"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ca4c06b9b11952071d317604acb332e924e817bd891bec8dfb494168c7cedd4"
dependencies = [
"dyn-stack 0.10.0",
"gemm-common 0.17.1",
"gemm-f32 0.17.1",
"half",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 10.7.0",
"rayon",
"seq-macro",
]
[[package]]
name = "gemm-f16"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cff95ae3259432f3c3410eaa919033cd03791d81cebd18018393dc147952e109"
dependencies = [
"dyn-stack 0.13.0",
"gemm-common 0.18.2",
"gemm-f32 0.18.2",
"half",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 11.5.0",
"rayon",
"seq-macro",
]
[[package]]
name = "gemm-f32"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9a69f51aaefbd9cf12d18faf273d3e982d9d711f60775645ed5c8047b4ae113"
dependencies = [
"dyn-stack 0.10.0",
"gemm-common 0.17.1",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 10.7.0",
"seq-macro",
]
[[package]]
name = "gemm-f32"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc8d3d4385393304f407392f754cd2dc4b315d05063f62cf09f47b58de276864"
dependencies = [
"dyn-stack 0.13.0",
"gemm-common 0.18.2",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 11.5.0",
"seq-macro",
]
[[package]]
name = "gemm-f64"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa397a48544fadf0b81ec8741e5c0fba0043008113f71f2034def1935645d2b0"
dependencies = [
"dyn-stack 0.10.0",
"gemm-common 0.17.1",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 10.7.0",
"seq-macro",
]
[[package]]
name = "gemm-f64"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35b2a4f76ce4b8b16eadc11ccf2e083252d8237c1b589558a49b0183545015bd"
dependencies = [
"dyn-stack 0.13.0",
"gemm-common 0.18.2",
"num-complex",
"num-traits",
"paste",
"raw-cpuid 11.5.0",
"seq-macro",
]
[[package]] [[package]]
name = "generic-array" name = "generic-array"
version = "0.14.7" version = "0.14.7"
...@@ -1444,6 +1808,30 @@ dependencies = [ ...@@ -1444,6 +1808,30 @@ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]]
name = "ggml-quants"
version = "0.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a27693512784e0786212eb0bef841779a6337d2d04520ed475b4d5a864f98366"
dependencies = [
"digit-layout",
"half",
"rayon",
]
[[package]]
name = "ggus"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ac5654356c6f7f6116905aeaf92ab002c3d03414ada5dbe0bb2e32aa5fea173"
dependencies = [
"fancy-regex",
"ggml-quants",
"indexmap 2.8.0",
"log",
"num_enum",
]
[[package]] [[package]]
name = "gimli" name = "gimli"
version = "0.31.1" version = "0.31.1"
...@@ -1475,6 +1863,20 @@ dependencies = [ ...@@ -1475,6 +1863,20 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "half"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7db2ff139bba50379da6aa0766b52fdcb62cb5b263009b09ed58ba604e14bbd1"
dependencies = [
"bytemuck",
"cfg-if 1.0.0",
"crunchy",
"num-traits",
"rand 0.9.0",
"rand_distr",
]
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
version = "0.12.3" version = "0.12.3"
...@@ -1493,6 +1895,12 @@ version = "0.5.0" ...@@ -1493,6 +1895,12 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]] [[package]]
name = "hf-hub" name = "hf-hub"
version = "0.4.2" version = "0.4.2"
...@@ -1989,6 +2397,12 @@ dependencies = [ ...@@ -1989,6 +2397,12 @@ dependencies = [
"windows-targets 0.52.6", "windows-targets 0.52.6",
] ]
[[package]]
name = "libm"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa"
[[package]] [[package]]
name = "libredox" name = "libredox"
version = "0.1.3" version = "0.1.3"
...@@ -2082,6 +2496,16 @@ version = "2.7.4" ...@@ -2082,6 +2496,16 @@ version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "memmap2"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f"
dependencies = [
"libc",
"stable_deref_trait",
]
[[package]] [[package]]
name = "memo-map" name = "memo-map"
version = "0.3.3" version = "0.3.3"
...@@ -2321,6 +2745,20 @@ dependencies = [ ...@@ -2321,6 +2745,20 @@ dependencies = [
"rand 0.8.5", "rand 0.8.5",
] ]
[[package]]
name = "num"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
dependencies = [
"num-bigint",
"num-complex",
"num-integer",
"num-iter",
"num-rational",
"num-traits",
]
[[package]] [[package]]
name = "num-bigint" name = "num-bigint"
version = "0.4.6" version = "0.4.6"
...@@ -2331,6 +2769,16 @@ dependencies = [ ...@@ -2331,6 +2769,16 @@ dependencies = [
"num-traits", "num-traits",
] ]
[[package]]
name = "num-complex"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
"bytemuck",
"num-traits",
]
[[package]] [[package]]
name = "num-conv" name = "num-conv"
version = "0.1.0" version = "0.1.0"
...@@ -2346,6 +2794,28 @@ dependencies = [ ...@@ -2346,6 +2794,28 @@ dependencies = [
"num-traits", "num-traits",
] ]
[[package]]
name = "num-iter"
version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
"num-bigint",
"num-integer",
"num-traits",
]
[[package]] [[package]]
name = "num-traits" name = "num-traits"
version = "0.2.19" version = "0.2.19"
...@@ -2353,6 +2823,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -2353,6 +2823,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [ dependencies = [
"autocfg", "autocfg",
"libm",
]
[[package]]
name = "num_cpus"
version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "num_enum"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179"
dependencies = [
"num_enum_derive",
]
[[package]]
name = "num_enum_derive"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56"
dependencies = [
"proc-macro-crate",
"proc-macro2",
"quote",
"syn 2.0.100",
] ]
[[package]] [[package]]
...@@ -2587,6 +3089,15 @@ dependencies = [ ...@@ -2587,6 +3089,15 @@ dependencies = [
"syn 2.0.100", "syn 2.0.100",
] ]
[[package]]
name = "proc-macro-crate"
version = "3.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35"
dependencies = [
"toml_edit",
]
[[package]] [[package]]
name = "proc-macro-error-attr2" name = "proc-macro-error-attr2"
version = "2.0.0" version = "2.0.0"
...@@ -2704,6 +3215,32 @@ version = "2.28.0" ...@@ -2704,6 +3215,32 @@ version = "2.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
[[package]]
name = "pulp"
version = "0.18.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0a01a0dc67cf4558d279f0c25b0962bd08fc6dec0137699eae304103e882fe6"
dependencies = [
"bytemuck",
"libm",
"num-complex",
"reborrow",
]
[[package]]
name = "pulp"
version = "0.21.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95fb7a99b37aaef4c7dd2fd15a819eb8010bfc7a2c2155230d51f497316cad6d"
dependencies = [
"bytemuck",
"cfg-if 1.0.0",
"libm",
"num-complex",
"reborrow",
"version_check",
]
[[package]] [[package]]
name = "pyo3" name = "pyo3"
version = "0.23.5" version = "0.23.5"
...@@ -2934,6 +3471,34 @@ dependencies = [ ...@@ -2934,6 +3471,34 @@ dependencies = [
"getrandom 0.3.2", "getrandom 0.3.2",
] ]
[[package]]
name = "rand_distr"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
dependencies = [
"num-traits",
"rand 0.9.0",
]
[[package]]
name = "raw-cpuid"
version = "10.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c297679cb867470fa8c9f67dbba74a78d78e3e98d7cf2b08d6d71540f797332"
dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "raw-cpuid"
version = "11.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6df7ab838ed27997ba19a4664507e6f82b41fe6e20be42929332156e5e85146"
dependencies = [
"bitflags 2.9.0",
]
[[package]] [[package]]
name = "rayon" name = "rayon"
version = "1.10.0" version = "1.10.0"
...@@ -2965,6 +3530,12 @@ dependencies = [ ...@@ -2965,6 +3530,12 @@ dependencies = [
"crossbeam-utils", "crossbeam-utils",
] ]
[[package]]
name = "reborrow"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430"
[[package]] [[package]]
name = "redox_syscall" name = "redox_syscall"
version = "0.5.10" version = "0.5.10"
...@@ -3238,6 +3809,16 @@ version = "1.0.20" ...@@ -3238,6 +3809,16 @@ version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]]
name = "safetensors"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44560c11236a6130a46ce36c836a62936dc81ebf8c36a37947423571be0e55b6"
dependencies = [
"serde",
"serde_json",
]
[[package]] [[package]]
name = "same-file" name = "same-file"
version = "1.0.6" version = "1.0.6"
...@@ -3323,6 +3904,12 @@ dependencies = [ ...@@ -3323,6 +3904,12 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "seq-macro"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"
[[package]] [[package]]
name = "serde" name = "serde"
version = "1.0.219" version = "1.0.219"
...@@ -3616,6 +4203,34 @@ dependencies = [ ...@@ -3616,6 +4203,34 @@ dependencies = [
"syn 2.0.100", "syn 2.0.100",
] ]
[[package]]
name = "sysctl"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec7dddc5f0fee506baf8b9fdb989e242f17e4b11c61dfbb0635b705217199eea"
dependencies = [
"bitflags 2.9.0",
"byteorder",
"enum-as-inner",
"libc",
"thiserror 1.0.69",
"walkdir",
]
[[package]]
name = "sysctl"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc"
dependencies = [
"bitflags 2.9.0",
"byteorder",
"enum-as-inner",
"libc",
"thiserror 1.0.69",
"walkdir",
]
[[package]] [[package]]
name = "system-deps" name = "system-deps"
version = "6.2.2" version = "6.2.2"
...@@ -4131,6 +4746,27 @@ version = "1.18.0" ...@@ -4131,6 +4746,27 @@ version = "1.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
[[package]]
name = "ug"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03719c61a91b51541f076dfdba45caacf750b230cefaa4b32d6f5411c3f7f437"
dependencies = [
"gemm 0.18.2",
"half",
"libloading",
"memmap2",
"num",
"num-traits",
"num_cpus",
"rayon",
"safetensors",
"serde",
"thiserror 1.0.69",
"tracing",
"yoke",
]
[[package]] [[package]]
name = "uncased" name = "uncased"
version = "0.9.10" version = "0.9.10"
...@@ -4904,6 +5540,21 @@ dependencies = [ ...@@ -4904,6 +5540,21 @@ dependencies = [
"syn 2.0.100", "syn 2.0.100",
] ]
[[package]]
name = "zip"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9cc23c04387f4da0374be4533ad1208cbb091d5c11d070dfef13676ad6497164"
dependencies = [
"arbitrary",
"crc32fast",
"crossbeam-utils",
"displaydoc",
"indexmap 2.8.0",
"num_enum",
"thiserror 1.0.69",
]
[[package]] [[package]]
name = "zmq" name = "zmq"
version = "0.10.0" version = "0.10.0"
......
...@@ -36,7 +36,7 @@ sentencepiece = ["dep:sentencepiece"] ...@@ -36,7 +36,7 @@ sentencepiece = ["dep:sentencepiece"]
trtllm = [] trtllm = []
vllm = [] vllm = []
cuda = ["mistralrs/cuda", "llama-cpp-2/cuda"] cuda = ["mistralrs/cuda", "llama-cpp-2/cuda", "candle-core/cuda"]
metal = ["mistralrs/metal", "llama-cpp-2/metal"] metal = ["mistralrs/metal", "llama-cpp-2/metal"]
vulkan = ["llama-cpp-2/vulkan"] vulkan = ["llama-cpp-2/vulkan"]
...@@ -68,9 +68,11 @@ uuid = { workspace = true } ...@@ -68,9 +68,11 @@ uuid = { workspace = true }
xxhash-rust = { workspace = true } xxhash-rust = { workspace = true }
strum = { workspace = true } strum = { workspace = true }
akin = "0.4.0"
async-openai = "0.27.2" async-openai = "0.27.2"
blake3 = "1" blake3 = "1"
bytemuck = "1.22" bytemuck = "1.22"
candle-core = { version = "0.8.0" }
derive-getters = "0.5" derive-getters = "0.5"
regex = "1" regex = "1"
rayon = "1" rayon = "1"
...@@ -78,7 +80,6 @@ rayon = "1" ...@@ -78,7 +80,6 @@ rayon = "1"
# kv_cuda # kv_cuda
cudarc = { git = "https://github.com/coreylowman/cudarc.git", rev = "8c52e735b55bf8e979e1a16bd85e3dfe4f87c9fe", features = ["cuda-12040"], optional = true } cudarc = { git = "https://github.com/coreylowman/cudarc.git", rev = "8c52e735b55bf8e979e1a16bd85e3dfe4f87c9fe", features = ["cuda-12040"], optional = true }
ndarray = { version = "0.16", optional = true } ndarray = { version = "0.16", optional = true }
# candle-core = { version = "0.8.3", features = ["cuda"], optional = true }
# half = "2.4.1" # half = "2.4.1"
pyo3 = { version = "0.23.3", default-features = false, features = [ pyo3 = { version = "0.23.3", default-features = false, features = [
...@@ -138,6 +139,9 @@ pyo3-async-runtimes = { version = "0.23.0", optional = true, default-features = ...@@ -138,6 +139,9 @@ pyo3-async-runtimes = { version = "0.23.0", optional = true, default-features =
] } ] }
pythonize = { version = "0.23", optional = true } pythonize = { version = "0.23", optional = true }
# GGUF
ggus = "0.4.0"
memmap2 = "0.9.5"
[dev-dependencies] [dev-dependencies]
hf-hub = { workspace = true } hf-hub = { workspace = true }
......
...@@ -52,9 +52,6 @@ use crate::protocols::{ ...@@ -52,9 +52,6 @@ use crate::protocols::{
use crate::tokenizers::{DecodeStream, HuggingFaceTokenizer, Tokenizer}; use crate::tokenizers::{DecodeStream, HuggingFaceTokenizer, Tokenizer};
use tokenizers::Tokenizer as HfTokenizer; use tokenizers::Tokenizer as HfTokenizer;
use toktrie::TokTrie;
use toktrie_hf_tokenizers::ByteTokenizer;
/// Represents the output stream from the execution engine /// Represents the output stream from the execution engine
pub type ExecutionOutputStream = Annotated<LLMEngineOutput>; pub type ExecutionOutputStream = Annotated<LLMEngineOutput>;
...@@ -64,12 +61,8 @@ pub type ExecutionContext = ServerStreamingEngine<BackendInput, ExecutionOutputS ...@@ -64,12 +61,8 @@ pub type ExecutionContext = ServerStreamingEngine<BackendInput, ExecutionOutputS
/// Backend handles resource management and orchestrates LLM execution /// Backend handles resource management and orchestrates LLM execution
#[allow(dead_code)] #[allow(dead_code)]
pub struct Backend { pub struct Backend {
mdc: ModelDeploymentCard, pub tokenizer: Tokenizer, // Handles token encoding/decoding
pub tokenizer: Tokenizer, // Handles token encoding/decoding validate_engine_decode: bool, // Enable validation of engine decoding
tok_trie: Arc<TokTrie>, // Efficient token lookup structure
eos_token_ids: Vec<TokenIdType>, // End of sequence token IDs
validate_engine_decode: bool, // Enable validation of engine decoding
mdcsum: String, // Model deployment checksum
} }
/// Internal state for managing token decoding and stream processing /// Internal state for managing token decoding and stream processing
...@@ -78,52 +71,40 @@ struct DecoderUnfoldState { ...@@ -78,52 +71,40 @@ struct DecoderUnfoldState {
stream: ManyOut<ExecutionOutputStream>, stream: ManyOut<ExecutionOutputStream>,
decoder: Decoder, decoder: Decoder,
validate_engine_decode: bool, validate_engine_decode: bool,
mdcsum: String,
} }
impl Backend { impl Backend {
pub async fn from_mdc(mdc: ModelDeploymentCard) -> Result<Arc<Self>> { pub async fn from_tokenizer(tokenizer: HfTokenizer) -> Result<Arc<Self>> {
let info = mdc.model_info.get_model_info().await?;
let tokenizer = match &mdc.tokenizer {
TokenizerKind::HfTokenizerJson(file) => {
HfTokenizer::from_file(file).map_err(Error::msg)?
}
};
let bt = ByteTokenizer::from_tokenizer(tokenizer.clone())?;
let toktrie = TokTrie::from(&bt.tokrx_info(), &bt.token_bytes());
let mdcsum = mdc.mdcsum();
let tokenizer = HuggingFaceTokenizer::from_tokenizer(tokenizer); let tokenizer = HuggingFaceTokenizer::from_tokenizer(tokenizer);
let tokenizer = Tokenizer::from(Arc::new(tokenizer)); let tokenizer = Tokenizer::from(Arc::new(tokenizer));
Ok(Arc::new(Self { Ok(Arc::new(Self {
mdc,
tokenizer, tokenizer,
tok_trie: Arc::new(toktrie),
eos_token_ids: info.eos_token_ids(),
validate_engine_decode: false, validate_engine_decode: false,
mdcsum,
})) }))
} }
pub async fn from_mdc(mdc: ModelDeploymentCard) -> Result<Arc<Self>> {
let tokenizer = match &mdc.tokenizer {
TokenizerKind::HfTokenizerJson(file) => {
HfTokenizer::from_file(file).map_err(Error::msg)?
}
TokenizerKind::GGUF(t) => *t.clone(),
};
Self::from_tokenizer(tokenizer).await
}
fn decoder( fn decoder(
&self, &self,
stream: ManyOut<ExecutionOutputStream>, stream: ManyOut<ExecutionOutputStream>,
stop_conditions: StopConditions, stop_conditions: StopConditions,
) -> DecoderUnfoldState { ) -> DecoderUnfoldState {
let decoder = Decoder::new( let decoder = Decoder::new(self.tokenizer.decode_stream(false), stop_conditions);
self.tokenizer.decode_stream(false),
stop_conditions,
self.mdcsum.clone(),
);
DecoderUnfoldState { DecoderUnfoldState {
stream, stream,
decoder, decoder,
validate_engine_decode: self.validate_engine_decode, validate_engine_decode: self.validate_engine_decode,
mdcsum: self.mdcsum.clone(),
} }
} }
} }
...@@ -223,7 +204,7 @@ impl ...@@ -223,7 +204,7 @@ impl
}); });
// convert stream of processed Annotated<LLMEngineOutput> to Annotated<BackendOutput> // convert stream of processed Annotated<LLMEngineOutput> to Annotated<BackendOutput>
let mdcsum = self.mdcsum.clone(); //let mdcsum = self.mdcsum.clone();
let stream = processed_stream.map(move |output| { let stream = processed_stream.map(move |output| {
output.map_data(|data| { output.map_data(|data| {
Ok(BackendOutput { Ok(BackendOutput {
...@@ -233,7 +214,7 @@ impl ...@@ -233,7 +214,7 @@ impl
cum_log_probs: data.cum_log_probs, cum_log_probs: data.cum_log_probs,
log_probs: data.log_probs, log_probs: data.log_probs,
finish_reason: data.finish_reason, finish_reason: data.finish_reason,
mdcsum: mdcsum.clone(), //mdcsum: mdcsum.clone(),
}) })
}) })
}); });
...@@ -275,9 +256,8 @@ pub struct Decoder { ...@@ -275,9 +256,8 @@ pub struct Decoder {
// the number of bytes currently jailed // the number of bytes currently jailed
jailed_bytes: usize, jailed_bytes: usize,
// mdcsum // mdcsum
mdcsum: String, //mdcsum: String,
} }
#[allow(dead_code)] #[allow(dead_code)]
...@@ -331,7 +311,7 @@ impl Decoder { ...@@ -331,7 +311,7 @@ impl Decoder {
pub fn new( pub fn new(
decode_stream: DecodeStream, decode_stream: DecodeStream,
stop_condition: StopConditions, stop_condition: StopConditions,
mdcsum: String, //mdcsum: String,
) -> Self { ) -> Self {
let hidden_stop_ids: HashSet<TokenIdType> = stop_condition let hidden_stop_ids: HashSet<TokenIdType> = stop_condition
.stop_token_ids_hidden .stop_token_ids_hidden
...@@ -364,7 +344,6 @@ impl Decoder { ...@@ -364,7 +344,6 @@ impl Decoder {
jail: String::new(), jail: String::new(),
jail_max_bytes, jail_max_bytes,
jailed_bytes: 0, jailed_bytes: 0,
mdcsum,
} }
} }
......
...@@ -38,8 +38,6 @@ mod worker; ...@@ -38,8 +38,6 @@ mod worker;
pub async fn make_leader_engine( pub async fn make_leader_engine(
cancel_token: CancellationToken, cancel_token: CancellationToken,
// Where to find the tokenzier, and config.json
card_path: &Path,
// Full path to the model, either a GGUF file or an HF repo dir // Full path to the model, either a GGUF file or an HF repo dir
model_path: &Path, model_path: &Path,
// Unique string to name zmq sockets // Unique string to name zmq sockets
...@@ -63,7 +61,6 @@ pub async fn make_leader_engine( ...@@ -63,7 +61,6 @@ pub async fn make_leader_engine(
let mut engine = VllmEngine::new( let mut engine = VllmEngine::new(
cancel_token, cancel_token,
sock_code, sock_code,
card_path,
model_path, model_path,
node_conf, node_conf,
tensor_parallel_size, tensor_parallel_size,
......
...@@ -35,7 +35,6 @@ impl VllmEngine { ...@@ -35,7 +35,6 @@ impl VllmEngine {
pub async fn new( pub async fn new(
cancel_token: CancellationToken, cancel_token: CancellationToken,
sock_code: &str, sock_code: &str,
card_path: &Path,
model_path: &Path, model_path: &Path,
node_conf: MultiNodeConfig, node_conf: MultiNodeConfig,
tensor_parallel_size: u32, tensor_parallel_size: u32,
...@@ -43,7 +42,6 @@ impl VllmEngine { ...@@ -43,7 +42,6 @@ impl VllmEngine {
let w = worker::start( let w = worker::start(
cancel_token.clone(), cancel_token.clone(),
sock_code, sock_code,
card_path,
model_path, model_path,
node_conf, node_conf,
tensor_parallel_size, tensor_parallel_size,
......
...@@ -30,9 +30,8 @@ from vllm.usage.usage_lib import UsageContext ...@@ -30,9 +30,8 @@ from vllm.usage.usage_lib import UsageContext
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model=f"{model_path}", model=f"{model_path}",
served_model_name=None, served_model_name=None,
tokenizer=f"{tokenizer_path}",
task='generate', task='generate',
tokenizer_mode='auto', skip_tokenizer_init=True,
seed=0, seed=0,
max_model_len=8192, max_model_len=8192,
max_seq_len_to_capture=8192, max_seq_len_to_capture=8192,
...@@ -43,7 +42,16 @@ engine_args = AsyncEngineArgs( ...@@ -43,7 +42,16 @@ engine_args = AsyncEngineArgs(
ipc_path = f"ipc:///tmp/{socket_id}"; ipc_path = f"ipc:///tmp/{socket_id}";
engine_alive = multiprocessing.Value('b', True, lock=False) engine_alive = multiprocessing.Value('b', True, lock=False)
# 0.7.3
run_mp_engine(engine_args, UsageContext.OPENAI_API_SERVER, ipc_path, engine_alive) run_mp_engine(engine_args, UsageContext.OPENAI_API_SERVER, ipc_path, engine_alive)
# 0.8.1
# TODO: In 0.8+ first argument is VllmConfig, not AsyncEngineArgs
# disable_log_stats = False
# disable_log_requests = True
# run_mp_engine(engine_args, UsageContext.OPENAI_API_SERVER, ipc_path, disable_log_stats, disable_log_requests, engine_alive)
"#; "#;
/// Start the Python vllm engine that listens on zmq socket /// Start the Python vllm engine that listens on zmq socket
...@@ -51,7 +59,6 @@ run_mp_engine(engine_args, UsageContext.OPENAI_API_SERVER, ipc_path, engine_aliv ...@@ -51,7 +59,6 @@ run_mp_engine(engine_args, UsageContext.OPENAI_API_SERVER, ipc_path, engine_aliv
/// This does not return until vllm exits. /// This does not return until vllm exits.
pub fn run_subprocess( pub fn run_subprocess(
socket_id: &str, socket_id: &str,
model_card_path: &Path,
model_path: &Path, model_path: &Path,
node_config: MultiNodeConfig, node_config: MultiNodeConfig,
tp_size: u32, tp_size: u32,
...@@ -60,12 +67,10 @@ pub fn run_subprocess( ...@@ -60,12 +67,10 @@ pub fn run_subprocess(
if let Ok(venv) = env::var("VIRTUAL_ENV") { if let Ok(venv) = env::var("VIRTUAL_ENV") {
let _ = Python::with_gil(|py| crate::engines::fix_venv(venv, py)); let _ = Python::with_gil(|py| crate::engines::fix_venv(venv, py));
} }
let card = model_card_path.display().to_string();
let model_path_str = model_path.display().to_string(); let model_path_str = model_path.display().to_string();
Python::with_gil(|py| { Python::with_gil(|py| {
let locals = [ let locals = [
("socket_id", socket_id), ("socket_id", socket_id),
("tokenizer_path", card.as_str()),
("model_path", model_path_str.as_str()), ("model_path", model_path_str.as_str()),
("tp_size_str", &tp_size.to_string()), ("tp_size_str", &tp_size.to_string()),
("nnodes_str", &node_config.num_nodes.to_string()), ("nnodes_str", &node_config.num_nodes.to_string()),
......
...@@ -160,7 +160,6 @@ struct Logprob { ...@@ -160,7 +160,6 @@ struct Logprob {
pub async fn start( pub async fn start(
cancel_token: CancellationToken, cancel_token: CancellationToken,
sock_code: &str, sock_code: &str,
card_path: &Path,
model_path: &Path, model_path: &Path,
_node_conf: MultiNodeConfig, _node_conf: MultiNodeConfig,
tensor_parallel_size: u32, tensor_parallel_size: u32,
...@@ -180,14 +179,7 @@ pub async fn start( ...@@ -180,14 +179,7 @@ pub async fn start(
metrics, metrics,
} = zmq_sockets(sock_code)?; } = zmq_sockets(sock_code)?;
let vllm_process = start_vllm( let vllm_process = start_vllm(model_path, &py_imports, data, tensor_parallel_size).await?;
card_path,
model_path,
&py_imports,
data,
tensor_parallel_size,
)
.await?;
let vllm_join_handle = watch_vllm(cancel_token.clone(), vllm_process); let vllm_join_handle = watch_vllm(cancel_token.clone(), vllm_process);
tokio::spawn(heartbeat_loop(cancel_token.clone(), heartbeat)); tokio::spawn(heartbeat_loop(cancel_token.clone(), heartbeat));
...@@ -308,7 +300,6 @@ fn zmq_sockets(sock_code: &str) -> anyhow::Result<Sockets> { ...@@ -308,7 +300,6 @@ fn zmq_sockets(sock_code: &str) -> anyhow::Result<Sockets> {
/// Start the vllm python sub-process and wait for it to start /// Start the vllm python sub-process and wait for it to start
async fn start_vllm( async fn start_vllm(
card_path: &Path,
model_path: &Path, model_path: &Path,
python_imports: &Imports, python_imports: &Imports,
mut data_socket: async_zmq::Dealer<IntoIter<Vec<u8>>, Vec<u8>>, mut data_socket: async_zmq::Dealer<IntoIter<Vec<u8>>, Vec<u8>>,
...@@ -316,7 +307,6 @@ async fn start_vllm( ...@@ -316,7 +307,6 @@ async fn start_vllm(
) -> anyhow::Result<tokio::process::Child> { ) -> anyhow::Result<tokio::process::Child> {
let vllm_args = [ let vllm_args = [
"--internal-vllm-process", "--internal-vllm-process",
&format!("--model-config={}", card_path.display()),
&format!("--model-path={}", model_path.display()), &format!("--model-path={}", model_path.display()),
&format!("--tensor-parallel-size={tensor_parallel_size}"), &format!("--tensor-parallel-size={tensor_parallel_size}"),
]; ];
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Adapted from mistral.rs
//
// MIT License
//
// Copyright (c) 2025 Eric Buehler
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
mod content;
mod gguf_metadata;
mod gguf_tokenizer;
use strum::EnumString;
use anyhow::{Context, Result};
pub(crate) use content::Content;
pub(crate) use gguf_metadata::ContentConfig;
pub(crate) use gguf_tokenizer::convert_gguf_to_hf_tokenizer;
use std::str::FromStr;
pub const GGUF_MULTI_FILE_DELIMITER: &str = " ";
/// Model architectures recognised in a GGUF file's `general.architecture` metadata.
///
/// The strum `serialize_all = "lowercase"` attribute makes both the derived
/// `FromStr` (via `EnumString`) and `Display` implementations use the
/// lowercase form of each variant name, e.g. `Llama` <-> `"llama"`.
#[derive(Debug, EnumString, Clone, Copy, strum::Display)]
#[strum(serialize_all = "lowercase")]
pub enum GGUFArchitecture {
Llama,
Mpt,
Gptneox,
Gptj,
Gpt2,
Bloom,
Falcon,
Mamba,
Rwkv,
Phi2,
Phi3,
Starcoder2,
Qwen2,
}
// Wraps from_str() for some convenience:
// - Case-insensitive variant matching (TODO: is this desirable?)
// - Customized error until potential upstream support: https://github.com/Peternator7/strum/issues/332
impl GGUFArchitecture {
pub fn from_value<T: AsRef<str> + std::fmt::Display>(value: T) -> Result<Self> {
Self::from_str(&value.as_ref().to_ascii_lowercase())
.with_context(|| format!("Unknown GGUF architecture `{value}`"))
.map_err(anyhow::Error::msg)
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Adapted from mistral.rs
//
// MIT License
//
// Copyright (c) 2025 Eric Buehler
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
use std::collections::HashMap;
use anyhow::Context;
use candle_core::{
quantized::gguf_file::{self, Value},
Result,
};
use tracing::info;
use super::GGUFArchitecture;
// Internal invariant: contents and readers must be paired.
/// This abstracts the files for a GGUF model and enables multiple files to be used.
pub struct Content {
/// Parsed GGUF contents, one entry per reader passed to `from_readers`.
/// Only stored, never read back within this file (hence the leading underscore).
_contents: Vec<gguf_file::Content>,
/// Architecture parsed from the `general.architecture` metadata key.
arch: GGUFArchitecture,
/// Metadata of every file merged into one map; on duplicate keys the entry
/// from the later file replaces the earlier one.
all_metadata: HashMap<String, Value>,
}
impl Content {
/// Create a `Content` from a set of file readers.
pub fn from_readers<R: std::io::Seek + std::io::Read>(readers: &mut [&mut R]) -> Result<Self> {
let mut contents = Vec::new();
let n_readers = readers.len();
for reader in readers.iter_mut() {
contents.push(gguf_file::Content::read(reader)?);
}
let n_splits = contents
.iter()
.filter_map(|ct| {
ct.metadata
.get("split.count")
.map(|val| val.to_u64().unwrap())
})
.fold(Vec::new(), |mut accum, x| {
if !accum.contains(&x) {
accum.push(x);
}
accum
});
if n_splits.len() > 1 {
candle_core::bail!("GGUF files have differing `split.count` values: {n_splits:?}. Perhaps the GGUF files do not match?");
}
#[allow(clippy::cast_possible_truncation)]
if !n_splits.is_empty() && n_readers != n_splits[0] as usize {
candle_core::bail!(
"Number of GGUF files does not match the number of splits, expected {} files.",
n_splits[0]
);
} else if n_splits.len() == 1 {
info!("GGUF file has been split into {} shards", n_splits[0]);
}
let mut arch = None;
for ct in &contents {
if !ct.metadata.contains_key("general.architecture") {
continue;
}
arch = Some(
ct.metadata["general.architecture"]
.to_string()
.context("Model metadata should have declared an architecture")
.and_then(GGUFArchitecture::from_value)
.unwrap(),
);
}
let arch = arch.expect("GGUF files must specify `general.architecture`");
let mut all_metadata = HashMap::new();
for content in &contents {
all_metadata.extend(content.metadata.clone())
}
Ok(Self {
_contents: contents,
arch,
all_metadata,
})
}
pub fn arch(&self) -> GGUFArchitecture {
self.arch
}
/// Get all metadatas
pub fn get_metadata(&self) -> &HashMap<String, Value> {
&self.all_metadata
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Adapted from mistral.rs
//
// MIT License
//
// Copyright (c) 2025 Eric Buehler
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
use akin::akin;
use anyhow::ensure;
use anyhow::Result;
use candle_core::quantized::gguf_file;
use std::collections::HashMap;
use tracing::warn;
use crate::gguf::Content;
/// Model hyper-parameters read from GGUF metadata.
///
/// Each field is populated from an architecture-prefixed key (`{arch}.<suffix>`)
/// by the `From<&Content>` implementation in this file.
#[allow(dead_code)]
#[derive(Debug)]
pub struct ContentConfig {
/// From `{arch}.context_length`.
max_seq_len: usize,
/// From `{arch}.embedding_length`.
hidden_size: usize,
/// From `{arch}.attention.head_count`.
num_attn_heads: usize,
/// From `{arch}.attention.head_count_kv`.
num_kv_heads: usize,
/// From `{arch}.block_count`.
num_layers: usize,
/// From the optional `{arch}.attention.key_length`; `None` when the key is absent.
key_length: Option<usize>,
/// From the optional `{arch}.attention.value_length`; `None` when the key is absent.
value_length: Option<usize>,
}
#[allow(clippy::cast_possible_truncation)]
impl From<&Content> for ContentConfig {
    /// Build a [`ContentConfig`] from the merged GGUF metadata.
    ///
    /// # Panics
    ///
    /// Panics when `general.architecture` or any mandatory
    /// `{arch}.*` key is missing, or when a value cannot be read as `u64`
    /// (or as a string, for the architecture).
    fn from(value: &Content) -> Self {
        let metadata = value.get_metadata();
        let arch = metadata["general.architecture"].to_string().unwrap();

        // All keys are namespaced by the architecture name.
        let key = |suffix: &str| format!("{arch}.{suffix}");
        // Mandatory entry: indexing panics if absent.
        let required = |suffix: &str| metadata[&key(suffix)].to_u64().unwrap() as usize;
        // Optional entry: absent keys become `None`.
        let optional = |suffix: &str| {
            metadata
                .get(&key(suffix))
                .map(|x| x.to_u64().unwrap() as usize)
        };

        Self {
            max_seq_len: required("context_length"),
            hidden_size: required("embedding_length"),
            num_attn_heads: required("attention.head_count"),
            num_kv_heads: required("attention.head_count_kv"),
            num_layers: required("block_count"),
            key_length: optional("attention.key_length"),
            value_length: optional("attention.value_length"),
        }
    }
}
#[allow(dead_code)]
impl ContentConfig {
    /// Maximum sequence (context) length.
    pub fn max_seq_len(&self) -> usize {
        self.max_seq_len
    }

    /// Embedding / hidden dimension.
    pub fn hidden_size(&self) -> usize {
        self.hidden_size
    }

    /// Number of attention heads.
    pub fn num_attn_heads(&self) -> usize {
        self.num_attn_heads
    }

    /// Number of key/value heads.
    pub fn num_kv_heads(&self) -> usize {
        self.num_kv_heads
    }

    /// Number of layers (blocks).
    pub fn num_layers(&self) -> usize {
        self.num_layers
    }

    /// Per-head key dimension: the explicit key length when present,
    /// otherwise `hidden_size / num_attn_heads`.
    ///
    /// # Panics
    ///
    /// Panics only when no explicit key length is set and `num_attn_heads` is zero.
    pub fn k_head_dim(&self) -> usize {
        // Lazy fallback: the division must not run (and possibly divide by
        // zero) when an explicit length is available.
        self.key_length
            .unwrap_or_else(|| self.hidden_size / self.num_attn_heads)
    }

    /// Per-head value dimension: the explicit value length when present,
    /// otherwise `hidden_size / num_attn_heads`.
    ///
    /// # Panics
    ///
    /// Panics only when no explicit value length is set and `num_attn_heads` is zero.
    pub fn v_head_dim(&self) -> usize {
        self.value_length
            .unwrap_or_else(|| self.hidden_size / self.num_attn_heads)
    }
}
/// A borrowed view over GGUF metadata scoped to a key prefix.
///
/// Lookups made through this type use keys of the form `{path_prefix}.{field_name}`.
pub struct ContentMetadata<'a> {
/// Prefix prepended (with a `.` separator) to every field name that is looked up.
pub path_prefix: &'a str,
/// The metadata map that lookups are made against.
pub metadata: &'a HashMap<String, gguf_file::Value>,
}
impl ContentMetadata<'_> {
    /// Look up `{path_prefix}.{field_name}` and convert it to `T`.
    ///
    /// # Errors
    ///
    /// Fails when the key is absent or the stored value cannot be converted
    /// to `T`; the error message starts with the full key for context.
    pub fn get_value<T: TryFromValue>(&self, field_name: &str) -> Result<T, anyhow::Error> {
        let prop_key = format!("{prefix}.{field_name}", prefix = self.path_prefix);
        // `Option<Value>` also implements `TryValueInto`, so a missing key is
        // reported through the same conversion path as a wrong type.
        let found = self.metadata.get(&prop_key).cloned();
        match found.try_value_into() {
            Ok(converted) => Ok(converted),
            Err(e) => anyhow::bail!("`{prop_key}` `{e}`"),
        }
    }

    /// Verify up front that every field in `fields` exists under the prefix.
    ///
    /// Every missing key is logged as a warning before a single error is
    /// returned, so all problems are reported in one pass.
    pub fn has_required_keys(&self, fields: &[&str]) -> Result<()> {
        let mut missing = 0usize;
        for field_name in fields {
            let prop_key = format!("{prefix}.{field_name}", prefix = self.path_prefix);
            if self.metadata.contains_key(&prop_key) {
                continue;
            }
            missing += 1;
            warn!("Expected GGUF metadata to have key: `{prop_key}`");
        }
        ensure!(missing == 0, "Tokenizer is missing required props");
        Ok(())
    }
}
// These traits below are a workaround for converting candles GGUF `Value` enum type wrapper.
// A better upstream approach would instead be to provide serialize/deserialize support?
/// Conversion from an owned GGUF `Value` into a concrete Rust type.
pub trait TryFromValue {
/// Convert `value` into `Self`, returning a `candle_core::Error` when the
/// value cannot be converted.
fn try_from_value(value: gguf_file::Value) -> Result<Self, candle_core::Error>
where
Self: Sized;
}
// Value wrapped types, each has a different conversion method:
// NOTE: Type conversion methods internally bail with "not a <into type> <input value>"
// https://docs.rs/candle-core/latest/candle_core/quantized/gguf_file/enum.Value.html#variants
// Generates an `impl TryFromValue for <type>` block for the entries of `types`.
// NOTE(review): this relies on `akin` substituting `*types` and `*to_type` in
// lockstep (the n-th type with the n-th accessor), so the two lists must stay
// the same length and order - confirm against the akin documentation.
// Any accessor failure is replaced by a "value is not a `<type>`" error.
akin! {
let &types = [String, bool, f32, f64, i8, i16, i32, i64, u8, u16, u32, u64];
let &to_type = [
value.to_string().cloned(),
value.to_bool(),
value.to_f32(),
value.to_f64(),
value.to_i8(),
value.to_i16(),
value.to_i32(),
value.to_i64(),
value.to_u8(),
value.to_u16(),
value.to_u32(),
value.to_u64(),
];
impl TryFromValue for *types {
fn try_from_value(value: gguf_file::Value) -> Result<Self, candle_core::Error> {
*to_type.or_else(|_| candle_core::bail!("value is not a `*types`"))
}
}
}
// Vec<Value> to Vec<T> from above types:
impl<T: TryFromValue> TryFromValue for Vec<T> {
    /// Convert a GGUF array value into a `Vec<T>`, converting each element
    /// in turn and stopping at the first element that fails.
    fn try_from_value(value_vec: gguf_file::Value) -> Result<Self, candle_core::Error> {
        let items = match value_vec.to_vec() {
            Ok(items) => items,
            Err(_) => candle_core::bail!("value is not a `Vec`"),
        };
        // `try_from_value` takes ownership, so each element is cloned out of
        // the borrowed array.
        items.iter().cloned().map(T::try_from_value).collect()
    }
}
/// Counterpart of `TryFromValue`, called on the source value rather than the target type.
///
/// Implemented in this file for `gguf_file::Value` and `Option<gguf_file::Value>`.
pub trait TryValueInto<T>: Sized {
/// Consume `self` and attempt to convert it into `T`.
fn try_value_into(self) -> Result<T, candle_core::Error>;
}
// Any `Value` converts into any `T: TryFromValue` by delegating to `T::try_from_value`.
impl<T: TryFromValue> TryValueInto<T> for gguf_file::Value {
fn try_value_into(self) -> Result<T, candle_core::Error> {
T::try_from_value(self)
}
}
impl<T: TryFromValue> TryValueInto<T> for Option<gguf_file::Value> {
    /// Convert the contained value, or fail when the option is `None`.
    fn try_value_into(self) -> Result<T, candle_core::Error> {
        let Some(value) = self else {
            candle_core::bail!("Expected `Option<gguf_file::Value>` to contain a value")
        };
        value.try_value_into()
    }
}
This diff is collapsed.
...@@ -22,6 +22,7 @@ pub mod backend; ...@@ -22,6 +22,7 @@ pub mod backend;
pub mod common; pub mod common;
pub mod disagg_router; pub mod disagg_router;
pub mod engines; pub mod engines;
pub mod gguf;
pub mod http; pub mod http;
pub mod kv_router; pub mod kv_router;
pub mod model_card; pub mod model_card;
......
...@@ -20,7 +20,7 @@ use anyhow::{Context, Result}; ...@@ -20,7 +20,7 @@ use anyhow::{Context, Result};
use std::fs; use std::fs;
use std::path::Path; use std::path::Path;
use crate::model_card::model::{File, ModelInfoType, PromptFormatterArtifact, TokenizerKind}; use crate::model_card::model::{ModelInfoType, PromptFormatterArtifact, TokenizerKind};
impl ModelDeploymentCard { impl ModelDeploymentCard {
/// Creates a ModelDeploymentCard from a local directory path. /// Creates a ModelDeploymentCard from a local directory path.
...@@ -58,6 +58,33 @@ impl ModelDeploymentCard { ...@@ -58,6 +58,33 @@ impl ModelDeploymentCard {
Self::from_repo(&repo_id, model_name).await Self::from_repo(&repo_id, model_name).await
} }
pub async fn from_gguf(gguf_file: &Path, model_name: Option<&str>) -> anyhow::Result<Self> {
let model_name = model_name.map(|s| s.to_string()).or_else(|| {
gguf_file
.iter()
.last()
.map(|n| n.to_string_lossy().to_string())
});
let Some(model_name) = model_name else {
// I think this would only happy on an empty path
anyhow::bail!(
"Could not extract model name from path '{}'",
gguf_file.display()
);
};
Ok(Self {
display_name: model_name.to_string(),
service_name: model_name.to_string(),
model_info: ModelInfoType::GGUF(gguf_file.to_path_buf()),
tokenizer: TokenizerKind::from_gguf(gguf_file)?,
prompt_formatter: Some(PromptFormatterArtifact::GGUF(gguf_file.to_path_buf())),
prompt_context: None, // TODO - auto-detect prompt context
revision: 0,
last_published: None,
requires_preprocessing: true,
})
}
/// TODO: This will be implemented after nova-hub is integrated with the model-card /// TODO: This will be implemented after nova-hub is integrated with the model-card
/// TODO: Attempt to auto-detect model type and construct an MDC from a NGC repo /// TODO: Attempt to auto-detect model type and construct an MDC from a NGC repo
pub async fn from_ngc_repo(_: &str) -> anyhow::Result<Self> { pub async fn from_ngc_repo(_: &str) -> anyhow::Result<Self> {
...@@ -127,7 +154,7 @@ impl TokenizerKind { ...@@ -127,7 +154,7 @@ impl TokenizerKind {
} }
/// Checks if the provided path contains the expected file. /// Checks if the provided path contains the expected file.
async fn check_for_file(repo_id: &str, file: &str) -> anyhow::Result<File> { async fn check_for_file(repo_id: &str, file: &str) -> anyhow::Result<String> {
let mut files = check_for_files(repo_id, vec![file.to_string()]).await?; let mut files = check_for_files(repo_id, vec![file.to_string()]).await?;
let file = files let file = files
.remove(file) .remove(file)
...@@ -135,7 +162,7 @@ async fn check_for_file(repo_id: &str, file: &str) -> anyhow::Result<File> { ...@@ -135,7 +162,7 @@ async fn check_for_file(repo_id: &str, file: &str) -> anyhow::Result<File> {
Ok(file) Ok(file)
} }
async fn check_for_files(repo_id: &str, files: Vec<String>) -> Result<HashMap<String, File>> { async fn check_for_files(repo_id: &str, files: Vec<String>) -> Result<HashMap<String, String>> {
let dir_entries = let dir_entries =
fs::read_dir(repo_id).with_context(|| format!("Failed to read directory: {}", repo_id))?; fs::read_dir(repo_id).with_context(|| format!("Failed to read directory: {}", repo_id))?;
let mut found_files = HashMap::new(); let mut found_files = HashMap::new();
......
...@@ -25,19 +25,21 @@ ...@@ -25,19 +25,21 @@
//! - Prompt formatter settings (PromptFormatterArtifact) //! - Prompt formatter settings (PromptFormatterArtifact)
//! - Various metadata like revision, publish time, etc. //! - Various metadata like revision, publish time, etc.
use crate::protocols::TokenIdType;
use anyhow::Result;
use either::Either;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use std::fmt; use std::fmt;
use std::path::Path; use std::fs::File;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
use anyhow::{Context, Result};
use derive_builder::Builder; use derive_builder::Builder;
use dynamo_runtime::slug::Slug; use dynamo_runtime::slug::Slug;
use either::Either;
use serde::{Deserialize, Serialize};
use tokenizers::Tokenizer as HfTokenizer;
use crate::gguf::{Content, ContentConfig};
use crate::protocols::TokenIdType;
pub const BUCKET_NAME: &str = "mdc"; pub const BUCKET_NAME: &str = "mdc";
...@@ -48,18 +50,18 @@ pub const BUCKET_TTL: Duration = Duration::from_secs(5 * 60); ...@@ -48,18 +50,18 @@ pub const BUCKET_TTL: Duration = Duration::from_secs(5 * 60);
/// If a model deployment card hasn't been refreshed in this much time the worker is likely gone /// If a model deployment card hasn't been refreshed in this much time the worker is likely gone
const CARD_MAX_AGE: chrono::TimeDelta = chrono::TimeDelta::minutes(5); const CARD_MAX_AGE: chrono::TimeDelta = chrono::TimeDelta::minutes(5);
pub type File = String;
#[derive(Serialize, Deserialize, Clone, Debug)] #[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
pub enum ModelInfoType { pub enum ModelInfoType {
HfConfigJson(File), HfConfigJson(String),
GGUF(PathBuf),
} }
#[derive(Serialize, Deserialize, Clone, Debug)] #[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
pub enum TokenizerKind { pub enum TokenizerKind {
HfTokenizerJson(File), HfTokenizerJson(String),
GGUF(Box<HfTokenizer>),
} }
/// Supported types of prompt formatters. /// Supported types of prompt formatters.
...@@ -77,7 +79,8 @@ pub enum TokenizerKind { ...@@ -77,7 +79,8 @@ pub enum TokenizerKind {
#[derive(Serialize, Deserialize, Clone, Debug)] #[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
pub enum PromptFormatterArtifact { pub enum PromptFormatterArtifact {
HfTokenizerConfigJson(File), HfTokenizerConfigJson(String),
GGUF(PathBuf),
} }
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash)] #[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash)]
...@@ -194,6 +197,15 @@ impl ModelDeploymentCard { ...@@ -194,6 +197,15 @@ impl ModelDeploymentCard {
false false
} }
} }
pub fn tokenizer_hf(&self) -> anyhow::Result<HfTokenizer> {
match &self.tokenizer {
TokenizerKind::HfTokenizerJson(file) => {
HfTokenizer::from_file(file).map_err(anyhow::Error::msg)
}
TokenizerKind::GGUF(t) => Ok(*t.clone()),
}
}
} }
impl fmt::Display for ModelDeploymentCard { impl fmt::Display for ModelDeploymentCard {
...@@ -221,13 +233,14 @@ pub trait ModelInfo: Send + Sync { ...@@ -221,13 +233,14 @@ pub trait ModelInfo: Send + Sync {
impl ModelInfoType { impl ModelInfoType {
pub async fn get_model_info(&self) -> Result<Arc<dyn ModelInfo>> { pub async fn get_model_info(&self) -> Result<Arc<dyn ModelInfo>> {
match self { match self {
Self::HfConfigJson(info) => HFConfigJsonFile::from_file(info).await, Self::HfConfigJson(info) => HFConfig::from_json_file(info).await,
Self::GGUF(path) => HFConfig::from_gguf(path),
} }
} }
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
struct HFConfigJsonFile { struct HFConfig {
bos_token_id: TokenIdType, bos_token_id: TokenIdType,
#[serde(with = "either::serde_untagged")] #[serde(with = "either::serde_untagged")]
...@@ -253,15 +266,46 @@ struct HFConfigJsonFile { ...@@ -253,15 +266,46 @@ struct HFConfigJsonFile {
vocab_size: usize, vocab_size: usize,
} }
impl HFConfigJsonFile { impl HFConfig {
async fn from_file(file: &File) -> Result<Arc<dyn ModelInfo>> { async fn from_json_file(file: &String) -> Result<Arc<dyn ModelInfo>> {
let contents = std::fs::read_to_string(file)?; let contents = std::fs::read_to_string(file)?;
let config: Self = serde_json::from_str(&contents)?; let config: Self = serde_json::from_str(&contents)?;
Ok(Arc::new(config)) Ok(Arc::new(config))
} }
/// Build model info from the metadata of a GGUF file.
///
/// # Errors
///
/// Fails when the file cannot be loaded, when `{arch}.block_count`,
/// `tokenizer.ggml.bos_token_id`, `tokenizer.ggml.eos_token_id` or
/// `tokenizer.ggml.tokens` is missing, or when one of them has an
/// unexpected type.
///
/// # Panics
///
/// The `ContentConfig` conversion still panics when its own mandatory keys
/// (e.g. `{arch}.context_length`) are absent.
fn from_gguf(gguf_file: &Path) -> Result<Arc<dyn ModelInfo>> {
    let content = load_gguf(gguf_file)?;
    let metadata = content.get_metadata();
    let arch = content.arch().to_string();

    // Checked lookup: a missing key becomes an error naming the file and key
    // instead of a panic from `HashMap` indexing.
    let required = |key: &str| {
        metadata.get(key).with_context(|| {
            format!(
                "GGUF file '{}' is missing metadata key `{key}`",
                gguf_file.display()
            )
        })
    };

    let num_hidden_layers = required(&format!("{arch}.block_count"))?.to_u32()? as usize;
    let bos_token_id = required("tokenizer.ggml.bos_token_id")?.to_u32()?;
    let eos_token_id = required("tokenizer.ggml.eos_token_id")?.to_u32()?;
    // to_vec returns a Vec that's already there, so it's cheap
    let vocab_size = required("tokenizer.ggml.tokens")?.to_vec()?.len();

    let model_config_metadata: ContentConfig = (&content).into();

    Ok(Arc::new(HFConfig {
        bos_token_id,
        eos_token_id: Either::Left(eos_token_id),
        architectures: vec![format!("{}ForCausalLM", capitalize(&arch))],
        // "general.architecture"
        model_type: arch,
        // "llama.context_length"
        max_position_embeddings: model_config_metadata.max_seq_len(),
        // "llama.block_count"
        num_hidden_layers,
        // "llama.attention.head_count"
        num_attention_heads: model_config_metadata.num_attn_heads(),
        // "tokenizer.ggml.tokens".len()
        vocab_size,
    }))
}
} }
impl ModelInfo for HFConfigJsonFile { impl ModelInfo for HFConfig {
fn model_type(&self) -> String { fn model_type(&self) -> String {
self.model_type.clone() self.model_type.clone()
} }
...@@ -285,3 +329,33 @@ impl ModelInfo for HFConfigJsonFile { ...@@ -285,3 +329,33 @@ impl ModelInfo for HFConfigJsonFile {
self.vocab_size self.vocab_size
} }
} }
impl TokenizerKind {
    /// Build a tokenizer from the metadata embedded in a GGUF file.
    ///
    /// # Errors
    ///
    /// Fails when the file cannot be loaded or its tokenizer metadata cannot
    /// be converted; conversion errors carry the file path as context.
    pub fn from_gguf(gguf_file: &Path) -> anyhow::Result<Self> {
        let content = load_gguf(gguf_file)?;
        let conversion = crate::gguf::convert_gguf_to_hf_tokenizer(&content);
        let converted = conversion.with_context(|| gguf_file.display().to_string())?;
        Ok(Self::GGUF(Box::new(converted.tokenizer)))
    }
}
/// Open a GGUF file and parse its contents.
///
/// # Errors
///
/// Fails when the file cannot be opened or parsed; both errors carry the
/// file path as context.
fn load_gguf(gguf_file: &Path) -> anyhow::Result<Content> {
    let filename = gguf_file.display().to_string();
    let file = File::open(gguf_file).with_context(|| filename.clone())?;
    // The GGUF header is parsed with many small reads; buffer them so each
    // one does not hit the OS.
    let mut reader = std::io::BufReader::new(file);
    // vec because GGUF can be split into multiple files (shards)
    let mut readers = vec![&mut reader];
    crate::gguf::Content::from_readers(&mut readers).with_context(|| filename)
}
/// Upper-case the first character of `s` and lower-case every other character.
///
/// Case mapping is done per `char`, so a character whose mapping expands
/// to several characters (e.g. `ß` -> `SS`) is expanded in the output.
fn capitalize(s: &str) -> String {
    let mut rest = s.chars();
    let mut out = String::with_capacity(s.len());
    if let Some(first) = rest.next() {
        out.extend(first.to_uppercase());
    }
    for c in rest {
        out.extend(c.to_lowercase());
    }
    out
}
...@@ -74,6 +74,9 @@ impl OpenAIPreprocessor { ...@@ -74,6 +74,9 @@ impl OpenAIPreprocessor {
let tokenizer = match &mdc.tokenizer { let tokenizer = match &mdc.tokenizer {
TokenizerKind::HfTokenizerJson(file) => HuggingFaceTokenizer::from_file(file)?, TokenizerKind::HfTokenizerJson(file) => HuggingFaceTokenizer::from_file(file)?,
TokenizerKind::GGUF(tokenizer) => {
HuggingFaceTokenizer::from_tokenizer(*tokenizer.clone())
}
}; };
let tokenizer = Arc::new(tokenizer); let tokenizer = Arc::new(tokenizer);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment