Unverified Commit 1b1265e6 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

chore: Remove GGUF support (#3488)


Signed-off-by: Graham King <grahamk@nvidia.com>
parent 0c4c4d1d
......@@ -34861,23 +34861,6 @@ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR I
 
```
 
## ggus - 0.4.0
**Repository URL**: https://github.com/InfiniTensor/gguf
**License Type(s)**: MIT
### License: https://raw.githubusercontent.com/InfiniTensor/gguf/HEAD/LICENSE
```
The MIT License (MIT)
Copyright © 2024 YdrMaster
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
```
## gif - 0.13.1
**Repository URL**: https://github.com/image-rs/image-gif
**License Type(s)**: Apache-2.0 OR MIT
......@@ -1958,15 +1958,6 @@ dependencies = [
"crypto-common",
]
[[package]]
name = "digit-layout"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09157630eece4139f6cc5a457556d308c3465ecd5af492f0e5aadc043997e2ce"
dependencies = [
"half 2.6.0",
]
[[package]]
name = "dircpy"
version = "0.3.19"
......@@ -2168,14 +2159,12 @@ dependencies = [
"futures",
"futures-util",
"galil-seiferas",
"ggus",
"hf-hub",
"humantime",
"insta",
"itertools 0.14.0",
"json-five",
"lazy_static",
"memmap2",
"minijinja",
"minijinja-contrib",
"modelexpress-client",
......@@ -3235,30 +3224,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "ggml-quants"
version = "0.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a27693512784e0786212eb0bef841779a6337d2d04520ed475b4d5a864f98366"
dependencies = [
"digit-layout",
"half 2.6.0",
"rayon",
]
[[package]]
name = "ggus"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ac5654356c6f7f6116905aeaf92ab002c3d03414ada5dbe0bb2e32aa5fea173"
dependencies = [
"fancy-regex 0.14.0",
"ggml-quants",
"indexmap 2.11.4",
"log",
"num_enum",
]
[[package]]
name = "gif"
version = "0.13.3"
......
......@@ -11,12 +11,6 @@
#
# Pass `--interactive` or `-i` for text chat instead of HTTP server.
#
# For static mode (no etcd auto-discovery):
# - python -m dynamo.frontend --model-name Qwen3-0.6B-Q8_0.gguf --model-path ~/llms/Qwen3-0.6B --static-endpoint dynamo.backend.generate
# Worker example:
# - cd lib/bindings/python/examples/hello_world
# - python server_sglang_static.py
#
# For TLS:
# - python -m dynamo.frontend --http-port 8443 --tls-cert-path cert.pem --tls-key-path key.pem
#
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Usage: `python -m dynamo.mocker --model-path /data/models/Qwen3-0.6B-Q8_0.gguf`
# Usage: `python -m dynamo.mocker --model-path /data/models/Qwen3-0.6B`
# Now supports vLLM-style individual arguments for MockEngineArgs
import argparse
......
......@@ -61,7 +61,6 @@ if __name__ == "__main__":
The `model_path` can be:
- A HuggingFace repo ID, optionally prefixed with `hf://`. It is downloaded and cached locally.
- The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`.
- The path to a GGUF file, if your engine supports that.
The `model_input` can be:
- ModelInput.Tokens. Your engine expects pre-processed input (token IDs). Dynamo handles tokenization and pre-processing.
......@@ -72,7 +71,7 @@ The `model_type` can be:
- ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions).
`register_llm` can also take the following kwargs:
- `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name, the folder name, or the GGUF file name.
- `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name or the folder name.
- `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM.
- `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16.
- `migration_limit`: Maximum number of times a request may be [migrated to another Instance](../architecture/request_migration.md). Defaults to 0.
......
# Dynamo Run
`dynamo-run` is a Rust binary that lets you easily run a model, explore the Dynamo components, and demonstrates the Rust API. It supports the `mistral.rs` and `llama.cpp` engines. `mistralrs` is the default for safe tensors, `llama.cpp` for GGUF files.
`dynamo-run` is a Rust binary that lets you easily run a model, explore the Dynamo components, and demonstrates the Rust API. It supports the `mistral.rs` and `llama.cpp` engines. `mistralrs` is the default engine.
It is primarily for development and rapid prototyping. For production use we recommend the Python wrapped components, see the main project README.
......@@ -33,7 +33,7 @@ dynamo-run out=<engine> <HUGGING_FACE_ORGANIZATION/MODEL_NAME>
For gated models (such as meta-llama/Llama-3.2-3B-Instruct), you must set an `HF_TOKEN` environment variable.
The parameter can be the ID of a HuggingFace repository (which will be downloaded), a GPT-Generated Unified Format (GGUF) file, or a folder containing safetensors, config.json, or similar (perhaps a locally checked out HuggingFace repository).
The parameter can be the ID of a HuggingFace repository (which will be downloaded) or a folder containing safetensors, config.json, or similar (perhaps a locally checked out HuggingFace repository).
### Run a model from local file
......@@ -44,29 +44,23 @@ To run a model from local file:
See the following sections for details.
#### Download model from Hugging Face
One of the models available from Hugging Face should be high quality and fast on almost any machine: https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF
For example, try https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/blob/main/Llama-3.2-3B-Instruct-Q4_K_M.gguf
To download model file:
```
curl -L -o Llama-3.2-3B-Instruct-Q4_K_M.gguf "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_K_M.gguf?download=true"
```
This model, available from Hugging Face, should be high quality and fast on almost any machine: https://huggingface.co/Qwen/Qwen3-0.6B
To run the model:
*Text interface*
```
dynamo-run Llama-3.2-3B-Instruct-Q4_K_M.gguf # or path to a Hugging Face repo checkout instead of the GGUF file
dynamo-run Qwen/Qwen3-0.6B
```
You can also pipe a prompt into `dynamo-run`:
```
echo 'What is the capital of Tuvalu?' | dynamo-run ~/llms/Qwen3-0.6B-Q8_0.gguf --context-length 4096
echo 'What is the capital of Tuvalu?' | dynamo-run Qwen/Qwen3-0.6B --context-length 4096
```
*HTTP interface*
```
dynamo-run in=http out=mistralrs Llama-3.2-3B-Instruct-Q4_K_M.gguf
dynamo-run in=http out=mistralrs Qwen/Qwen3-0.6B
```
You can also list models or send a request:
......@@ -77,7 +71,7 @@ curl localhost:8080/v1/models
*Send a request*
```
curl -d '{"model": "Llama-3.2-3B-Instruct-Q4_K_M", "max_completion_tokens": 2049, "messages":[{"role":"user", "content": "What is the capital of South Africa?" }]}' -H 'Content-Type: application/json' http://localhost:8080/v1/chat/completions
curl -d '{"model": "Qwen/Qwen3-0.6B", "max_completion_tokens": 2049, "messages":[{"role":"user", "content": "What is the capital of South Africa?" }]}' -H 'Content-Type: application/json' http://localhost:8080/v1/chat/completions
```
## Distributed System
......@@ -139,17 +133,6 @@ In the P/D disaggregated setup you would have `deepseek-distill-llama8b.prefill.
For output it is always only `out=auto`. This tells Dynamo to auto-discover the instances, group them by model, and load balance appropriately (depending on `--router-mode` flag). The exception is static workers, see that section.
### Static workers without etcd
Normally in the distributed system the frontend uses etcd to discover workers. The option exists to have a static endpoint without etcd.
```
Node 1: dynamo-run in=http out=dyn://dynamo.backend.generate --model-name Qwen3-0.6B-Q8_0.gguf --model-path ~/llms/Qwen3-0.6B
Node 2: dynamo-run in=dyn://dynamo.backend.generate out=llamacpp ~/llms/Qwen3-0.6B-Q8_0.gguf --static-worker --context-length 4096
```
Note how `out=` points to a single endpoint, which must match the worker. The model's name and config (to do pre-processing) are usually discovered by the frontend via etcd. Now we must pass them in (`--model-name` and `--model-path`).
### KV-aware routing
```
......@@ -254,7 +237,7 @@ The input defaults to `in=text`. The output defaults to `out=mistralrs` engine,
### mistralrs
[mistral.rs](https://github.com/EricLBuehler/mistral.rs) is a pure Rust engine that is fast to run, fast to load, supports GGUF as well as safetensors, and runs well on CPU as well as GPU. For those reasons it is the default engine.
[mistral.rs](https://github.com/EricLBuehler/mistral.rs) is a pure Rust engine that is fast to run and fast to load, and runs well on CPU as well as GPU. For those reasons it is the default engine.
```
dynamo-run Qwen/Qwen3-4B
......@@ -268,32 +251,6 @@ dynamo-run in=text out=mistralrs Qwen/Qwen3-4B
If you have multiple GPUs, `mistral.rs` does automatic tensor parallelism. You do not need to pass any extra flags to dynamo-run to enable it.
### llamacpp
[llama.cpp](https://github.com/ggml-org/llama.cpp) is built for CPU by default. For an optimized build pass the appropriate feature flag (highly recommended):
```
cargo build --features cuda|metal|vulkan -p dynamo-run
```
For GNU OpenMP support add the `openmp` feature. On Ubuntu this requires `libgomp1` (part of `build-essential`) at build and runtime.
```
cargo build --features cuda,openmp -p dynamo-run
```
```
dynamo-run out=llamacpp ~/llms/gemma-3-1b-it-q4_0.gguf
dynamo-run out=llamacpp ~/llms/Qwen3-0.6B-Q8_0.gguf # From https://huggingface.co/ggml-org
```
Note that in some cases we are unable to extract the tokenizer from the GGUF, and so a Hugging Face checkout of a matching model must also be passed. Dynamo uses the weights from the GGUF and the pre-processor (`tokenizer.json`, etc) from the `--model-config`:
```
dynamo-run out=llamacpp ~/llms/Llama-4-Scout-17B-16E-Instruct-UD-IQ1_S.gguf --context-length 32768 --model-config ~/llms/Llama-4-Scout-17B-16E-Instruct
```
If you have multiple GPUs, llama.cpp does automatic tensor parallelism. You do not need to pass any extra flags to `dynamo-run` to enable it.
### Mocker engine
The mocker engine is a mock vLLM implementation designed for testing and development purposes. It simulates realistic token generation timing without requiring actual model inference, making it useful for:
......@@ -417,7 +374,6 @@ if __name__ == "__main__":
The `model_path` can be:
- A HuggingFace repo ID, optionally prefixed with `hf://`. It is downloaded and cached locally.
- The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`.
- The path to a GGUF file, if your engine supports that.
The `model_input` can be:
- ModelInput.Tokens. Your engine expects pre-processed input (token IDs). Dynamo handles tokenization and pre-processing.
......@@ -428,7 +384,7 @@ The `model_type` can be:
- ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions).
`register_llm` can also take the following kwargs:
- `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name, the folder name, or the GGUF file name.
- `model_name`: The name to call the model. The model name in incoming HTTP requests must match this. Defaults to the Hugging Face repo name or the folder name.
- `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM.
- `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16.
- `user_data`: Optional dictionary containing custom metadata for worker behavior (e.g., LoRA configuration). Defaults to None.
......
......@@ -21,7 +21,6 @@ pub struct Flags {
/// The model. The options depend on the engine.
///
/// The full list - only mistralrs supports all three currently:
/// - Full path to a GGUF file
/// - Full path of a checked out Hugging Face repository containing safetensor files
/// - Name of a Hugging Face repository, e.g 'google/flan-t5-small'. The model will be
/// downloaded and cached.
......@@ -56,7 +55,6 @@ pub struct Flags {
/// llamacpp only
///
/// The path to the tokenizer and model config because:
/// - llama_cpp only runs GGUF files
/// - our engine is a 'core' engine in that we do the tokenization, so we need the vocab
/// - TODO: we don't yet extract that from the GGUF. Once we do we can remove this flag.
#[arg(long)]
......
......@@ -182,34 +182,8 @@ fn print_cuda(output: &Output) {
#[cfg(not(any(feature = "mistralrs", feature = "llamacpp")))]
fn print_cuda(_output: &Output) {}
fn default_engine_for(local_model: &LocalModel) -> Output {
let default_engine = if local_model.card().is_gguf() {
gguf_default()
} else {
fn default_engine_for(_local_model: &LocalModel) -> Output {
safetensors_default()
};
tracing::info!(
"Using default engine: {default_engine}. Use out=<engine> to specify one of {}",
Output::available_engines().join(", ")
);
default_engine
}
fn gguf_default() -> Output {
#[cfg(feature = "llamacpp")]
{
Output::LlamaCpp
}
#[cfg(all(feature = "mistralrs", not(feature = "llamacpp")))]
{
Output::MistralRs
}
#[cfg(not(any(feature = "mistralrs", feature = "llamacpp")))]
{
Output::Echo
}
}
fn safetensors_default() -> Output {
......
......@@ -20,8 +20,7 @@ Verbosity:
Example:
- cargo build --features cuda -p dynamo-run
- cd target/debug
- ./dynamo-run Qwen/Qwen3-0.6B
- OR: ./dynamo-run /data/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
- ./dynamo-run Qwen/Qwen3-0.6B (OR ./dynamo-run /data/hf-checkouts/Qwen3-0.6B)
See `docs/guides/dynamo_run.md` in the repo for full details.
"#;
......
......@@ -19,7 +19,6 @@ pub enum Output {
Static(String),
#[cfg(feature = "mistralrs")]
/// Run inference on a model in a GGUF file using mistralrs w/ candle
MistralRs,
#[cfg(feature = "llamacpp")]
......
......@@ -1333,15 +1333,6 @@ dependencies = [
"crypto-common",
]
[[package]]
name = "digit-layout"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09157630eece4139f6cc5a457556d308c3465ecd5af492f0e5aadc043997e2ce"
dependencies = [
"half",
]
[[package]]
name = "dircpy"
version = "0.3.19"
......@@ -1496,12 +1487,10 @@ dependencies = [
"futures",
"futures-util",
"galil-seiferas",
"ggus",
"hf-hub",
"humantime",
"itertools 0.14.0",
"json-five",
"memmap2",
"minijinja",
"minijinja-contrib",
"modelexpress-client",
......@@ -2374,30 +2363,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "ggml-quants"
version = "0.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a27693512784e0786212eb0bef841779a6337d2d04520ed475b4d5a864f98366"
dependencies = [
"digit-layout",
"half",
"rayon",
]
[[package]]
name = "ggus"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ac5654356c6f7f6116905aeaf92ab002c3d03414ada5dbe0bb2e32aa5fea173"
dependencies = [
"fancy-regex 0.14.0",
"ggml-quants",
"indexmap 2.11.0",
"log",
"num_enum",
]
[[package]]
name = "gif"
version = "0.13.3"
......
......@@ -135,10 +135,6 @@ minijinja = { version = "2.10.2", features = ["loader"] }
minijinja-contrib = { version = "2.10.2", features = ["pycompat"] }
json-five = { version = "0.3" }
# GGUF
ggus = "0.4.0"
memmap2 = "0.9.5"
# Publishers
zeromq = "0.4.1"
rmp-serde = "1.3"
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
// Adapted from mistral.rs
//
// MIT License
//
// Copyright (c) 2025 Eric Buehler
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
mod content;
mod gguf_metadata;
mod gguf_tokenizer;
use strum::EnumString;
use anyhow::{Context, Result};
pub(crate) use content::Content;
pub(crate) use gguf_metadata::ContentConfig;
pub use gguf_metadata::ModelConfigLike;
pub(crate) use gguf_tokenizer::convert_gguf_to_hf_tokenizer;
use std::str::FromStr;
pub const GGUF_MULTI_FILE_DELIMITER: &str = " ";
/// Model architectures that can be named in GGUF metadata.
///
/// `EnumString` and `strum::Display` are derived with `serialize_all = "lowercase"`,
/// so every variant parses from, and prints as, its lowercase name
/// (for example `Llama` <-> `"llama"`).
#[derive(Debug, EnumString, Clone, Copy, strum::Display)]
#[strum(serialize_all = "lowercase")]
pub enum GGUFArchitecture {
Llama,
Mpt,
Gptneox,
Gptj,
Gpt2,
Bloom,
Falcon,
Mamba,
Rwkv,
Phi2,
Phi3,
Starcoder2,
Qwen2,
Qwen3,
Gemma3,
Granite,
}
// Convenience wrapper around `from_str()`:
// - Matching ignores ASCII case (TODO: is this desirable?)
// - The error is customized until upstream offers it: https://github.com/Peternator7/strum/issues/332
impl GGUFArchitecture {
    /// Parse an architecture name, ignoring ASCII case.
    ///
    /// # Errors
    /// Returns an error whose message is ``Unknown GGUF architecture `<value>` `` when
    /// the lowercased input does not match any variant.
    pub fn from_value<T: AsRef<str> + std::fmt::Display>(value: T) -> Result<Self> {
        // Variants are serialized lowercase, so normalise the input before matching.
        let lowered = value.as_ref().to_ascii_lowercase();
        let parsed = Self::from_str(&lowered);
        parsed
            .with_context(|| format!("Unknown GGUF architecture `{value}`"))
            .map_err(anyhow::Error::msg)
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
// Adapted from mistral.rs
//
// MIT License
//
// Copyright (c) 2025 Eric Buehler
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
use std::collections::HashMap;
use anyhow::Context;
use candle_core::{
Result,
quantized::gguf_file::{self, Value},
};
use tracing::info;
use super::GGUFArchitecture;
// Internal invariant: contents and readers must be paired.
/// This abstracts the files for a GGUF model and enables multiple files to be used.
pub struct Content {
// One parsed `gguf_file::Content` per reader passed to `from_readers`, in reader order.
// Stored but not read anywhere in the visible code (hence the leading underscore).
_contents: Vec<gguf_file::Content>,
// Architecture parsed from the `general.architecture` metadata key.
arch: GGUFArchitecture,
// Metadata of every file merged into one map; on a key clash the later file wins.
all_metadata: HashMap<String, Value>,
}
impl Content {
    /// Create a `Content` from a set of file readers, one reader per GGUF file.
    ///
    /// Every reader is parsed as a GGUF file. If any file declares `split.count`,
    /// all declared values must agree and must equal the number of readers.
    /// The architecture is taken from `general.architecture`; when several files
    /// declare it, the last one wins. The metadata of all files is merged into a
    /// single map (later files override earlier ones on a key clash).
    ///
    /// # Errors
    /// Returns an error (instead of panicking) when:
    /// - a reader does not contain a parsable GGUF file,
    /// - `split.count` cannot be read as a `u64`, differs between files, or does
    ///   not match the number of readers,
    /// - `general.architecture` is missing from every file, is not a string, or
    ///   names an unknown architecture.
    pub fn from_readers<R: std::io::Seek + std::io::Read>(readers: &mut [&mut R]) -> Result<Self> {
        let n_readers = readers.len();
        let mut contents = Vec::with_capacity(n_readers);
        for reader in readers.iter_mut() {
            contents.push(gguf_file::Content::read(reader)?);
        }

        // Distinct `split.count` values across all files, in first-seen order.
        let mut n_splits: Vec<u64> = Vec::new();
        for ct in &contents {
            if let Some(val) = ct.metadata.get("split.count") {
                // The value comes straight from the file, so a wrong type is an
                // input error rather than a broken invariant.
                let count = match val.to_u64() {
                    Ok(count) => count,
                    Err(e) => {
                        candle_core::bail!("GGUF metadata `split.count` is not a valid u64: {e}")
                    }
                };
                if !n_splits.contains(&count) {
                    n_splits.push(count);
                }
            }
        }
        if n_splits.len() > 1 {
            candle_core::bail!(
                "GGUF files have differing `split.count` values: {n_splits:?}. Perhaps the GGUF files do not match?"
            );
        }
        if let Some(&expected) = n_splits.first() {
            // A count that does not fit in `usize` can never equal the number of
            // readers, so a failed conversion is treated as a mismatch.
            if usize::try_from(expected).ok() != Some(n_readers) {
                candle_core::bail!(
                    "Number of GGUF files does not match the number of splits, expected {} files.",
                    expected
                );
            }
            info!("GGUF file has been split into {} shards", expected);
        }

        // The last file that declares an architecture decides it.
        let mut arch = None;
        for ct in &contents {
            if let Some(value) = ct.metadata.get("general.architecture") {
                let parsed = value
                    .to_string()
                    .context("Model metadata should have declared an architecture")
                    .and_then(GGUFArchitecture::from_value);
                match parsed {
                    Ok(found) => arch = Some(found),
                    // `{e:#}` keeps the whole anyhow context chain in the message.
                    Err(e) => candle_core::bail!("{e:#}"),
                }
            }
        }
        let arch = match arch {
            Some(arch) => arch,
            None => candle_core::bail!("GGUF files must specify `general.architecture`"),
        };

        let mut all_metadata = HashMap::new();
        for content in &contents {
            all_metadata.extend(content.metadata.clone());
        }

        Ok(Self {
            _contents: contents,
            arch,
            all_metadata,
        })
    }

    /// The architecture declared by the GGUF files.
    pub fn arch(&self) -> GGUFArchitecture {
        self.arch
    }

    /// The merged metadata of all files.
    pub fn get_metadata(&self) -> &HashMap<String, Value> {
        &self.all_metadata
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
// Adapted from mistral.rs
//
// MIT License
//
// Copyright (c) 2025 Eric Buehler
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
use akin::akin;
use anyhow::Result;
use anyhow::ensure;
use candle_core::quantized::gguf_file;
use std::collections::HashMap;
use tracing::warn;
use crate::gguf::Content;
/// Accessors for a model's configuration values.
///
/// In this file the trait is implemented by `ContentConfig`, which fills the
/// values from GGUF metadata.
pub trait ModelConfigLike {
fn max_seq_len(&self) -> usize;
fn num_layers(&self) -> usize;
fn hidden_size(&self) -> usize;
fn num_kv_heads(&self) -> usize;
fn num_attn_heads(&self) -> usize;
fn k_head_dim(&self) -> usize;
fn v_head_dim(&self) -> usize;
}
/// Model configuration values read from GGUF metadata (see `From<&Content>`).
///
/// In the key names below, `{arch}` is the string stored under `general.architecture`.
#[allow(dead_code)]
#[derive(Debug)]
pub struct ContentConfig {
// From `{arch}.context_length`.
max_seq_len: usize,
// From `{arch}.embedding_length`.
hidden_size: usize,
// From `{arch}.attention.head_count`.
num_attn_heads: usize,
// From `{arch}.attention.head_count_kv`.
num_kv_heads: usize,
// From `{arch}.block_count`.
num_layers: usize,
// From `{arch}.attention.key_length`; `None` when the key is absent.
key_length: Option<usize>,
// From `{arch}.attention.value_length`; `None` when the key is absent.
value_length: Option<usize>,
}
#[allow(clippy::cast_possible_truncation)]
impl From<&Content> for ContentConfig {
    /// Read the model configuration out of the merged GGUF metadata.
    ///
    /// # Panics
    /// Panics if `general.architecture` or any mandatory `{arch}.*` key is missing,
    /// or if a value that is present does not have the expected type.
    fn from(value: &Content) -> Self {
        let metadata = value.get_metadata();
        let arch = metadata["general.architecture"].to_string().unwrap();

        // Every key is namespaced by the architecture name.
        let key_for = |suffix: &str| format!("{arch}.{suffix}");
        // Mandatory entry: indexing panics when the key is absent.
        let required = |suffix: &str| metadata[&key_for(suffix)].to_u64().unwrap() as usize;
        // Optional entry: absent keys become `None`.
        let optional = |suffix: &str| {
            metadata
                .get(&key_for(suffix))
                .map(|found| found.to_u64().unwrap() as usize)
        };

        // Read in field order so the first failing lookup is the same as before.
        let max_seq_len = required("context_length");
        let hidden_size = required("embedding_length");
        let num_attn_heads = required("attention.head_count");
        let num_kv_heads = required("attention.head_count_kv");
        let num_layers = required("block_count");
        let key_length = optional("attention.key_length");
        let value_length = optional("attention.value_length");

        Self {
            max_seq_len,
            hidden_size,
            num_attn_heads,
            num_kv_heads,
            num_layers,
            key_length,
            value_length,
        }
    }
}
/// Plain accessors over the values read from GGUF metadata.
impl ModelConfigLike for ContentConfig {
    fn max_seq_len(&self) -> usize {
        self.max_seq_len
    }

    fn hidden_size(&self) -> usize {
        self.hidden_size
    }

    fn num_attn_heads(&self) -> usize {
        self.num_attn_heads
    }

    fn num_kv_heads(&self) -> usize {
        self.num_kv_heads
    }

    fn num_layers(&self) -> usize {
        self.num_layers
    }

    /// The explicit key length when present, otherwise `hidden_size / num_attn_heads`.
    ///
    /// The fallback is computed lazily, so a zero `num_attn_heads` only causes a
    /// divide-by-zero panic when the fallback is actually needed.
    fn k_head_dim(&self) -> usize {
        self.key_length
            .unwrap_or_else(|| self.hidden_size / self.num_attn_heads)
    }

    /// The explicit value length when present, otherwise `hidden_size / num_attn_heads`.
    ///
    /// The fallback is computed lazily, so a zero `num_attn_heads` only causes a
    /// divide-by-zero panic when the fallback is actually needed.
    fn v_head_dim(&self) -> usize {
        self.value_length
            .unwrap_or_else(|| self.hidden_size / self.num_attn_heads)
    }
}
/// A borrowed view over GGUF metadata in which every lookup is scoped by a key prefix.
pub struct ContentMetadata<'a> {
// Prefix joined to a field name with `.` to form the full metadata key.
pub path_prefix: &'a str,
// The metadata map that lookups are made against.
pub metadata: &'a HashMap<String, gguf_file::Value>,
}
impl ContentMetadata<'_> {
// Retrieve a prop the struct needs by querying the metadata content:
pub fn get_value<T: TryFromValue>(&self, field_name: &str) -> Result<T, anyhow::Error> {
let prop_key = format!("{prefix}.{field_name}", prefix = self.path_prefix);
let value = self.metadata.get(&prop_key).cloned();
// Unwrap the inner value of the `Value` enum via trait method,
// otherwise format error with prop key as context:
value
.try_value_into()
.or_else(|e| anyhow::bail!("`{prop_key}` `{e}`"))
}
// Fail early - Catch all missing mandatory keys upfront:
pub fn has_required_keys(&self, fields: &[&str]) -> Result<()> {
let mut all_props_are_present = true;
for field_name in fields {
let prop_key = format!("{prefix}.{field_name}", prefix = self.path_prefix);
if !self.metadata.contains_key(&prop_key) {
all_props_are_present = false;
warn!("Expected GGUF metadata to have key: `{prop_key}`");
}
}
ensure!(all_props_are_present, "Tokenizer is missing required props");
Ok(())
}
}
// These traits below are a workaround for converting candles GGUF `Value` enum type wrapper.
// A better upstream approach would instead be to provide serialize/deserialize support?
/// Fallible conversion from an owned `gguf_file::Value` into a concrete Rust type.
pub trait TryFromValue {
/// Consume `value` and return it as `Self`, or a `candle_core::Error` on failure.
fn try_from_value(value: gguf_file::Value) -> Result<Self, candle_core::Error>
where
Self: Sized;
}
// Value wrapped types, each has a different conversion method:
// NOTE: Type conversion methods internally bail with "not a <into type> <input value>"
// https://docs.rs/candle-core/latest/candle_core/quantized/gguf_file/enum.Value.html#variants
//
// The two lists below are parallel: the n-th entry of `types` goes with the n-th
// entry of `to_type`. NOTE(review): `akin!` presumably stamps out one
// `impl TryFromValue` per pair - confirm against the akin crate docs.
// Whatever error the accessor returns is replaced by "value is not a `<type>`".
akin! {
let &types = [String, bool, f32, f64, i8, i16, i32, i64, u8, u16, u32, u64];
let &to_type = [
value.to_string().cloned(),
value.to_bool(),
value.to_f32(),
value.to_f64(),
value.to_i8(),
value.to_i16(),
value.to_i32(),
value.to_i64(),
value.to_u8(),
value.to_u16(),
value.to_u32(),
value.to_u64(),
];
impl TryFromValue for *types {
fn try_from_value(value: gguf_file::Value) -> Result<Self, candle_core::Error> {
*to_type.or_else(|_| candle_core::bail!("value is not a `*types`"))
}
}
}
// Vec<Value> to Vec<T>, for any element type `T` that is itself convertible:
impl<T: TryFromValue> TryFromValue for Vec<T> {
    /// Convert an array value element by element.
    ///
    /// Fails with "value is not a `Vec`" when the value is not an array, or with
    /// the first element's conversion error.
    fn try_from_value(value_vec: gguf_file::Value) -> Result<Self, candle_core::Error> {
        let items = match value_vec.to_vec() {
            Ok(items) => items,
            Err(_) => candle_core::bail!("value is not a `Vec`"),
        };
        // Element conversion takes ownership, so each borrowed element is cloned.
        items.iter().cloned().map(T::try_from_value).collect()
    }
}
/// Method-call counterpart of `TryFromValue`: lets a value be converted with
/// `value.try_value_into()`, consuming `self`.
pub trait TryValueInto<T>: Sized {
/// Consume `self` and return it as `T`, or a `candle_core::Error` on failure.
fn try_value_into(self) -> Result<T, candle_core::Error>;
}
/// A `gguf_file::Value` converts by delegating to the target type's `TryFromValue` impl.
impl<T: TryFromValue> TryValueInto<T> for gguf_file::Value {
    fn try_value_into(self) -> Result<T, candle_core::Error> {
        <T as TryFromValue>::try_from_value(self)
    }
}
/// An optional value converts like the value it holds; `None` is an error.
impl<T: TryFromValue> TryValueInto<T> for Option<gguf_file::Value> {
    fn try_value_into(self) -> Result<T, candle_core::Error> {
        if let Some(inner) = self {
            return inner.try_value_into();
        }
        candle_core::bail!("Expected `Option<gguf_file::Value>` to contain a value")
    }
}
This diff is collapsed.
......@@ -1061,8 +1061,8 @@ async fn list_models_openai(
data.push(ModelListing {
id: model_name.clone(),
object: "object",
created, // Where would this come from? The GGUF?
owned_by: "nvidia".to_string(), // Get organization from GGUF
created, // Where would this come from?
owned_by: "nvidia".to_string(), // Get organization from config
});
}
......
......@@ -17,7 +17,6 @@ pub mod discovery;
pub mod endpoint_type;
pub mod engines;
pub mod entrypoint;
pub mod gguf;
pub mod grpc;
pub mod http;
pub mod hub;
......
......@@ -192,7 +192,6 @@ impl LocalModelBuilder {
///
/// The model name will depend on what "model_path" is:
/// - A folder: The last part of the folder name: "/data/llms/Qwen2.5-3B-Instruct" -> "Qwen2.5-3B-Instruct"
/// - A file: The GGUF filename: "/data/llms/Qwen2.5-3B-Instruct-Q6_K.gguf" -> "Qwen2.5-3B-Instruct-Q6_K.gguf"
/// - An HF repo: The HF repo name: "Qwen/Qwen3-0.6B" stays the same
pub async fn build(&mut self) -> anyhow::Result<LocalModel> {
// Generate an endpoint ID for this model if the user didn't provide one.
......@@ -379,12 +378,6 @@ impl LocalModel {
self.namespace.as_deref()
}
pub fn is_gguf(&self) -> bool {
// GGUF is the only file (not-folder) we accept, so we don't need to check the extension
// We will error when we come to parse it
self.full_path.is_file()
}
/// An endpoint to identify this model by.
pub fn endpoint_id(&self) -> &EndpointId {
&self.endpoint_id
......
......@@ -13,11 +13,10 @@
//! - Prompt formatter settings (PromptFormatterArtifact)
use std::fmt;
use std::fs::File;
use std::path::{Path, PathBuf};
use std::sync::{Arc, OnceLock};
use crate::common::checked_file::{CheckedFile, Checksum};
use crate::common::checked_file::CheckedFile;
use crate::local_model::runtime_config::ModelRuntimeConfig;
use crate::model_type::{ModelInput, ModelType};
use anyhow::{Context, Result};
......@@ -30,7 +29,6 @@ use dynamo_runtime::{slug::Slug, storage::key_value_store::Versioned, transports
use serde::{Deserialize, Serialize};
use tokenizers::Tokenizer as HfTokenizer;
use crate::gguf::{Content, ContentConfig, ModelConfigLike};
use crate::protocols::TokenIdType;
/// Identify model deployment cards in the key-value store
......@@ -40,14 +38,12 @@ pub const ROOT_PATH: &str = "v1/mdc";
#[serde(rename_all = "snake_case")]
pub enum ModelInfoType {
HfConfigJson(CheckedFile),
GGUF(PathBuf),
}
impl ModelInfoType {
pub fn checksum(&self) -> String {
match self {
ModelInfoType::HfConfigJson(c) => c.checksum().to_string(),
ModelInfoType::GGUF(_) => Checksum::default().to_string(),
}
}
}
......@@ -56,14 +52,12 @@ impl ModelInfoType {
#[serde(rename_all = "snake_case")]
pub enum TokenizerKind {
HfTokenizerJson(CheckedFile),
GGUF(Box<HfTokenizer>),
}
impl TokenizerKind {
    /// Checksum of the tokenizer source, rendered as a string.
    ///
    /// A file-backed tokenizer reports the checksum of its [`CheckedFile`]. A tokenizer
    /// built from a GGUF file has no checked file, so it reports the default [`Checksum`].
    pub fn checksum(&self) -> String {
        match self {
            Self::GGUF(_) => Checksum::default().to_string(),
            Self::HfTokenizerJson(checked) => checked.checksum().to_string(),
        }
    }
}
......@@ -85,7 +79,6 @@ impl TokenizerKind {
pub enum PromptFormatterArtifact {
/// A checked file holding tokenizer configuration
/// (presumably `tokenizer_config.json` — TODO confirm against the loader).
HfTokenizerConfigJson(CheckedFile),
/// A checked file holding a chat template
/// (NOTE(review): exact file name/format is not visible here — confirm).
HfChatTemplate(CheckedFile),
/// Path to a GGUF file; `ModelDeploymentCard::from_gguf` stores the file's path here.
GGUF(PathBuf),
}
impl PromptFormatterArtifact {
......@@ -93,7 +86,6 @@ impl PromptFormatterArtifact {
match self {
PromptFormatterArtifact::HfTokenizerConfigJson(c) => c.checksum().to_string(),
PromptFormatterArtifact::HfChatTemplate(c) => c.checksum().to_string(),
PromptFormatterArtifact::GGUF(_) => Checksum::default().to_string(),
}
}
}
......@@ -112,14 +104,12 @@ pub enum PromptContextMixin {
#[serde(rename_all = "snake_case")]
pub enum GenerationConfig {
/// Generation settings kept as a file wrapped in a `CheckedFile`
/// (presumably `generation_config.json` — TODO confirm against the loader).
HfGenerationConfigJson(CheckedFile),
/// Path to a GGUF file. NOTE(review): `ModelDeploymentCard::from_gguf` leaves
/// `gen_config` as `None`, so it is unclear where this variant is constructed — confirm.
GGUF(PathBuf),
}
impl GenerationConfig {
    /// Checksum of the generation-config source, rendered as a string.
    ///
    /// A file-backed config reports the checksum of its [`CheckedFile`]. A GGUF entry
    /// only carries a path, so it reports the default [`Checksum`].
    pub fn checksum(&self) -> String {
        match self {
            Self::GGUF(_) => Checksum::default().to_string(),
            Self::HfGenerationConfigJson(checked) => checked.checksum().to_string(),
        }
    }
}
......@@ -308,20 +298,12 @@ impl ModelDeploymentCard {
.map_err(anyhow::Error::msg)
.with_context(|| p.display().to_string())
}
Some(TokenizerKind::GGUF(t)) => Ok(*t.clone()),
None => {
anyhow::bail!("Blank ModelDeploymentCard does not have a tokenizer");
}
}
}
/// Whether this card's model info refers to a GGUF file.
///
/// A card without any model info is reported as not GGUF.
pub fn is_gguf(&self) -> bool {
    self.model_info
        .as_ref()
        .is_some_and(|info| info.is_gguf())
}
/// Move the files this MDC uses into the NATS object store.
/// Updates the URI's to point to NATS.
pub async fn move_to_nats(&mut self, nats_client: nats::Client) -> Result<()> {
......@@ -451,24 +433,14 @@ impl ModelDeploymentCard {
self.slug = Slug::from_string(name);
}
/// Build an in-memory ModelDeploymentCard from either:
/// - a folder containing config.json, tokenizer.json and tokenizer_config.json
/// - a GGUF file
/// With an optional custom template
/// Build an in-memory ModelDeploymentCard from a folder containing config.json,
/// tokenizer.json and tokenizer_config.json (i.e. a huggingface repo checkout).
/// Optional custom template.
// NOTE(review): two descriptions and two bodies are present here at once: the first
// dispatches on directory vs. GGUF file, the second always calls `from_local_path`.
// This looks like both sides of a change shown together — confirm which one is current
// before relying on either description.
pub fn load_from_disk(
config_path: impl AsRef<Path>,
custom_template_path: Option<&Path>,
) -> anyhow::Result<ModelDeploymentCard> {
let config_path = config_path.as_ref();
// A directory is loaded via `from_local_path`; anything else is treated as a GGUF file.
if config_path.is_dir() {
Self::from_local_path(config_path, custom_template_path)
} else {
// GGUF files don't support custom templates yet
if custom_template_path.is_some() {
anyhow::bail!("Custom templates are not supported for GGUF files");
}
Self::from_gguf(config_path)
}
// Folder-only variant: no GGUF branch, the path is always handed to `from_local_path`.
Self::from_local_path(config_path.as_ref(), custom_template_path)
}
pub fn requires_preprocessing(&self) -> bool {
......@@ -531,47 +503,6 @@ impl ModelDeploymentCard {
Self::from_repo(&repo_id, model_name, custom_template_path)
}
fn from_gguf(gguf_file: &Path) -> anyhow::Result<Self> {
let model_name = gguf_file
.iter()
.next_back()
.map(|n| n.to_string_lossy().to_string());
let Some(model_name) = model_name else {
// I think this would only happy on an empty path
anyhow::bail!(
"Could not extract model name from path '{}'",
gguf_file.display()
);
};
// TODO: we do this in HFConfig also, unify
let content = load_gguf(gguf_file)?;
let context_length = content.get_metadata()[&format!("{}.context_length", content.arch())]
.to_u32()
.unwrap_or(0);
tracing::debug!(context_length, "Loaded context length from GGUF");
Ok(Self {
display_name: model_name.to_string(),
slug: Slug::from_string(model_name),
model_info: Some(ModelInfoType::GGUF(gguf_file.to_path_buf())),
tokenizer: Some(TokenizerKind::from_gguf(gguf_file)?),
gen_config: None, // AFAICT there is no equivalent in a GGUF
prompt_formatter: Some(PromptFormatterArtifact::GGUF(gguf_file.to_path_buf())),
chat_template_file: None,
prompt_context: None, // TODO - auto-detect prompt context
context_length,
kv_cache_block_size: 0,
migration_limit: 0,
model_type: Default::default(), // set later
model_input: Default::default(), // set later
user_data: None,
runtime_config: ModelRuntimeConfig::default(),
cache_dir: None,
checksum: OnceLock::new(),
})
}
fn from_repo(
repo_id: &str,
model_name: &str,
......@@ -686,12 +617,8 @@ impl ModelInfoType {
};
Ok(HFConfig::from_json_file(path)?)
}
Self::GGUF(path) => Ok(HFConfig::from_gguf(path)?),
}
}
/// Whether this model info points at a GGUF file.
pub fn is_gguf(&self) -> bool {
    // Spelled out per variant so a newly added variant forces a decision here.
    match self {
        Self::GGUF(_) => true,
        Self::HfConfigJson(_) => false,
    }
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
......@@ -822,44 +749,6 @@ impl HFConfig {
Ok(Arc::new(config))
}
/// Build model info from the metadata of a GGUF file.
///
/// Reads `<arch>.block_count`, `tokenizer.ggml.bos_token_id`,
/// `tokenizer.ggml.eos_token_id` and `tokenizer.ggml.tokens` from the metadata, plus the
/// sequence length and attention head count exposed through `ContentConfig`.
///
/// # Errors
///
/// Fails if the file cannot be loaded, if one of the metadata keys above is missing, or
/// if a value has an unexpected type.
fn from_gguf(gguf_file: &Path) -> Result<Arc<dyn ModelInfo>> {
    let content = load_gguf(gguf_file)?;
    let model_config_metadata: ContentConfig = (&content).into();

    let metadata = content.get_metadata();
    // Checked lookup: indexing the metadata map would panic on a file that lacks the key,
    // so report a missing key as an ordinary error that names the key and the file.
    let required = |key: &str| {
        metadata.get(key).with_context(|| {
            format!(
                "GGUF metadata key '{key}' is missing in '{}'",
                gguf_file.display()
            )
        })
    };

    let block_count_key = format!("{}.block_count", content.arch());
    let num_hidden_layers = required(&block_count_key)?.to_u32()? as usize;

    let bos_token_id = required("tokenizer.ggml.bos_token_id")?.to_u32()?;
    let eos_token_id = required("tokenizer.ggml.eos_token_id")?.to_u32()?;

    // to_vec returns a Vec that's already there, so it's cheap
    let vocab_size = required("tokenizer.ggml.tokens")?.to_vec()?.len();

    let arch = content.arch().to_string();
    Ok(Arc::new(HFConfig {
        architectures: vec![format!("{}ForCausalLM", capitalize(&arch))],
        // "general.architecture"
        model_type: arch,
        text_config: Some(HFTextConfig {
            bos_token_id: None,
            final_bos_token_id: bos_token_id,

            eos_token_id: None,
            final_eos_token_ids: vec![eos_token_id],

            // "llama.context_length"
            max_position_embeddings: Some(model_config_metadata.max_seq_len()),
            // "llama.block_count"
            num_hidden_layers,
            // "llama.attention.head_count"
            num_attention_heads: Some(model_config_metadata.num_attn_heads()),
            // "tokenizer.ggml.tokens".len()
            vocab_size: Some(vocab_size),
        }),
        eos_token_id: None,
    }))
}
}
impl ModelInfo for HFConfig {
......@@ -888,31 +777,6 @@ impl ModelInfo for HFConfig {
}
}
impl TokenizerKind {
    /// Load `gguf_file` and convert the tokenizer it carries into an in-memory tokenizer.
    ///
    /// # Errors
    ///
    /// Fails if the file cannot be loaded or if the conversion fails; a conversion error
    /// is annotated with the file path.
    pub fn from_gguf(gguf_file: &Path) -> anyhow::Result<Self> {
        let gguf = load_gguf(gguf_file)?;
        let converted = crate::gguf::convert_gguf_to_hf_tokenizer(&gguf)
            .with_context(|| gguf_file.display().to_string())?;
        let tokenizer = Box::new(converted.tokenizer);
        Ok(Self::GGUF(tokenizer))
    }
}
/// Open `gguf_file` and parse it into a [`Content`].
///
/// # Errors
///
/// Fails if the file cannot be opened or parsed; either error is annotated with the
/// file path.
pub(crate) fn load_gguf(gguf_file: &Path) -> anyhow::Result<Content> {
    // Only rendered when an error actually needs the context.
    let path_context = || gguf_file.display().to_string();

    let mut file = File::open(gguf_file).with_context(path_context)?;
    // vec because GGUF can be split into multiple files (shards)
    let mut shard_readers = vec![&mut file];
    crate::gguf::Content::from_readers(&mut shard_readers).with_context(path_context)
}
/// Upper-case the first character of `s` and lower-case everything after it.
///
/// An empty input yields an empty string.
fn capitalize(s: &str) -> String {
    let mut rest = s.chars();
    let Some(head) = rest.next() else {
        return String::new();
    };

    let mut out = String::with_capacity(s.len());
    // Upper-casing one char can produce several chars, hence `extend`.
    out.extend(head.to_uppercase());
    out.push_str(&rest.as_str().to_lowercase());
    out
}
impl ModelInfoType {
pub fn from_repo(repo_id: &str) -> Result<Self> {
let f = CheckedFile::from_disk(PathBuf::from(repo_id).join("config.json"))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment