feat(dynamo-run): Use llama.cpp as the default engine for GGUF (#1276)

Previously `mistral.rs` was the default engine for both safetensors and GGUF models. Now it is the default only for safetensors; `llama.cpp` becomes the default for GGUF.

Why?
- Since #1177 `llama.cpp` is built in by default, so we can switch.
- `llama.cpp` is very, very good at running GGUF (but can't run other types of model), so we should switch.

Dynamo's multi-engine support gives us a secret super-power: we can use the best engine for each specific format or model. We can still run GGUF with mistralrs by passing `out=mistralrs`.

feat(dynamo-run): Use llama.cpp as the default engine for GGUF (#1276)
Previously `mistral.rs` was the default engine for both safetensors and GGUF models. Now it is the default only for safetensors; `llama.cpp` becomes the default for GGUF.

Why?
- Since #1177 `llama.cpp` is built in by default, so we can switch.
- `llama.cpp` is very, very good at running GGUF (but can't run other types of model), so we should switch.

Dynamo's multi-engine support gives us a secret super-power: we can use the best engine for each specific format or model. We can still run GGUF with mistralrs by passing `out=mistralrs`.
3e3c3b10 · Graham King · GitHub · f9ba6f5c · 3e3c3b10 · 3e3c3b10
Unverified Commit 3e3c3b10 authored May 29, 2025 by Graham King Committed by GitHub May 29, 2025
5 changed files
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -41,13 +41,21 @@ pub enum EngineConfig {
    },
 }

+/// Returns `true` when the input is an `Input::Endpoint`, whatever the endpoint value is.
+fn is_in_dynamic(in_opt: &Input) -> bool {
+    matches!(in_opt, Input::Endpoint(_))
+}
+
+/// Returns `true` only when an output was given and it is `Output::Dynamic`.
+/// A missing output (`None`) and every other engine yield `false`.
+fn is_out_dynamic(out_opt: &Option<Output>) -> bool {
+    matches!(out_opt, Some(Output::Dynamic))
+}
+
 pub async fn run(
    runtime: dynamo_runtime::Runtime,
    in_opt: Input,
-    out_opt: Output,
+    out_opt: Option<Output>,
    flags: Flags,
 ) -> anyhow::Result<()> {
-    if matches!(&in_opt, Input::Endpoint(_)) && matches!(&out_opt, Output::Dynamic) {
+    if is_in_dynamic(&in_opt) && is_out_dynamic(&out_opt) {
        anyhow::bail!("Cannot use endpoint for both in and out");
    }

@@ -57,13 +65,12 @@ pub async fn run(
        .clone()
        .or(flags.model_path_flag.clone());

-    let mut local_model: LocalModel = match out_opt {
+    let mut local_model: LocalModel = if is_out_dynamic(&out_opt) {
        // If output is dynamic we are ingress and don't have a local model, but making an
        // empty one cleans up the code.
-        Output::Dynamic => Default::default(),
-
+        Default::default()
+    } else {
        // All other output types have a local model
-        _ => {
        match &maybe_path {
            Some(model_path) => {
                LocalModel::prepare(
@@ -81,7 +88,6 @@ pub async fn run(
                }
            }
        }
-        }
    };

    // Only set if user provides. Usually loaded from tokenizer_config.json
@@ -108,6 +114,20 @@ pub async fn run(
    // We may need it later
    let card = local_model.card().clone();

+    let out_opt = out_opt.unwrap_or_else(|| {
+        let default_engine = if card.is_gguf() {
+            Output::LlamaCpp
+        } else {
+            Output::MistralRs
+        };
+        tracing::info!(
+            "Using default engine: {default_engine}. Use out=<engine> to specify one of {}",
+            Output::available_engines().join(", ")
+        );
+        default_engine
+    });
+    print_cuda(&out_opt);
+
    // Create the engine matching `out`
    let engine_config = match out_opt {
        Output::Dynamic => {
@@ -344,3 +364,39 @@ async fn stopper(
    // Keep it alive until the engine has stopped.
    drop(py_script);
 }
+
+/// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it.
+/// If they have it, celebrate!
+///
+/// Logs one `info` line per enabled accelerator feature (`cuda`, `metal`, `vulkan`), or a
+/// "CPU mode" hint when none of them is enabled. Logs nothing unless `output` is
+/// `Output::MistralRs` or `Output::LlamaCpp` (each arm only exists when its feature is on).
+// Only mistralrs and llamacpp need to be built with CUDA.
+// The Python engines only need it at runtime.
+#[cfg(any(feature = "mistralrs", feature = "llamacpp"))]
+fn print_cuda(output: &Output) {
+    // These engines may be compiled in, but are they the chosen one?
+    match output {
+        #[cfg(feature = "mistralrs")]
+        Output::MistralRs => {}
+        #[cfg(feature = "llamacpp")]
+        Output::LlamaCpp => {}
+        _ => {
+            // Any other engine: nothing to report.
+            return;
+        }
+    }
+
+    // The accelerator features are checked independently, so more than one line can be logged.
+    #[cfg(feature = "cuda")]
+    {
+        tracing::info!("CUDA on");
+    }
+    #[cfg(feature = "metal")]
+    {
+        tracing::info!("Metal on");
+    }
+    #[cfg(feature = "vulkan")]
+    {
+        tracing::info!("Vulkan on");
+    }
+    // Compiled in only when no accelerator feature is enabled.
+    #[cfg(not(any(feature = "cuda", feature = "metal", feature = "vulkan")))]
+    tracing::info!("CPU mode. Rebuild with `--features cuda|metal|vulkan` for better performance");
+}
+
+/// No-op variant used when neither the `mistralrs` nor the `llamacpp` feature is enabled,
+/// so the call site compiles without its own `cfg` guard.
+#[cfg(not(any(feature = "mistralrs", feature = "llamacpp")))]
+fn print_cuda(_output: &Output) {}
--- a/launch/dynamo-run/src/main.rs
+++ b/launch/dynamo-run/src/main.rs
@@ -103,21 +103,9 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
        }
        None => Input::default(),
    };
-    let out_opt = match out_opt {
-        Some(x) => {
+    if out_opt.is_some() {
        non_flag_params += 1;
-            x
-        }
-        None => {
-            let default_engine = Output::default(); // smart default based on feature flags
-            tracing::info!(
-                "Using default engine: {default_engine}. Use out=<engine> to specify one of {}",
-                Output::available_engines().join(", ")
-            );
-            default_engine
    }
-    };
-    print_cuda(&out_opt);

    // Clap skips the first argument expecting it to be the binary name, so add it back
    // Note `--model-path` has index=1 (in lib.rs) so that doesn't need a flag.
@@ -129,39 +117,3 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {

    dynamo_run::run(runtime, in_opt, out_opt, flags).await
 }
-
-/// If the user will benefit from CUDA/Metal/Vulkan, remind them to build with it.
-/// If they have it, celebrate!
-// Only mistralrs and llamacpp need to be built with CUDA.
-// The Python engines only need it at runtime.
-#[cfg(any(feature = "mistralrs", feature = "llamacpp"))]
-fn print_cuda(output: &Output) {
-    // These engines maybe be compiled in, but are they the chosen one?
-    match output {
-        #[cfg(feature = "mistralrs")]
-        Output::MistralRs => {}
-        #[cfg(feature = "llamacpp")]
-        Output::LlamaCpp => {}
-        _ => {
-            return;
-        }
-    }
-
-    #[cfg(feature = "cuda")]
-    {
-        tracing::info!("CUDA on");
-    }
-    #[cfg(feature = "metal")]
-    {
-        tracing::info!("Metal on");
-    }
-    #[cfg(feature = "vulkan")]
-    {
-        tracing::info!("Vulkan on");
-    }
-    #[cfg(not(any(feature = "cuda", feature = "metal", feature = "vulkan")))]
-    tracing::info!("CPU mode. Rebuild with `--features cuda|metal|vulkan` for better performance");
-}
-
-#[cfg(not(any(feature = "mistralrs", feature = "llamacpp")))]
-fn print_cuda(_output: &Output) {}
--- a/launch/dynamo-run/src/opt.rs
+++ b/launch/dynamo-run/src/opt.rs
@@ -165,23 +165,6 @@ impl fmt::Display for Output {
    }
 }

-/// Returns the engine to use if user did not say on cmd line.
-/// Nearly always defaults to mistralrs which has no dependencies and we include by default.
-/// If built with --no-default-features default to subprocess vllm.
-#[allow(unused_assignments, unused_mut)]
-impl Default for Output {
-    fn default() -> Self {
-        let mut out = Output::Vllm;
-
-        #[cfg(feature = "mistralrs")]
-        {
-            out = Output::MistralRs;
-        }
-
-        out
-    }
-}
-
 impl Output {
    #[allow(unused_mut)]
    pub fn available_engines() -> Vec<String> {

--- a/lib/llm/src/local_model.rs
+++ b/lib/llm/src/local_model.rs
@@ -62,6 +62,12 @@ impl LocalModel {
        &self.card.service_name
    }

+    /// Whether this model is treated as GGUF: `true` when `full_path.is_file()` holds.
+    ///
+    /// This does not inspect the file extension or contents.
+    pub fn is_gguf(&self) -> bool {
+        // GGUF is the only file (not-folder) we accept, so we don't need to check the extension.
+        // A file that is not valid GGUF is rejected later, when we come to parse it.
+        self.full_path.is_file()
+    }
+
    /// Override max number of tokens in context. We usually only do this to limit kv cache allocation.
    pub fn set_context_length(&mut self, context_length: usize) {
        self.card.context_length = context_length;

--- a/lib/llm/src/model_card/model.rs
+++ b/lib/llm/src/model_card/model.rs
@@ -226,6 +226,13 @@ impl ModelDeploymentCard {
        }
    }

+    /// Returns `true` when `model_info` is set and its `is_gguf()` is `true`.
+    /// A card with no `model_info` is reported as not GGUF.
+    pub fn is_gguf(&self) -> bool {
+        match &self.model_info {
+            Some(info) => info.is_gguf(),
+            None => false,
+        }
+    }
+
    /// Move the files this MDC uses into the NATS object store.
    /// Updates the URI's to point to NATS.
    pub async fn move_to_nats(&mut self, nats_client: nats::Client) -> Result<()> {
@@ -380,6 +387,9 @@ impl ModelInfoType {
            Self::GGUF(path) => HFConfig::from_gguf(path),
        }
    }
+    /// Returns `true` for the `GGUF` variant and `false` for every other variant.
+    pub fn is_gguf(&self) -> bool {
+        matches!(self, Self::GGUF(_))
+    }
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]