chore(bindings): Remove mistralrs / llama.cpp (#1970)

182d3b5d · Graham King · GitHub · def6eaa9 · 182d3b5d · 182d3b5d
Unverified Commit 182d3b5d authored Jul 16, 2025 by Graham King Committed by GitHub Jul 16, 2025
6 changed files
--- a/lib/bindings/python/Cargo.lock
+++ b/lib/bindings/python/Cargo.lock
--- a/lib/bindings/python/Cargo.toml
+++ b/lib/bindings/python/Cargo.toml
@@ -36,14 +36,10 @@ crate-type = ["cdylib", "rlib"]
 [features]
 default = []
 block-manager = ["dynamo-llm/block-manager", "dep:dlpark"]
-mistralrs = ["dep:dynamo-engine-mistralrs"]
-llamacpp = ["dep:dynamo-engine-llamacpp"]
 [dependencies]
 dynamo-llm = { path = "../../llm" }
 dynamo-runtime = { path = "../../runtime" }
-dynamo-engine-mistralrs = { path = "../../engines/mistralrs", features = ["cuda"], optional = true }
-dynamo-engine-llamacpp = { path = "../../engines/llamacpp", features = ["cuda", "dynamic-link"], optional = true }
 anyhow = { version = "1" }
 async-openai = { version = "0.29.0" }

--- a/lib/bindings/python/README.md
+++ b/lib/bindings/python/README.md
@@ -46,26 +46,6 @@ uv pip install maturin
 maturin develop --uv
 ```
-5. Experimental: To allow using mistral.rs and llama.cpp via the bindings, build with feature flags:
-```
-maturin develop --features mistralrs,llamacpp --release
-```
-`--release` is optional. It builds slower but the resulting library is significantly faster.
-See `examples/cli/cli.py` for usage.
-They will both be built for CUDA by default. If you see a runtime error `CUDA_ERROR_STUB_LIBRARY` this is because
-the stub `libcuda.so` is earlier on the library search path than the real libcuda. Try removing the `rpath` from the library:
-```
-patchelf --set-rpath '' _core.cpython-312-x86_64-linux-gnu.so
-```
-If you include the `llamacpp` feature flag, `libllama.so` and `libggml.so` (and family) will need to be available at runtime.
 ## Run Examples
 ### Prerequisite

--- a/lib/bindings/python/examples/cli/cli.py
+++ b/lib/bindings/python/examples/cli/cli.py
@@ -3,7 +3,7 @@
 # Example cli using the Python bindings, similar to `dynamo-run`.
 #
-# Usage: `python cli.py in=text out=mistralrs <your-model>`.
+# Usage: `python cli.py in=text out=echo <your-model>`.
 # `in` can be:
 # - "http": OpenAI compliant HTTP server
 # - "text": Interactive text chat
@@ -13,28 +13,12 @@
 #
 # `out` can be:
 # - "dyn": Run as the frontend node. Auto-discover workers and route traffic to them.
-# - "mistralrs", "llamacpp", "sglang", "vllm", "trtllm", "echo": An LLM worker.
+# - "sglang", "vllm", "trtllm", "echo": An LLM worker.
 #
 # Must be in a virtualenv with the Dynamo bindings (or wheel) installed.
 #
-# To use mistralrs or llamacpp you must build the library with those features:
-# ```
-# maturin develop --features mistralrs,llamacpp --release
-# ```
-#
-# `--release` is optional. It builds slower but the resulting library is significantly faster.
-#
-# They will both be built for CUDA by default. If you see a runtime error `CUDA_ERROR_STUB_LIBRARY` this is because
-# the stub `libcuda.so` is earlier on the library search path than the real libcuda. Try removing
-# the `rpath` from the library:
-#
-# ```
-# patchelf --set-rpath '' _core.cpython-312-x86_64-linux-gnu.so
-# ```
-#
-# If you include the `llamacpp` feature flag, `libllama.so` and `libggml.so` (and family) will need to be
-# available at runtime.
-#
+# There is no provided llama.cpp engine here, but there is one in components/llama_cpp/. It would be
+# easy enough to copy the few Python lines from there to here and add an `out=llama_cpp`.
 import argparse
 import asyncio
@@ -79,7 +63,7 @@ def parse_args():
    # --- Step 2: Argparse for flags and the model path ---
    parser = argparse.ArgumentParser(
        description="Dynamo example CLI: Connect inputs to an engine",
-        usage="python cli.py in=text out=mistralrs <your-model>",
+        usage="python cli.py in=text out=echo <your-model>",
        formatter_class=argparse.RawTextHelpFormatter,  # To preserve multi-line help formatting
    )
@@ -186,8 +170,6 @@ async def run():
    engine_type_map = {
        "echo": EngineType.Echo,
-        "mistralrs": EngineType.MistralRs,
-        "llamacpp": EngineType.LlamaCpp,
        "dyn": EngineType.Dynamic,
    }
    out_mode = args["out_mode"]

--- a/lib/bindings/python/rust/llm/entrypoint.rs
+++ b/lib/bindings/python/rust/llm/entrypoint.rs
@@ -17,10 +17,8 @@ use dynamo_runtime::protocols::Endpoint as EndpointId;
 #[repr(i32)]
 pub enum EngineType {
    Echo = 1,
-    MistralRs = 2,
+    Dynamic = 2,
-    LlamaCpp = 3,
+    Mocker = 3,
-    Dynamic = 4,
-    Mocker = 5,
 }
 #[pyclass]
@@ -157,40 +155,6 @@ async fn select_engine(
                model: Box::new(local_model),
            }
        }
-        EngineType::MistralRs => {
-            #[cfg(feature = "mistralrs")]
-            {
-                RsEngineConfig::StaticFull {
-                    engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
-                    model: Box::new(local_model),
-                }
-            }
-            #[cfg(not(feature = "mistralrs"))]
-            {
-                anyhow::bail!(
-                    "mistralrs engine is not enabled. Rebuild bindings with `--features mistralrs`"
-                );
-            }
-        }
-        EngineType::LlamaCpp => {
-            #[cfg(feature = "llamacpp")]
-            {
-                RsEngineConfig::StaticCore {
-                    engine: dynamo_engine_llamacpp::make_engine(
-                        distributed_runtime.inner.primary_token(),
-                        &local_model,
-                    )
-                    .await?,
-                    model: Box::new(local_model),
-                }
-            }
-            #[cfg(not(feature = "llamacpp"))]
-            {
-                anyhow::bail!(
-                    "llamacpp engine is not enabled. Rebuild bindings with `--features llamacpp`"
-                );
-            }
-        }
    };
    Ok(inner)

--- a/lib/runtime/src/runtime.rs
+++ b/lib/runtime/src/runtime.rs
@@ -30,7 +30,7 @@ use crate::config::{self, RuntimeConfig};
 use futures::Future;
 use once_cell::sync::OnceCell;
-use std::sync::{Arc, Mutex};
+use std::sync::Arc;
 use tokio::{signal, task::JoinHandle};
 pub use tokio_util::sync::CancellationToken;