Unverified Commit 182d3b5d authored by Graham King's avatar Graham King Committed by GitHub
Browse files

chore(bindings): Remove mistralrs / llama.cpp (#1970)

parent def6eaa9
This diff is collapsed.
......@@ -36,14 +36,10 @@ crate-type = ["cdylib", "rlib"]
[features]
default = []
block-manager = ["dynamo-llm/block-manager", "dep:dlpark"]
mistralrs = ["dep:dynamo-engine-mistralrs"]
llamacpp = ["dep:dynamo-engine-llamacpp"]
[dependencies]
dynamo-llm = { path = "../../llm" }
dynamo-runtime = { path = "../../runtime" }
dynamo-engine-mistralrs = { path = "../../engines/mistralrs", features = ["cuda"], optional = true }
dynamo-engine-llamacpp = { path = "../../engines/llamacpp", features = ["cuda", "dynamic-link"], optional = true }
anyhow = { version = "1" }
async-openai = { version = "0.29.0" }
......
......@@ -46,26 +46,6 @@ uv pip install maturin
maturin develop --uv
```
5. Experimental: To allow using mistral.rs and llama.cpp via the bindings, build with feature flags:
```
maturin develop --features mistralrs,llamacpp --release
```
`--release` is optional. The build takes longer, but the resulting library is significantly faster.
See `examples/cli/cli.py` for usage.
They will both be built for CUDA by default. If you see the runtime error `CUDA_ERROR_STUB_LIBRARY`, it is because
the stub `libcuda.so` appears earlier on the library search path than the real `libcuda.so`. Try removing the `rpath` from the library:
```
patchelf --set-rpath '' _core.cpython-312-x86_64-linux-gnu.so
```
If you include the `llamacpp` feature flag, `libllama.so` and `libggml.so` (and family) will need to be available at runtime.
## Run Examples
### Prerequisite
......
......@@ -3,7 +3,7 @@
# Example cli using the Python bindings, similar to `dynamo-run`.
#
# Usage: `python cli.py in=text out=mistralrs <your-model>`.
# Usage: `python cli.py in=text out=echo <your-model>`.
# `in` can be:
# - "http": OpenAI compliant HTTP server
# - "text": Interactive text chat
......@@ -13,28 +13,12 @@
#
# `out` can be:
# - "dyn": Run as the frontend node. Auto-discover workers and route traffic to them.
# - "mistralrs", "llamacpp", "sglang", "vllm", "trtllm", "echo": An LLM worker.
# - "sglang", "vllm", "trtllm", "echo": An LLM worker.
#
# Must be in a virtualenv with the Dynamo bindings (or wheel) installed.
#
# To use mistralrs or llamacpp you must build the library with those features:
# ```
# maturin develop --features mistralrs,llamacpp --release
# ```
#
# `--release` is optional. The build takes longer, but the resulting library is significantly faster.
#
# They will both be built for CUDA by default. If you see the runtime error `CUDA_ERROR_STUB_LIBRARY`, it is because
# the stub `libcuda.so` appears earlier on the library search path than the real `libcuda.so`. Try removing
# the `rpath` from the library:
#
# ```
# patchelf --set-rpath '' _core.cpython-312-x86_64-linux-gnu.so
# ```
#
# If you include the `llamacpp` feature flag, `libllama.so` and `libggml.so` (and family) will need to be
# available at runtime.
#
# There is no provided llama.cpp engine here, but there is one in components/llama_cpp/. It would be
# easy enough to copy the few Python lines from there to here and add an `out=llama_cpp`.
import argparse
import asyncio
......@@ -79,7 +63,7 @@ def parse_args():
# --- Step 2: Argparse for flags and the model path ---
parser = argparse.ArgumentParser(
description="Dynamo example CLI: Connect inputs to an engine",
usage="python cli.py in=text out=mistralrs <your-model>",
usage="python cli.py in=text out=echo <your-model>",
formatter_class=argparse.RawTextHelpFormatter, # To preserve multi-line help formatting
)
......@@ -186,8 +170,6 @@ async def run():
engine_type_map = {
"echo": EngineType.Echo,
"mistralrs": EngineType.MistralRs,
"llamacpp": EngineType.LlamaCpp,
"dyn": EngineType.Dynamic,
}
out_mode = args["out_mode"]
......
......@@ -17,10 +17,8 @@ use dynamo_runtime::protocols::Endpoint as EndpointId;
#[repr(i32)]
pub enum EngineType {
Echo = 1,
MistralRs = 2,
LlamaCpp = 3,
Dynamic = 4,
Mocker = 5,
Dynamic = 2,
Mocker = 3,
}
#[pyclass]
......@@ -157,40 +155,6 @@ async fn select_engine(
model: Box::new(local_model),
}
}
EngineType::MistralRs => {
#[cfg(feature = "mistralrs")]
{
RsEngineConfig::StaticFull {
engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
model: Box::new(local_model),
}
}
#[cfg(not(feature = "mistralrs"))]
{
anyhow::bail!(
"mistralrs engine is not enabled. Rebuild bindings with `--features mistralrs`"
);
}
}
EngineType::LlamaCpp => {
#[cfg(feature = "llamacpp")]
{
RsEngineConfig::StaticCore {
engine: dynamo_engine_llamacpp::make_engine(
distributed_runtime.inner.primary_token(),
&local_model,
)
.await?,
model: Box::new(local_model),
}
}
#[cfg(not(feature = "llamacpp"))]
{
anyhow::bail!(
"llamacpp engine is not enabled. Rebuild bindings with `--features llamacpp`"
);
}
}
};
Ok(inner)
......
......@@ -30,7 +30,7 @@ use crate::config::{self, RuntimeConfig};
use futures::Future;
use once_cell::sync::OnceCell;
use std::sync::{Arc, Mutex};
use std::sync::Arc;
use tokio::{signal, task::JoinHandle};
pub use tokio_util::sync::CancellationToken;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment