Unverified commit 182d3b5d, authored by Graham King and committed by GitHub
Browse files

chore(bindings): Remove mistralrs / llama.cpp (#1970)

parent def6eaa9
This diff is collapsed.
...@@ -36,14 +36,10 @@ crate-type = ["cdylib", "rlib"] ...@@ -36,14 +36,10 @@ crate-type = ["cdylib", "rlib"]
[features] [features]
default = [] default = []
block-manager = ["dynamo-llm/block-manager", "dep:dlpark"] block-manager = ["dynamo-llm/block-manager", "dep:dlpark"]
mistralrs = ["dep:dynamo-engine-mistralrs"]
llamacpp = ["dep:dynamo-engine-llamacpp"]
[dependencies] [dependencies]
dynamo-llm = { path = "../../llm" } dynamo-llm = { path = "../../llm" }
dynamo-runtime = { path = "../../runtime" } dynamo-runtime = { path = "../../runtime" }
dynamo-engine-mistralrs = { path = "../../engines/mistralrs", features = ["cuda"], optional = true }
dynamo-engine-llamacpp = { path = "../../engines/llamacpp", features = ["cuda", "dynamic-link"], optional = true }
anyhow = { version = "1" } anyhow = { version = "1" }
async-openai = { version = "0.29.0" } async-openai = { version = "0.29.0" }
......
...@@ -46,26 +46,6 @@ uv pip install maturin ...@@ -46,26 +46,6 @@ uv pip install maturin
maturin develop --uv maturin develop --uv
``` ```
5. Experimental: To allow using mistral.rs and llama.cpp via the bindings, build with feature flags:
```
maturin develop --features mistralrs,llamacpp --release
```
`--release` is optional. It builds slower but the resulting library is significantly faster.
See `examples/cli/cli.py` for usage.
They will both be built for CUDA by default. If you see a runtime error `CUDA_ERROR_STUB_LIBRARY` this is because
the stub `libcuda.so` is earlier on the library search path than the real libcuda. Try removing the `rpath` from the library:
```
patchelf --set-rpath '' _core.cpython-312-x86_64-linux-gnu.so
```
If you include the `llamacpp` feature flag, `libllama.so` and `libggml.so` (and family) will need to be available at runtime.
## Run Examples ## Run Examples
### Prerequisite ### Prerequisite
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Example cli using the Python bindings, similar to `dynamo-run`. # Example cli using the Python bindings, similar to `dynamo-run`.
# #
# Usage: `python cli.py in=text out=mistralrs <your-model>`. # Usage: `python cli.py in=text out=echo <your-model>`.
# `in` can be: # `in` can be:
# - "http": OpenAI compliant HTTP server # - "http": OpenAI compliant HTTP server
# - "text": Interactive text chat # - "text": Interactive text chat
...@@ -13,28 +13,12 @@ ...@@ -13,28 +13,12 @@
# #
# `out` can be: # `out` can be:
# - "dyn": Run as the frontend node. Auto-discover workers and route traffic to them. # - "dyn": Run as the frontend node. Auto-discover workers and route traffic to them.
# - "mistralrs", "llamacpp", "sglang", "vllm", "trtllm", "echo": An LLM worker. # - "sglang", "vllm", "trtllm", "echo": An LLM worker.
# #
# Must be in a virtualenv with the Dynamo bindings (or wheel) installed. # Must be in a virtualenv with the Dynamo bindings (or wheel) installed.
# #
# To use mistralrs or llamacpp you must build the library with those features: # There is no provided llama.cpp engine here, but there is one in components/llama_cpp/. It would be
# ``` # easy enough to copy the few Python lines from there to here and add an `out=llama_cpp`.
# maturin develop --features mistralrs,llamacpp --release
# ```
#
# `--release` is optional. It builds slower but the resulting library is significantly faster.
#
# They will both be built for CUDA by default. If you see a runtime error `CUDA_ERROR_STUB_LIBRARY` this is because
# the stub `libcuda.so` is earlier on the library search path than the real libcuda. Try removing
# the `rpath` from the library:
#
# ```
# patchelf --set-rpath '' _core.cpython-312-x86_64-linux-gnu.so
# ```
#
# If you include the `llamacpp` feature flag, `libllama.so` and `libggml.so` (and family) will need to be
# available at runtime.
#
import argparse import argparse
import asyncio import asyncio
...@@ -79,7 +63,7 @@ def parse_args(): ...@@ -79,7 +63,7 @@ def parse_args():
# --- Step 2: Argparse for flags and the model path --- # --- Step 2: Argparse for flags and the model path ---
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Dynamo example CLI: Connect inputs to an engine", description="Dynamo example CLI: Connect inputs to an engine",
usage="python cli.py in=text out=mistralrs <your-model>", usage="python cli.py in=text out=echo <your-model>",
formatter_class=argparse.RawTextHelpFormatter, # To preserve multi-line help formatting formatter_class=argparse.RawTextHelpFormatter, # To preserve multi-line help formatting
) )
...@@ -186,8 +170,6 @@ async def run(): ...@@ -186,8 +170,6 @@ async def run():
engine_type_map = { engine_type_map = {
"echo": EngineType.Echo, "echo": EngineType.Echo,
"mistralrs": EngineType.MistralRs,
"llamacpp": EngineType.LlamaCpp,
"dyn": EngineType.Dynamic, "dyn": EngineType.Dynamic,
} }
out_mode = args["out_mode"] out_mode = args["out_mode"]
......
...@@ -17,10 +17,8 @@ use dynamo_runtime::protocols::Endpoint as EndpointId; ...@@ -17,10 +17,8 @@ use dynamo_runtime::protocols::Endpoint as EndpointId;
#[repr(i32)] #[repr(i32)]
pub enum EngineType { pub enum EngineType {
Echo = 1, Echo = 1,
MistralRs = 2, Dynamic = 2,
LlamaCpp = 3, Mocker = 3,
Dynamic = 4,
Mocker = 5,
} }
#[pyclass] #[pyclass]
...@@ -157,40 +155,6 @@ async fn select_engine( ...@@ -157,40 +155,6 @@ async fn select_engine(
model: Box::new(local_model), model: Box::new(local_model),
} }
} }
EngineType::MistralRs => {
#[cfg(feature = "mistralrs")]
{
RsEngineConfig::StaticFull {
engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
model: Box::new(local_model),
}
}
#[cfg(not(feature = "mistralrs"))]
{
anyhow::bail!(
"mistralrs engine is not enabled. Rebuild bindings with `--features mistralrs`"
);
}
}
EngineType::LlamaCpp => {
#[cfg(feature = "llamacpp")]
{
RsEngineConfig::StaticCore {
engine: dynamo_engine_llamacpp::make_engine(
distributed_runtime.inner.primary_token(),
&local_model,
)
.await?,
model: Box::new(local_model),
}
}
#[cfg(not(feature = "llamacpp"))]
{
anyhow::bail!(
"llamacpp engine is not enabled. Rebuild bindings with `--features llamacpp`"
);
}
}
}; };
Ok(inner) Ok(inner)
......
...@@ -30,7 +30,7 @@ use crate::config::{self, RuntimeConfig}; ...@@ -30,7 +30,7 @@ use crate::config::{self, RuntimeConfig};
use futures::Future; use futures::Future;
use once_cell::sync::OnceCell; use once_cell::sync::OnceCell;
use std::sync::{Arc, Mutex}; use std::sync::Arc;
use tokio::{signal, task::JoinHandle}; use tokio::{signal, task::JoinHandle};
pub use tokio_util::sync::CancellationToken; pub use tokio_util::sync::CancellationToken;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment