Unverified Commit 3e3c3b10 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

feat(dynamo-run): Use llama.cpp as the default engine for GGUF (#1276)

Previously `mistral.rs` was the default engine for both safetensors and GGUF models. Now it is the default only for safetensors; `llama.cpp` becomes the default for GGUF.

Why?

- Since #1177 `llama.cpp` is built-in by default, so we can switch.
- `llama.cpp` is very good at running GGUF (though it cannot run other model formats), so we should switch.

Dynamo's multi-engine support gives us a secret super-power: we can use the best engine for this specific format or model.

We can still run GGUF with mistralrs by doing `out=mistralrs`.
parent f9ba6f5c
......@@ -41,13 +41,21 @@ pub enum EngineConfig {
},
}
/// Returns `true` when the input side is `Input::Endpoint`.
///
/// `run` combines this with `is_out_dynamic` to reject a configuration that
/// uses an endpoint for both input and output.
fn is_in_dynamic(in_opt: &Input) -> bool {
matches!(in_opt, Input::Endpoint(_))
}
/// Returns `true` only when an output was given and it is `Output::Dynamic`.
///
/// `None` (no output selected) yields `false`.
fn is_out_dynamic(out_opt: &Option<Output>) -> bool {
matches!(out_opt, Some(Output::Dynamic))
}
pub async fn run(
runtime: dynamo_runtime::Runtime,
in_opt: Input,
out_opt: Output,
out_opt: Option<Output>,
flags: Flags,
) -> anyhow::Result<()> {
if matches!(&in_opt, Input::Endpoint(_)) && matches!(&out_opt, Output::Dynamic) {
if is_in_dynamic(&in_opt) && is_out_dynamic(&out_opt) {
anyhow::bail!("Cannot use endpoint for both in and out");
}
......@@ -57,13 +65,12 @@ pub async fn run(
.clone()
.or(flags.model_path_flag.clone());
let mut local_model: LocalModel = match out_opt {
let mut local_model: LocalModel = if is_out_dynamic(&out_opt) {
// If output is dynamic we are ingress and don't have a local model, but making an
// empty one cleans up the code.
Output::Dynamic => Default::default(),
Default::default()
} else {
// All other output types have a local model
_ => {
match &maybe_path {
Some(model_path) => {
LocalModel::prepare(
......@@ -81,7 +88,6 @@ pub async fn run(
}
}
}
}
};
// Only set if user provides. Usually loaded from tokenizer_config.json
......@@ -108,6 +114,20 @@ pub async fn run(
// We may need it later
let card = local_model.card().clone();
let out_opt = out_opt.unwrap_or_else(|| {
let default_engine = if card.is_gguf() {
Output::LlamaCpp
} else {
Output::MistralRs
};
tracing::info!(
"Using default engine: {default_engine}. Use out=<engine> to specify one of {}",
Output::available_engines().join(", ")
);
default_engine
});
print_cuda(&out_opt);
// Create the engine matching `out`
let engine_config = match out_opt {
Output::Dynamic => {
......@@ -344,3 +364,39 @@ async fn stopper(
// Keep it alive until the engine has stopped.
drop(py_script);
}
/// Log which accelerator backend (CUDA, Metal or Vulkan) this binary was built
/// with, or suggest rebuilding with one when none is enabled.
///
/// Nothing is logged unless `output` is mistralrs or llamacpp and that engine
/// is compiled into this binary.
// Only mistralrs and llamacpp need to be built with CUDA.
// The Python engines only need it at runtime.
#[cfg(any(feature = "mistralrs", feature = "llamacpp"))]
fn print_cuda(output: &Output) {
    // An engine may be compiled in without being the one chosen for this run.
    let engine_built_with_accel = match output {
        #[cfg(feature = "mistralrs")]
        Output::MistralRs => true,
        #[cfg(feature = "llamacpp")]
        Output::LlamaCpp => true,
        _ => false,
    };
    if !engine_built_with_accel {
        return;
    }

    // One line per enabled backend; the CPU hint appears only when none is enabled.
    #[cfg(feature = "cuda")]
    tracing::info!("CUDA on");
    #[cfg(feature = "metal")]
    tracing::info!("Metal on");
    #[cfg(feature = "vulkan")]
    tracing::info!("Vulkan on");
    #[cfg(not(any(feature = "cuda", feature = "metal", feature = "vulkan")))]
    tracing::info!("CPU mode. Rebuild with `--features cuda|metal|vulkan` for better performance");
}
/// No-op stand-in for `print_cuda`, compiled when neither the `mistralrs` nor
/// the `llamacpp` feature is enabled, so call sites need no `cfg` of their own.
#[cfg(not(any(feature = "mistralrs", feature = "llamacpp")))]
fn print_cuda(_output: &Output) {}
......@@ -103,21 +103,9 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
}
None => Input::default(),
};
let out_opt = match out_opt {
Some(x) => {
if out_opt.is_some() {
non_flag_params += 1;
x
}
None => {
let default_engine = Output::default(); // smart default based on feature flags
tracing::info!(
"Using default engine: {default_engine}. Use out=<engine> to specify one of {}",
Output::available_engines().join(", ")
);
default_engine
}
};
print_cuda(&out_opt);
// Clap skips the first argument expecting it to be the binary name, so add it back
// Note `--model-path` has index=1 (in lib.rs) so that doesn't need a flag.
......@@ -129,39 +117,3 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
dynamo_run::run(runtime, in_opt, out_opt, flags).await
}
/// Log which accelerator backend (CUDA, Metal or Vulkan) this binary was built
/// with, or suggest rebuilding with one when none is enabled.
///
/// Nothing is logged unless `output` is mistralrs or llamacpp and that engine
/// is compiled into this binary.
// Only mistralrs and llamacpp need to be built with CUDA.
// The Python engines only need it at runtime.
#[cfg(any(feature = "mistralrs", feature = "llamacpp"))]
fn print_cuda(output: &Output) {
    // An engine may be compiled in without being the one chosen for this run.
    let engine_built_with_accel = match output {
        #[cfg(feature = "mistralrs")]
        Output::MistralRs => true,
        #[cfg(feature = "llamacpp")]
        Output::LlamaCpp => true,
        _ => false,
    };
    if !engine_built_with_accel {
        return;
    }

    // One line per enabled backend; the CPU hint appears only when none is enabled.
    #[cfg(feature = "cuda")]
    tracing::info!("CUDA on");
    #[cfg(feature = "metal")]
    tracing::info!("Metal on");
    #[cfg(feature = "vulkan")]
    tracing::info!("Vulkan on");
    #[cfg(not(any(feature = "cuda", feature = "metal", feature = "vulkan")))]
    tracing::info!("CPU mode. Rebuild with `--features cuda|metal|vulkan` for better performance");
}
/// No-op stand-in for `print_cuda`, compiled when neither the `mistralrs` nor
/// the `llamacpp` feature is enabled, so call sites need no `cfg` of their own.
#[cfg(not(any(feature = "mistralrs", feature = "llamacpp")))]
fn print_cuda(_output: &Output) {}
......@@ -165,23 +165,6 @@ impl fmt::Display for Output {
}
}
/// Returns the engine to use if the user did not name one on the command line.
///
/// Resolves to `Output::MistralRs` whenever the `mistralrs` feature is compiled
/// in, and to `Output::Vllm` otherwise (for example when built with
/// `--no-default-features`).
impl Default for Output {
    fn default() -> Self {
        // Exactly one of the two blocks below survives cfg-stripping and becomes
        // the tail expression, so no mutable placeholder or lint suppression is
        // required.
        #[cfg(feature = "mistralrs")]
        {
            Output::MistralRs
        }
        #[cfg(not(feature = "mistralrs"))]
        {
            Output::Vllm
        }
    }
}
impl Output {
#[allow(unused_mut)]
pub fn available_engines() -> Vec<String> {
......
......@@ -62,6 +62,12 @@ impl LocalModel {
&self.card.service_name
}
/// Whether this model is treated as GGUF.
///
/// This is purely a filesystem check: it returns whatever `full_path.is_file()`
/// reports and does not look at the file extension or contents.
pub fn is_gguf(&self) -> bool {
// GGUF is the only file (not-folder) we accept, so we don't need to check the extension
// We will error when we come to parse it
self.full_path.is_file()
}
/// Override max number of tokens in context. We usually only do this to limit kv cache allocation.
pub fn set_context_length(&mut self, context_length: usize) {
self.card.context_length = context_length;
......
......@@ -226,6 +226,13 @@ impl ModelDeploymentCard {
}
}
/// Whether this card describes a GGUF model.
///
/// Delegates to `is_gguf` on the card's `model_info`; a card with no
/// `model_info` is reported as not GGUF.
pub fn is_gguf(&self) -> bool {
    self.model_info
        .as_ref()
        .is_some_and(|info| info.is_gguf())
}
/// Move the files this MDC uses into the NATS object store.
/// Updates the URI's to point to NATS.
pub async fn move_to_nats(&mut self, nats_client: nats::Client) -> Result<()> {
......@@ -380,6 +387,9 @@ impl ModelInfoType {
Self::GGUF(path) => HFConfig::from_gguf(path),
}
}
/// Returns `true` when this value is the `GGUF` variant.
pub fn is_gguf(&self) -> bool {
matches!(self, Self::GGUF(_))
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment