Unverified Commit 3e3c3b10 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

feat(dynamo-run): Use llama.cpp as the default engine for GGUF (#1276)

Previously `mistral.rs` was the default engine for both safetensors and GGUF models. Now it is the default only for safetensors; `llama.cpp` becomes the default for GGUF.

Why?

- Since #1177 `llama.cpp` is built-in by default, so we can switch.
- `llama.cpp` is very good at running GGUF (though it can't run other model formats), so we should switch.

Dynamo's multi-engine support gives us a secret super-power: we can use the best engine for this specific format or model.

We can still run GGUF with mistralrs by doing `out=mistralrs`.
parent f9ba6f5c
...@@ -41,13 +41,21 @@ pub enum EngineConfig { ...@@ -41,13 +41,21 @@ pub enum EngineConfig {
}, },
} }
/// Returns `true` when `in_opt` is the `Input::Endpoint` variant (whatever its payload),
/// and `false` for every other input kind.
fn is_in_dynamic(in_opt: &Input) -> bool {
    // Guard clause: anything that is not an endpoint is not dynamic.
    let Input::Endpoint(_) = in_opt else {
        return false;
    };
    true
}
/// Returns `true` only when an output was given and it is `Output::Dynamic`.
/// A missing output (`None`) is never dynamic.
fn is_out_dynamic(out_opt: &Option<Output>) -> bool {
    out_opt
        .as_ref()
        .is_some_and(|engine| matches!(engine, Output::Dynamic))
}
pub async fn run( pub async fn run(
runtime: dynamo_runtime::Runtime, runtime: dynamo_runtime::Runtime,
in_opt: Input, in_opt: Input,
out_opt: Output, out_opt: Option<Output>,
flags: Flags, flags: Flags,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
if matches!(&in_opt, Input::Endpoint(_)) && matches!(&out_opt, Output::Dynamic) { if is_in_dynamic(&in_opt) && is_out_dynamic(&out_opt) {
anyhow::bail!("Cannot use endpoint for both in and out"); anyhow::bail!("Cannot use endpoint for both in and out");
} }
...@@ -57,28 +65,26 @@ pub async fn run( ...@@ -57,28 +65,26 @@ pub async fn run(
.clone() .clone()
.or(flags.model_path_flag.clone()); .or(flags.model_path_flag.clone());
let mut local_model: LocalModel = match out_opt { let mut local_model: LocalModel = if is_out_dynamic(&out_opt) {
// If output is dynamic we are ingress and don't have a local model, but making an // If output is dynamic we are ingress and don't have a local model, but making an
// empty one cleans up the code. // empty one cleans up the code.
Output::Dynamic => Default::default(), Default::default()
} else {
// All other output types have a local model // All other output types have a local model
_ => { match &maybe_path {
match &maybe_path { Some(model_path) => {
Some(model_path) => { LocalModel::prepare(
LocalModel::prepare( model_path.to_str().context("Invalid UTF-8 in model path")?,
model_path.to_str().context("Invalid UTF-8 in model path")?, flags.model_config.as_deref(),
flags.model_config.as_deref(), flags.model_name.clone(),
flags.model_name.clone(), )
) .await?
.await? }
} None => {
None => { // echo_full engine doesn't need a path
// echo_full engine doesn't need a path match &flags.model_name {
match &flags.model_name { Some(name) => LocalModel::with_name_only(name),
Some(name) => LocalModel::with_name_only(name), None => Default::default(),
None => Default::default(),
}
} }
} }
} }
...@@ -108,6 +114,20 @@ pub async fn run( ...@@ -108,6 +114,20 @@ pub async fn run(
// We may need it later // We may need it later
let card = local_model.card().clone(); let card = local_model.card().clone();
let out_opt = out_opt.unwrap_or_else(|| {
let default_engine = if card.is_gguf() {
Output::LlamaCpp
} else {
Output::MistralRs
};
tracing::info!(
"Using default engine: {default_engine}. Use out=<engine> to specify one of {}",
Output::available_engines().join(", ")
);
default_engine
});
print_cuda(&out_opt);
// Create the engine matching `out` // Create the engine matching `out`
let engine_config = match out_opt { let engine_config = match out_opt {
Output::Dynamic => { Output::Dynamic => {
...@@ -344,3 +364,39 @@ async fn stopper( ...@@ -344,3 +364,39 @@ async fn stopper(
// Keep it alive until the engine has stopped. // Keep it alive until the engine has stopped.
drop(py_script); drop(py_script);
} }
/// Logs which acceleration backend (CUDA, Metal or Vulkan) this binary was built with,
/// or a hint to rebuild with one of them when none is enabled.
///
/// Nothing is logged unless `output` is one of the engines compiled into this binary.
// Only mistralrs and llamacpp have to be built with CUDA support.
// The Python engines only need it at runtime.
#[cfg(any(feature = "mistralrs", feature = "llamacpp"))]
fn print_cuda(output: &Output) {
    // An engine being compiled in does not mean it is the one selected for this run.
    let uses_builtin_engine = match output {
        #[cfg(feature = "mistralrs")]
        Output::MistralRs => true,
        #[cfg(feature = "llamacpp")]
        Output::LlamaCpp => true,
        _ => false,
    };
    if !uses_builtin_engine {
        return;
    }

    // `cfg!` expands to a compile-time constant, so disabled branches are dead code.
    if cfg!(feature = "cuda") {
        tracing::info!("CUDA on");
    }
    if cfg!(feature = "metal") {
        tracing::info!("Metal on");
    }
    if cfg!(feature = "vulkan") {
        tracing::info!("Vulkan on");
    }
    if !cfg!(any(feature = "cuda", feature = "metal", feature = "vulkan")) {
        tracing::info!("CPU mode. Rebuild with `--features cuda|metal|vulkan` for better performance");
    }
}

/// No-op stand-in used when neither the mistralrs nor the llamacpp feature is enabled.
#[cfg(not(any(feature = "mistralrs", feature = "llamacpp")))]
fn print_cuda(_output: &Output) {}
...@@ -103,21 +103,9 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> { ...@@ -103,21 +103,9 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
} }
None => Input::default(), None => Input::default(),
}; };
let out_opt = match out_opt { if out_opt.is_some() {
Some(x) => { non_flag_params += 1;
non_flag_params += 1; }
x
}
None => {
let default_engine = Output::default(); // smart default based on feature flags
tracing::info!(
"Using default engine: {default_engine}. Use out=<engine> to specify one of {}",
Output::available_engines().join(", ")
);
default_engine
}
};
print_cuda(&out_opt);
// Clap skips the first argument expecting it to be the binary name, so add it back // Clap skips the first argument expecting it to be the binary name, so add it back
// Note `--model-path` has index=1 (in lib.rs) so that doesn't need a flag. // Note `--model-path` has index=1 (in lib.rs) so that doesn't need a flag.
...@@ -129,39 +117,3 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> { ...@@ -129,39 +117,3 @@ async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
dynamo_run::run(runtime, in_opt, out_opt, flags).await dynamo_run::run(runtime, in_opt, out_opt, flags).await
} }
/// Logs which acceleration backend (CUDA, Metal or Vulkan) this binary was built with,
/// or a hint to rebuild with one of them when none is enabled.
///
/// Nothing is logged unless `output` is one of the engines compiled into this binary.
// Only mistralrs and llamacpp have to be built with CUDA support.
// The Python engines only need it at runtime.
#[cfg(any(feature = "mistralrs", feature = "llamacpp"))]
fn print_cuda(output: &Output) {
    // An engine being compiled in does not mean it is the one selected for this run.
    let uses_builtin_engine = match output {
        #[cfg(feature = "mistralrs")]
        Output::MistralRs => true,
        #[cfg(feature = "llamacpp")]
        Output::LlamaCpp => true,
        _ => false,
    };
    if !uses_builtin_engine {
        return;
    }

    // `cfg!` expands to a compile-time constant, so disabled branches are dead code.
    if cfg!(feature = "cuda") {
        tracing::info!("CUDA on");
    }
    if cfg!(feature = "metal") {
        tracing::info!("Metal on");
    }
    if cfg!(feature = "vulkan") {
        tracing::info!("Vulkan on");
    }
    if !cfg!(any(feature = "cuda", feature = "metal", feature = "vulkan")) {
        tracing::info!("CPU mode. Rebuild with `--features cuda|metal|vulkan` for better performance");
    }
}

/// No-op stand-in used when neither the mistralrs nor the llamacpp feature is enabled.
#[cfg(not(any(feature = "mistralrs", feature = "llamacpp")))]
fn print_cuda(_output: &Output) {}
...@@ -165,23 +165,6 @@ impl fmt::Display for Output { ...@@ -165,23 +165,6 @@ impl fmt::Display for Output {
} }
} }
/// Returns the engine to use if the user did not name one on the command line.
/// Nearly always mistralrs, which has no dependencies and is included by default.
/// When the `mistralrs` feature is off (e.g. built with `--no-default-features`),
/// falls back to subprocess vllm.
impl Default for Output {
    fn default() -> Self {
        // Exactly one of these two blocks survives cfg-stripping and becomes the
        // function's value, so no mutable placeholder or lint suppression is needed.
        #[cfg(feature = "mistralrs")]
        {
            Output::MistralRs
        }
        #[cfg(not(feature = "mistralrs"))]
        {
            Output::Vllm
        }
    }
}
impl Output { impl Output {
#[allow(unused_mut)] #[allow(unused_mut)]
pub fn available_engines() -> Vec<String> { pub fn available_engines() -> Vec<String> {
......
...@@ -62,6 +62,12 @@ impl LocalModel { ...@@ -62,6 +62,12 @@ impl LocalModel {
&self.card.service_name &self.card.service_name
} }
/// Treats the model as GGUF when `full_path` points at a file rather than a folder.
///
/// This is a heuristic, not a format check: any single file yields `true` here.
pub fn is_gguf(&self) -> bool {
// GGUF is the only file (non-folder) input we accept, so there is no need to check the extension.
// A file that is not valid GGUF is reported as an error later, when it is parsed.
self.full_path.is_file()
}
/// Override max number of tokens in context. We usually only do this to limit kv cache allocation. /// Override max number of tokens in context. We usually only do this to limit kv cache allocation.
pub fn set_context_length(&mut self, context_length: usize) { pub fn set_context_length(&mut self, context_length: usize) {
self.card.context_length = context_length; self.card.context_length = context_length;
......
...@@ -226,6 +226,13 @@ impl ModelDeploymentCard { ...@@ -226,6 +226,13 @@ impl ModelDeploymentCard {
} }
} }
/// Whether this card's model info reports GGUF.
/// Returns `false` when the card carries no model info at all.
pub fn is_gguf(&self) -> bool {
    self.model_info
        .as_ref()
        .is_some_and(|info| info.is_gguf())
}
/// Move the files this MDC uses into the NATS object store. /// Move the files this MDC uses into the NATS object store.
/// Updates the URI's to point to NATS. /// Updates the URI's to point to NATS.
pub async fn move_to_nats(&mut self, nats_client: nats::Client) -> Result<()> { pub async fn move_to_nats(&mut self, nats_client: nats::Client) -> Result<()> {
...@@ -380,6 +387,9 @@ impl ModelInfoType { ...@@ -380,6 +387,9 @@ impl ModelInfoType {
Self::GGUF(path) => HFConfig::from_gguf(path), Self::GGUF(path) => HFConfig::from_gguf(path),
} }
} }
/// True only for the `GGUF` variant, regardless of the path it carries.
pub fn is_gguf(&self) -> bool {
    // Guard clause: every non-GGUF variant falls out here.
    let Self::GGUF(_) = self else {
        return false;
    };
    true
}
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment