Unverified Commit bbe82f18 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

chore: Remove dynamo-run and mistral-rs engine (#6203)


Signed-off-by: Graham King <grahamk@nvidia.com>
parent 2c747d64
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use anyhow::Context as _;
use dynamo_llm::entrypoint::EngineConfig;
use dynamo_llm::entrypoint::input::Input;
use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
use dynamo_runtime::distributed::{DistributedConfig, RequestPlaneMode};
use dynamo_runtime::storage::kv;
use dynamo_runtime::transports::nats;
use dynamo_runtime::{DistributedRuntime, Runtime};
mod flags;
pub use flags::Flags;
mod opt;
pub use dynamo_llm::request_template::RequestTemplate;
pub use opt::Output;
/// Resolves the model, builds the distributed runtime and the engine, then serves
/// requests from `in_opt` until the input finishes.
///
/// * `runtime` - process runtime that the distributed runtime is built on.
/// * `in_opt` - where requests come from.
/// * `out_opt` - engine to target; when `None` a default is chosen once the model is built.
/// * `flags` - parsed command line flags.
///
/// # Errors
/// Propagates failures from fetching the model, parsing the endpoint / store / request
/// plane settings, building the runtime or model, flag validation, engine creation and
/// from running the input.
pub async fn run(
    runtime: Runtime,
    in_opt: Input,
    out_opt: Option<Output>,
    mut flags: Flags,
) -> anyhow::Result<()> {
    // Mocker only needs tokenizer, not weights
    let is_mocker = matches!(out_opt, Some(Output::Mocker));

    //
    // Download
    //

    // The positional model path wins over the `--model-path` style flag.
    let requested_model = flags
        .model_path_pos
        .clone()
        .or_else(|| flags.model_path_flag.clone());
    // Keep the identifier exactly as the user wrote it (used as the default model name),
    // because downloading replaces it with a local path.
    let requested_name = requested_model
        .as_ref()
        .map(|path| path.display().to_string());
    let model_path = match requested_model {
        Some(candidate) if !candidate.exists() => {
            // Not a local path, so it might be an HF repo. Resolve it by downloading.
            let repo = candidate.display().to_string();
            Some(LocalModel::fetch(&repo, is_mocker).await?)
        }
        // Either already a local path, or no model was given at all.
        local_or_absent => local_or_absent,
    };

    //
    // Configure
    //

    let mut builder = LocalModelBuilder::default();
    builder
        .model_name(flags.model_name.clone().or(requested_name))
        .kv_cache_block_size(flags.kv_cache_block_size)
        // Only set if user provides. Usually loaded from tokenizer_config.json
        .context_length(flags.context_length)
        .http_port(flags.http_port)
        .tls_cert_path(flags.tls_cert_path.take())
        .tls_key_path(flags.tls_key_path.take())
        .router_config(Some(flags.router_config()))
        .migration_limit(flags.migration_limit)
        .request_template(flags.request_template.clone())
        .is_mocker(is_mocker);

    // Only the worker has a model path
    if let Some(resolved_path) = model_path {
        builder.model_path(resolved_path);
    }

    // TODO: old, address this later:
    // If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.
    // If not, then the endpoint isn't exposed so we let LocalModel invent one.
    if let Input::Endpoint(path) = &in_opt {
        let endpoint_id = path.parse().with_context(|| path.clone())?;
        builder.endpoint_id(Some(endpoint_id));
    }

    let dst_config = if is_process_local(&in_opt, &out_opt) {
        // We are both the frontend and backend, no networking
        DistributedConfig::process_local()
    } else {
        // Normal case
        let store_backend: kv::Selector = flags.store_kv.parse()?;
        let request_plane: RequestPlaneMode = flags.request_plane.parse()?;
        // We only need NATS here to monitor its metrics, so only if it's our request plane.
        let nats_config = request_plane
            .is_nats()
            .then(nats::ClientOptions::default);
        DistributedConfig {
            store_backend,
            nats_config,
            request_plane,
        }
    };
    let distributed_runtime = DistributedRuntime::new(runtime.clone(), dst_config).await?;
    let local_model = builder.build().await?;

    //
    // Create an engine
    //

    let engine_choice = out_opt.unwrap_or_else(|| default_engine_for(&local_model));
    print_cuda(&engine_choice);

    // Now that we know the output we're targeting, check if we expect it to work
    flags.validate(&engine_choice)?;

    // Make an engine from the local_model, flags and output.
    let engine_config = engine_for(
        engine_choice,
        flags.clone(),
        local_model,
        distributed_runtime.clone(),
    )
    .await?;

    // Run it from an input
    dynamo_llm::entrypoint::input::run_input(distributed_runtime, in_opt, engine_config).await?;
    Ok(())
}
/// Returns `true` when requests arrive over an endpoint (`Input::Endpoint`), and
/// `false` for every other input kind.
pub fn is_in_dynamic(in_opt: &Input) -> bool {
    matches!(*in_opt, Input::Endpoint(..))
}
/// Returns `true` only when an output was chosen and it is `Output::Auto`.
/// An unset output (`None`) is not considered dynamic.
pub fn is_out_dynamic(out_opt: &Option<Output>) -> bool {
    out_opt
        .as_ref()
        .is_some_and(|engine| matches!(engine, Output::Auto))
}
/// Returns `true` when neither side is dynamic, i.e. this process is both the
/// frontend and the backend and needs no networking.
fn is_process_local(in_opt: &Input, out_opt: &Option<Output>) -> bool {
    !(is_in_dynamic(in_opt) || is_out_dynamic(out_opt))
}
/// Builds the [`EngineConfig`] that corresponds to `out_opt`.
///
/// Validation is the job of `Flags::validate`; this function assumes the combination
/// of output and flags is going to work.
///
/// # Errors
/// Returns the error from constructing the mistralrs or mocker engine.
async fn engine_for(
    out_opt: Output,
    flags: Flags,
    local_model: LocalModel,
    drt: DistributedRuntime,
) -> anyhow::Result<EngineConfig> {
    let config = match out_opt {
        // Auto-discover backends
        Output::Auto => EngineConfig::Dynamic {
            model: Box::new(local_model),
            chat_engine_factory: None,
        },
        Output::Echo => EngineConfig::InProcessText {
            model: Box::new(local_model),
            engine: dynamo_llm::engines::make_echo_engine(),
        },
        #[cfg(feature = "mistralrs")]
        Output::MistralRs => {
            // The engine borrows the model, so build it before the model is boxed.
            let engine = dynamo_engine_mistralrs::make_engine(&local_model).await?;
            EngineConfig::InProcessText {
                engine,
                model: Box::new(local_model),
            }
        }
        Output::Mocker => {
            let mocker_args = flags.mocker_config();
            let endpoint_id = local_model.endpoint_id().clone();
            let engine =
                dynamo_llm::mocker::make_mocker_engine(drt, endpoint_id, mocker_args).await?;
            EngineConfig::InProcessTokens {
                engine,
                model: Box::new(local_model),
                is_prefill: false,
            }
        }
    };
    Ok(config)
}
/// Logs which accelerator support this binary was compiled with, but only when the
/// chosen engine is mistralrs.
///
/// If the user would benefit from CUDA or Metal, this reminds them to build with it.
// Only mistralrs needs to be built with CUDA.
// The Python engines only need it at runtime.
#[cfg(feature = "mistralrs")]
fn print_cuda(output: &Output) {
    // The engine may be compiled in, but is it the chosen one?
    if !matches!(output, Output::MistralRs) {
        return;
    }

    #[cfg(feature = "cuda")]
    tracing::info!("CUDA on");

    #[cfg(feature = "metal")]
    tracing::info!("Metal on");

    #[cfg(not(any(feature = "cuda", feature = "metal")))]
    tracing::info!("CPU mode. Rebuild with `--features cuda|metal` for better performance");
}
/// No-op variant used when the `mistralrs` feature is disabled: there is nothing to
/// report, but callers can still call `print_cuda` unconditionally.
#[cfg(not(feature = "mistralrs"))]
fn print_cuda(_output: &Output) {}
/// Picks the engine to use when the caller did not specify an output.
///
/// The model is currently not inspected (hence `_local_model`); the choice is
/// delegated to `safetensors_default`.
fn default_engine_for(_local_model: &LocalModel) -> Output {
safetensors_default()
}
/// Default engine: `Output::MistralRs` when the `mistralrs` feature is compiled in,
/// otherwise `Output::Echo`.
fn safetensors_default() -> Output {
    // Exactly one of these two bindings survives conditional compilation.
    #[cfg(feature = "mistralrs")]
    let engine = Output::MistralRs;
    #[cfg(not(feature = "mistralrs"))]
    let engine = Output::Echo;

    engine
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::env;
use clap::{CommandFactory as _, Parser};
use dynamo_runtime::config::environment_names::logging as env_logging;
use dynamo_llm::entrypoint::input::Input;
use dynamo_run::Output;
use dynamo_runtime::logging;
/// Free-form help text. `wrapper` prints it right after the usage line and before
/// clap's generated long help.
const HELP: &str = r#"
dynamo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynamo locally.
Verbosity:
- -v enables debug logs
- -vv enables full trace logs
- Default is info level logging
Example:
- cargo build --features cuda -p dynamo-run
- cd target/debug
- ./dynamo-run Qwen/Qwen3-0.6B (OR ./dynamo-run /data/hf-checkouts/Qwen3-0.6B)
See `docs/guides/dynamo_run.md` in the repo for full details.
"#;
/// One-line usage summary. `ENGINE_LIST` is a placeholder that `wrapper` replaces with
/// the `|`-joined result of `Output::available_engines()` before printing.
const USAGE: &str = "USAGE: dynamo-run in=[http|grpc|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|auto|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--context-length=N] [--kv-cache-block-size=16] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--router-temperature=0.0] [--use-kv-events] [--max-num-batched-tokens=1.0] [--migration-limit=0] [--verbosity (-v|-vv)]";
/// Process entry point: derives the log level from the verbosity flag, initialises
/// logging, builds the runtime worker and runs `wrapper` on it.
///
/// # Errors
/// Fails on a verbosity above `-vv`, on an invalid runtime configuration, or with
/// whatever the worker / `wrapper` returns.
fn main() -> anyhow::Result<()> {
    // Set log level based on verbosity flag.
    // A parse failure is deliberately ignored here and treated like "no verbosity given".
    let verbosity = dynamo_run::Flags::try_parse()
        .map(|flags| flags.verbosity)
        .ok();
    let log_level = match verbosity {
        None | Some(0) => "info",
        Some(1) => "debug",
        Some(2) => "trace",
        Some(_) => {
            return Err(anyhow::anyhow!(
                "Invalid verbosity level. Valid values are v (debug) or vv (trace)"
            ));
        }
    };
    if log_level != "info" {
        // SAFETY: NOTE(review): this relies on no other thread reading or writing the
        // environment yet. It runs before logging and the runtime worker are created
        // below; confirm nothing earlier spawns threads.
        unsafe { std::env::set_var(env_logging::DYN_LOG, log_level) };
    }
    logging::init();

    // max_worker_threads and max_blocking_threads from env vars or config file.
    let rt_config = dynamo_runtime::RuntimeConfig::from_settings()?;
    tracing::debug!("Runtime config: {rt_config}");

    // One per process. Wraps a Runtime which holds one or two tokio runtimes.
    let worker = dynamo_runtime::Worker::from_config(rt_config)?;
    worker.execute(wrapper)
}
/// Runs on the runtime worker: handles help / version requests, extracts the optional
/// leading `in=` and `out=` arguments, parses everything else as `dynamo_run::Flags`
/// and hands over to `dynamo_run::run`.
///
/// Only the first two command line arguments are inspected for `in=` / `out=`.
/// A flag written as `--name=value` in one of those positions is left for clap to
/// parse instead of being rejected as an invalid in/out argument.
///
/// # Errors
/// Fails when `in=` / `out=` carry an unknown value, when one of the first two
/// arguments is a `key=value` pair that is neither `in`, `out` nor a flag, when clap
/// rejects the remaining arguments, when both input and output are endpoints, or with
/// whatever `dynamo_run::run` returns.
async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
    // Usage line with the `ENGINE_LIST` placeholder filled in, shared by help and errors.
    let usage_text = || USAGE.replace("ENGINE_LIST", &Output::available_engines().join("|"));

    let mut in_opt = None;
    let mut out_opt = None;
    let args: Vec<String> = env::args().skip(1).collect();
    if args.is_empty()
        || args[0] == "-h"
        || args[0] == "--help"
        || args.iter().all(|arg| arg == "-v" || arg == "-vv")
    {
        let usage = usage_text();
        println!("{usage}");
        println!("{HELP}");
        // Propagate stdout failures (e.g. a closed pipe) instead of panicking.
        dynamo_run::Flags::command().print_long_help()?;
        return Ok(());
    } else if args[0] == "--version" {
        if let Some(describe) = option_env!("VERGEN_GIT_DESCRIBE") {
            println!("dynamo-run {}", describe);
        } else {
            println!("Version not available (git describe not available)");
        }
        return Ok(());
    }

    for arg in args.iter().take(2) {
        let Some((in_out, val)) = arg.split_once('=') else {
            // Probably we're defaulting in and/or out, and this is a flag
            continue;
        };
        match in_out {
            "in" => {
                in_opt = Some(val.try_into()?);
            }
            "out" => {
                if val == "sglang" || val == "trtllm" || val == "vllm" {
                    tracing::error!(
                        "To run the {val} engine please use the Python interface, see root README or look in directory `examples/backends/`."
                    );
                    std::process::exit(1);
                }
                out_opt = Some(val.try_into()?);
            }
            // A flag in `--name=value` form (e.g. `--context-length=4096`); clap handles it.
            flag if flag.starts_with('-') => continue,
            _ => {
                anyhow::bail!(
                    "Invalid argument, must start with 'in' or 'out'. {}",
                    usage_text()
                );
            }
        }
    }

    let mut non_flag_params = 1; // binary name
    let in_opt = match in_opt {
        Some(x) => {
            non_flag_params += 1;
            x
        }
        None => Input::default(),
    };
    if out_opt.is_some() {
        non_flag_params += 1;
    }

    // Clap skips the first argument expecting it to be the binary name, so add it back
    // Note `--model-path` has index=1 (in lib.rs) so that doesn't need a flag.
    let flags = dynamo_run::Flags::try_parse_from(
        ["dynamo-run".to_string()]
            .into_iter()
            .chain(env::args().skip(non_flag_params)),
    )?;

    if dynamo_run::is_in_dynamic(&in_opt) && dynamo_run::is_out_dynamic(&out_opt) {
        anyhow::bail!("Cannot use endpoint for both in and out");
    }

    dynamo_run::run(runtime, in_opt, out_opt, flags).await
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::fmt;
/// The engine (or discovery mode) that requests are sent to, selected with `out=`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Output {
    /// Echos the prompt back as the response
    Echo,
    /// Listen for models on nats/etcd, add/remove dynamically
    Auto,
    /// Run the model in-process with the mistralrs engine.
    /// Only available when built with the `mistralrs` feature.
    #[cfg(feature = "mistralrs")]
    MistralRs,
    /// Mock engine. Only needs the tokenizer, not the model weights.
    Mocker,
}
/// Parses the value of an `out=` argument.
impl TryFrom<&str> for Output {
    type Error = anyhow::Error;

    /// Maps an engine name to its [`Output`] variant.
    ///
    /// `echo_full` is accepted as an alias of `echo`, and `dyn` as an alias of `auto`.
    ///
    /// # Errors
    /// Returns an error naming the rejected value when it is not a known engine.
    fn try_from(s: &str) -> anyhow::Result<Self> {
        let engine = match s {
            #[cfg(feature = "mistralrs")]
            "mistralrs" => Output::MistralRs,
            "mocker" => Output::Mocker,
            "echo" | "echo_full" => Output::Echo,
            "dyn" | "auto" => Output::Auto,
            _ => return Err(anyhow::anyhow!("Invalid out= option '{e}'", e = s)),
        };
        Ok(engine)
    }
}
/// Renders an [`Output`] as its lowercase engine name.
impl fmt::Display for Output {
    /// Writes the engine name, using the same spelling that `out=` accepts.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(match self {
            #[cfg(feature = "mistralrs")]
            Output::MistralRs => "mistralrs",
            Output::Mocker => "mocker",
            Output::Echo => "echo",
            Output::Auto => "auto",
        })
    }
}
impl Output {
    /// Names of the engines compiled into this binary, in the spelling `out=` accepts.
    ///
    /// Always contains `echo` and the mocker; the mistralrs engine is appended when
    /// the `mistralrs` feature is enabled.
    // `mut` is only needed when the `mistralrs` feature pushes an extra entry.
    #[allow(unused_mut)]
    pub fn available_engines() -> Vec<String> {
        let mut engines = Vec::from([String::from("echo"), Output::Mocker.to_string()]);
        #[cfg(feature = "mistralrs")]
        engines.push(Output::MistralRs.to_string());
        engines
    }
}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Example cli using the Python bindings, similar to `dynamo-run`.
# Example cli using the Python bindings.
#
# Usage: `python cli.py in=text out=echo <your-model>`.
# `in` can be:
......
......@@ -16,8 +16,7 @@
# Start nats and etcd:
# - nats-server -js
#
# Window 1: `python server_sglang.py`. Wait for log "Starting endpoint".
# Window 2: `dynamo-run out=dyn
# `python server_sglang.py`. Wait for log "Starting endpoint".
import argparse
import asyncio
......
......@@ -17,8 +17,7 @@
# Start nats and etcd:
# - nats-server -js
#
# Window 1: `python server_sglang.py`. Wait for log "Starting endpoint".
# Window 2: `dynamo-run out=dyn
# `python server_sglang.py`. Wait for log "Starting endpoint".
import argparse
import asyncio
......
......@@ -12,8 +12,7 @@
# Start nats and etcd:
# - nats-server -js
#
# Window 1: `python server_vllm.py`. Wait for log "Starting endpoint".
# Window 2: `dynamo-run out=dyn
# `python server_vllm.py`. Wait for log "Starting endpoint".
import argparse
import asyncio
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
[package]
name = "dynamo-engine-mistralrs"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[features]
default = []
cuda = ["mistralrs/cuda"]
metal = ["mistralrs/metal"]
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
dynamo-async-openai = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
either = { workspace = true }
indexmap = { version = "2.9.0", features = ["serde"] }
mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", version = "0.6.0", rev = "2bcf0e9e3" }
serde_json = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
This diff is collapsed.
......@@ -50,7 +50,7 @@ async fn main_loop(
}
let theme = dialoguer::theme::ColorfulTheme::default();
// Initial prompt is the pipe case: `echo "Hello" | dynamo-run ..`
// Initial prompt is from piped stdin.
// We run that single prompt and exit
let single = initial_prompt.is_some();
let mut history = dialoguer::BasicHistory::default();
......
......@@ -122,7 +122,6 @@ impl Default for LoggingConfig {
("tokenizers".to_string(), "error".to_string()),
("axum".to_string(), "error".to_string()),
("tonic".to_string(), "error".to_string()),
("mistralrs_core".to_string(), "error".to_string()),
("hf_hub".to_string(), "error".to_string()),
("opentelemetry".to_string(), "error".to_string()),
("opentelemetry-otlp".to_string(), "error".to_string()),
......
......@@ -24,12 +24,6 @@ echo "🚀 Starting dynamo disaggregated serving setup without LMCache:"
echo " Model: $MODEL_URL"
echo " Port: 8000"
echo " Mode: Disaggregated (prefill + decode workers)"
# Kill any existing dynamo processes
echo "🧹 Cleaning up any existing dynamo processes..."
pkill -f "dynamo-run" || true
sleep 2
echo "🔧 Starting dynamo disaggregated serving without LMCache..."
python -m dynamo.frontend &
......
......@@ -22,12 +22,6 @@ fi
echo "🚀 Starting dynamo setup without LMCache:"
echo " Model: $MODEL_URL"
echo " Port: 8000"
# Kill any existing dynamo processes
echo "🧹 Cleaning up any existing dynamo processes..."
pkill -f "dynamo-run" || true
sleep 2
echo "🔧 Starting dynamo worker without LMCache..."
python -m dynamo.frontend &
......
......@@ -25,12 +25,6 @@ echo " Model: $MODEL_URL"
echo " Port: 8000"
echo " Mode: Disaggregated (prefill + decode workers) + LMCache"
echo " !! Remember to kill the old dynamo processes otherwise the port will be busy !!"
# Kill any existing dynamo processes
echo "🧹 Cleaning up any existing dynamo processes..."
pkill -f "dynamo-run" || true
sleep 2
echo "🔧 Starting dynamo disaggregated serving with LMCache enabled..."
python -m dynamo.frontend &
......
......@@ -23,12 +23,6 @@ echo "🚀 Starting dynamo setup with LMCache:"
echo " Model: $MODEL_URL"
echo " Port: 8000"
echo " !! Remmber to kill the old dynamo processes other wise the port will be busy !! "
# Kill any existing dynamo processes
echo "🧹 Cleaning up any existing dynamo processes..."
pkill -f "dynamo-run" || true
sleep 2
echo "🔧 Starting dynamo worker with LMCache enabled..."
python -m dynamo.frontend &
......
......@@ -17,10 +17,6 @@ echo ""
cleanup() {
echo "🧹 Cleaning up running processes..."
# Kill any remaining dynamo processes
pkill -f "dynamo-run" || true
pkill -f "components/main.py" || true
# Stop docker services
docker compose -f ../../deploy/docker-compose.yml down 2>/dev/null || true
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment