chore: Remove dynamo-run and mistral-rs engine (#6203)

Signed-off-by: Graham King <grahamk@nvidia.com>

chore: Remove dynamo-run and mistral-rs engine (#6203)
Signed-off-by: Graham King <grahamk@nvidia.com>
bbe82f18 · Graham King · GitHub · 2c747d64 · 2c747d64 · 2c747d64
Unverified Commit bbe82f18 authored Feb 12, 2026 by Graham King Committed by GitHub Feb 12, 2026
16 changed files
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-use anyhow::Context as _;
-use dynamo_llm::entrypoint::EngineConfig;
-use dynamo_llm::entrypoint::input::Input;
-use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
-use dynamo_runtime::distributed::{DistributedConfig, RequestPlaneMode};
-use dynamo_runtime::storage::kv;
-use dynamo_runtime::transports::nats;
-use dynamo_runtime::{DistributedRuntime, Runtime};
-
-mod flags;
-pub use flags::Flags;
-mod opt;
-pub use dynamo_llm::request_template::RequestTemplate;
-pub use opt::Output;
-
-pub async fn run(
-    runtime: Runtime,
-    in_opt: Input,
-    out_opt: Option<Output>,
-    mut flags: Flags,
-) -> anyhow::Result<()> {
-    //
-    // Download
-    //
-
-    let maybe_remote_repo = flags
-        .model_path_pos
-        .clone()
-        .or_else(|| flags.model_path_flag.clone());
-
-    // Preserve the original model identifier before downloading (for default model name)
-    let original_model_identifier = maybe_remote_repo.as_ref().map(|p| p.display().to_string());
-
-    let model_path = match maybe_remote_repo {
-        None => None,
-        Some(p) if p.exists() => {
-            // Already a local path
-            Some(p)
-        }
-        Some(p) => {
-            // model_path might be an HF repo, not a local path. Resolve it by downloading.
-            // Mocker only needs tokenizer, not weights
-            let ignore_weights = matches!(out_opt, Some(Output::Mocker));
-            Some(LocalModel::fetch(&p.display().to_string(), ignore_weights).await?)
-        }
-    };
-
-    //
-    // Configure
-    //
-
-    let mut builder = LocalModelBuilder::default();
-    builder
-        .model_name(flags.model_name.clone().or(original_model_identifier))
-        .kv_cache_block_size(flags.kv_cache_block_size)
-        // Only set if user provides. Usually loaded from tokenizer_config.json
-        .context_length(flags.context_length)
-        .http_port(flags.http_port)
-        .tls_cert_path(flags.tls_cert_path.take())
-        .tls_key_path(flags.tls_key_path.take())
-        .router_config(Some(flags.router_config()))
-        .migration_limit(flags.migration_limit)
-        .request_template(flags.request_template.clone())
-        .is_mocker(matches!(out_opt, Some(Output::Mocker)));
-
-    // Only the worker has a model path
-    if let Some(model_path) = model_path {
-        builder.model_path(model_path);
-    }
-
-    // TODO: old, address this later:
-    // If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.
-    // If not, then the endpoint isn't exposed so we let LocalModel invent one.
-    if let Input::Endpoint(path) = &in_opt {
-        builder.endpoint_id(Some(path.parse().with_context(|| path.clone())?));
-    }
-    let dst_config = if is_process_local(&in_opt, &out_opt) {
-        // We are both the frontend and backend, no networking
-        DistributedConfig::process_local()
-    } else {
-        // Normal case
-        let selected_store: kv::Selector = flags.store_kv.parse()?;
-        let request_plane: RequestPlaneMode = flags.request_plane.parse()?;
-        DistributedConfig {
-            store_backend: selected_store,
-            // We only need NATS here to monitor its metrics, so only if it's our request plane.
-            nats_config: if request_plane.is_nats() {
-                Some(nats::ClientOptions::default())
-            } else {
-                None
-            },
-            request_plane,
-        }
-    };
-    let distributed_runtime = DistributedRuntime::new(runtime.clone(), dst_config).await?;
-    let local_model = builder.build().await?;
-
-    //
-    // Create an engine
-    //
-
-    let out_opt = out_opt.unwrap_or_else(|| default_engine_for(&local_model));
-    print_cuda(&out_opt);
-
-    // Now that we know the output we're targeting, check if we expect it to work
-    flags.validate(&out_opt)?;
-
-    // Make an engine from the local_model, flags and output.
-    let engine_config = engine_for(
-        out_opt,
-        flags.clone(),
-        local_model,
-        distributed_runtime.clone(),
-    )
-    .await?;
-
-    // Run it from an input
-    dynamo_llm::entrypoint::input::run_input(distributed_runtime, in_opt, engine_config).await?;
-
-    Ok(())
-}
-
-pub fn is_in_dynamic(in_opt: &Input) -> bool {
-    matches!(in_opt, Input::Endpoint(_))
-}
-
-pub fn is_out_dynamic(out_opt: &Option<Output>) -> bool {
-    matches!(out_opt, Some(Output::Auto))
-}
-
-fn is_process_local(in_opt: &Input, out_opt: &Option<Output>) -> bool {
-    !is_in_dynamic(in_opt) && !is_out_dynamic(out_opt)
-}
-
-/// Create the engine matching `out_opt`
-/// Note validation happens in Flags::validate. In here assume everything is going to work.
-async fn engine_for(
-    out_opt: Output,
-    flags: Flags,
-    local_model: LocalModel,
-    drt: DistributedRuntime,
-) -> anyhow::Result<EngineConfig> {
-    match out_opt {
-        Output::Auto => {
-            // Auto-discover backends
-            Ok(EngineConfig::Dynamic {
-                model: Box::new(local_model),
-                chat_engine_factory: None,
-            })
-        }
-        Output::Echo => Ok(EngineConfig::InProcessText {
-            model: Box::new(local_model),
-            engine: dynamo_llm::engines::make_echo_engine(),
-        }),
-        #[cfg(feature = "mistralrs")]
-        Output::MistralRs => Ok(EngineConfig::InProcessText {
-            engine: dynamo_engine_mistralrs::make_engine(&local_model).await?,
-            model: Box::new(local_model),
-        }),
-        Output::Mocker => {
-            let args = flags.mocker_config();
-            let endpoint = local_model.endpoint_id().clone();
-
-            let engine = dynamo_llm::mocker::make_mocker_engine(drt, endpoint, args).await?;
-
-            Ok(EngineConfig::InProcessTokens {
-                engine,
-                model: Box::new(local_model),
-                is_prefill: false,
-            })
-        }
-    }
-}
-
-/// If the user will benefit from CUDA or Metal, remind them to build with it.
-/// If they have it, celebrate!
-// Only mistralrs needs to be built with CUDA.
-// The Python engines only need it at runtime.
-#[cfg(feature = "mistralrs")]
-fn print_cuda(output: &Output) {
-    // These engines may be compiled in, but are they the chosen one?
-    match output {
-        #[cfg(feature = "mistralrs")]
-        Output::MistralRs => {}
-        _ => {
-            return;
-        }
-    }
-
-    #[cfg(feature = "cuda")]
-    {
-        tracing::info!("CUDA on");
-    }
-    #[cfg(feature = "metal")]
-    {
-        tracing::info!("Metal on");
-    }
-    #[cfg(not(any(feature = "cuda", feature = "metal")))]
-    tracing::info!("CPU mode. Rebuild with `--features cuda|metal` for better performance");
-}
-
-#[cfg(not(feature = "mistralrs"))]
-fn print_cuda(_output: &Output) {}
-
-fn default_engine_for(_local_model: &LocalModel) -> Output {
-    safetensors_default()
-}
-
-fn safetensors_default() -> Output {
-    #[cfg(feature = "mistralrs")]
-    {
-        Output::MistralRs
-    }
-
-    #[cfg(not(feature = "mistralrs"))]
-    {
-        Output::Echo
-    }
-}
--- a/launch/dynamo-run/src/main.rs
+++ b/launch/dynamo-run/src/main.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-use std::env;
-
-use clap::{CommandFactory as _, Parser};
-use dynamo_runtime::config::environment_names::logging as env_logging;
-
-use dynamo_llm::entrypoint::input::Input;
-use dynamo_run::Output;
-use dynamo_runtime::logging;
-
-const HELP: &str = r#"
-dynamo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynamo locally.
-
-Verbosity:
- -v enables debug logs
- -vv enables full trace logs
- Default is info level logging
-
-Example:
- cargo build --features cuda -p dynamo-run
- cd target/debug
- ./dynamo-run Qwen/Qwen3-0.6B (OR ./dynamo-run /data/hf-checkouts/Qwen3-0.6B)
-
-See `docs/guides/dynamo_run.md` in the repo for full details.
-"#;
-
-const USAGE: &str = "USAGE: dynamo-run in=[http|grpc|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|auto|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--context-length=N] [--kv-cache-block-size=16] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--router-temperature=0.0] [--use-kv-events] [--max-num-batched-tokens=1.0] [--migration-limit=0] [--verbosity (-v|-vv)]";
-
-fn main() -> anyhow::Result<()> {
-    // Set log level based on verbosity flag
-    let log_level = match dynamo_run::Flags::try_parse() {
-        Ok(flags) => match flags.verbosity {
-            0 => "info",
-            1 => "debug",
-            2 => "trace",
-            _ => {
-                return Err(anyhow::anyhow!(
-                    "Invalid verbosity level. Valid values are v (debug) or vv (trace)"
-                ));
-            }
-        },
-        Err(_) => "info",
-    };
-
-    if log_level != "info" {
-        unsafe { std::env::set_var(env_logging::DYN_LOG, log_level) };
-    }
-
-    logging::init();
-
-    // max_worker_threads and max_blocking_threads from env vars or config file.
-    let rt_config = dynamo_runtime::RuntimeConfig::from_settings()?;
-    tracing::debug!("Runtime config: {rt_config}");
-
-    // One per process. Wraps a Runtime which holds one or two tokio runtimes.
-    let worker = dynamo_runtime::Worker::from_config(rt_config)?;
-
-    worker.execute(wrapper)
-}
-
-async fn wrapper(runtime: dynamo_runtime::Runtime) -> anyhow::Result<()> {
-    let mut in_opt = None;
-    let mut out_opt = None;
-    let args: Vec<String> = env::args().skip(1).collect();
-    if args.is_empty()
-        || args[0] == "-h"
-        || args[0] == "--help"
-        || (args.iter().all(|arg| arg == "-v" || arg == "-vv"))
-    {
-        let engine_list = Output::available_engines().join("|");
-        let usage = USAGE.replace("ENGINE_LIST", &engine_list);
-        println!("{usage}");
-        println!("{HELP}");
-        dynamo_run::Flags::command().print_long_help().unwrap();
-        return Ok(());
-    } else if args[0] == "--version" {
-        if let Some(describe) = option_env!("VERGEN_GIT_DESCRIBE") {
-            println!("dynamo-run {}", describe);
-        } else {
-            println!("Version not available (git describe not available)");
-        }
-        return Ok(());
-    }
-    for arg in env::args().skip(1).take(2) {
-        let Some((in_out, val)) = arg.split_once('=') else {
-            // Probably we're defaulting in and/or out, and this is a flag
-            continue;
-        };
-        match in_out {
-            "in" => {
-                in_opt = Some(val.try_into()?);
-            }
-            "out" => {
-                if val == "sglang" || val == "trtllm" || val == "vllm" {
-                    tracing::error!(
-                        "To run the {val} engine please use the Python interface, see root README or look in directory `examples/backends/`."
-                    );
-                    std::process::exit(1);
-                }
-
-                out_opt = Some(val.try_into()?);
-            }
-            _ => {
-                anyhow::bail!("Invalid argument, must start with 'in' or 'out. {USAGE}");
-            }
-        }
-    }
-    let mut non_flag_params = 1; // binary name
-    let in_opt = match in_opt {
-        Some(x) => {
-            non_flag_params += 1;
-            x
-        }
-        None => Input::default(),
-    };
-    if out_opt.is_some() {
-        non_flag_params += 1;
-    }
-
-    // Clap skips the first argument expecting it to be the binary name, so add it back
-    // Note `--model-path` has index=1 (in lib.rs) so that doesn't need a flag.
-    let flags = dynamo_run::Flags::try_parse_from(
-        ["dynamo-run".to_string()]
-            .into_iter()
-            .chain(env::args().skip(non_flag_params)),
-    )?;
-
-    if dynamo_run::is_in_dynamic(&in_opt) && dynamo_run::is_out_dynamic(&out_opt) {
-        anyhow::bail!("Cannot use endpoint for both in and out");
-    }
-
-    dynamo_run::run(runtime, in_opt, out_opt, flags).await
-}
--- a/launch/dynamo-run/src/opt.rs
+++ b/launch/dynamo-run/src/opt.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-use std::fmt;
-
-pub enum Output {
-    /// Echoes the prompt back as the response
-    Echo,
-
-    /// Listen for models on nats/etcd, add/remove dynamically
-    Auto,
-
-    #[cfg(feature = "mistralrs")]
-    MistralRs,
-
-    Mocker,
-}
-
-impl TryFrom<&str> for Output {
-    type Error = anyhow::Error;
-
-    fn try_from(s: &str) -> anyhow::Result<Self> {
-        match s {
-            #[cfg(feature = "mistralrs")]
-            "mistralrs" => Ok(Output::MistralRs),
-
-            "mocker" => Ok(Output::Mocker),
-            "echo" | "echo_full" => Ok(Output::Echo),
-
-            "dyn" | "auto" => Ok(Output::Auto),
-
-            e => Err(anyhow::anyhow!("Invalid out= option '{e}'")),
-        }
-    }
-}
-
-impl fmt::Display for Output {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let s = match self {
-            #[cfg(feature = "mistralrs")]
-            Output::MistralRs => "mistralrs",
-
-            Output::Mocker => "mocker",
-            Output::Echo => "echo",
-
-            Output::Auto => "auto",
-        };
-        write!(f, "{s}")
-    }
-}
-
-impl Output {
-    #[allow(unused_mut)]
-    pub fn available_engines() -> Vec<String> {
-        let mut out = vec!["echo".to_string(), Output::Mocker.to_string()];
-        #[cfg(feature = "mistralrs")]
-        {
-            out.push(Output::MistralRs.to_string());
-        }
-        out
-    }
-}
--- a/lib/bindings/python/examples/cli/cli.py
+++ b/lib/bindings/python/examples/cli/cli.py
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-# Example cli using the Python bindings, similar to `dynamo-run`.
+# Example cli using the Python bindings.
 #
 # Usage: `python cli.py in=text out=echo <your-model>`.
 # `in` can be:

--- a/lib/bindings/python/examples/hello_world/server_sglang.py
+++ b/lib/bindings/python/examples/hello_world/server_sglang.py
@@ -16,8 +16,7 @@
 # Start nats and etcd:
 #  - nats-server -js
 #
-# Window 1: `python server_sglang.py`. Wait for log "Starting endpoint".
-# Window 2: `dynamo-run out=dyn
+# `python server_sglang.py`. Wait for log "Starting endpoint".

 import argparse
 import asyncio

--- a/lib/bindings/python/examples/hello_world/server_sglang_tok.py
+++ b/lib/bindings/python/examples/hello_world/server_sglang_tok.py
@@ -17,8 +17,7 @@
 # Start nats and etcd:
 #  - nats-server -js
 #
-# Window 1: `python server_sglang.py`. Wait for log "Starting endpoint".
-# Window 2: `dynamo-run out=dyn
+# `python server_sglang.py`. Wait for log "Starting endpoint".

 import argparse
 import asyncio

--- a/lib/bindings/python/examples/hello_world/server_vllm.py
+++ b/lib/bindings/python/examples/hello_world/server_vllm.py
@@ -12,8 +12,7 @@
 # Start nats and etcd:
 #  - nats-server -js
 #
-# Window 1: `python server_vllm.py`. Wait for log "Starting endpoint".
-# Window 2: `dynamo-run out=dyn
+# `python server_vllm.py`. Wait for log "Starting endpoint".

 import argparse
 import asyncio

--- a/lib/engines/mistralrs/Cargo.toml
+++ b/lib/engines/mistralrs/Cargo.toml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-[package]
-name = "dynamo-engine-mistralrs"
-version.workspace = true
-edition.workspace = true
-description.workspace = true
-authors.workspace = true
-license.workspace = true
-homepage.workspace = true
-repository.workspace = true
-keywords.workspace = true
-
-[features]
-default = []
-cuda = ["mistralrs/cuda"]
-metal = ["mistralrs/metal"]
-
-[dependencies]
-dynamo-runtime = { workspace = true }
-dynamo-llm = { workspace = true }
-
-anyhow = { workspace = true }
-dynamo-async-openai = { workspace = true }
-async-stream = { workspace = true }
-async-trait = { workspace = true }
-either = { workspace = true }
-indexmap = { version = "2.9.0", features = ["serde"] }
-mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", version = "0.6.0", rev = "2bcf0e9e3" }
-serde_json = { workspace = true }
-tokio = { workspace = true }
-tracing = { workspace = true }
--- a/lib/engines/mistralrs/src/lib.rs
+++ b/lib/engines/mistralrs/src/lib.rs
--- a/lib/llm/src/entrypoint/input/text.rs
+++ b/lib/llm/src/entrypoint/input/text.rs
@@ -50,7 +50,7 @@ async fn main_loop(
    }
    let theme = dialoguer::theme::ColorfulTheme::default();

-    // Initial prompt is the pipe case: `echo "Hello" | dynamo-run ..`
+    // Initial prompt is from piped stdin.
    // We run that single prompt and exit
    let single = initial_prompt.is_some();
    let mut history = dialoguer::BasicHistory::default();

--- a/lib/runtime/src/logging.rs
+++ b/lib/runtime/src/logging.rs
@@ -122,7 +122,6 @@ impl Default for LoggingConfig {
                ("tokenizers".to_string(), "error".to_string()),
                ("axum".to_string(), "error".to_string()),
                ("tonic".to_string(), "error".to_string()),
-                ("mistralrs_core".to_string(), "error".to_string()),
                ("hf_hub".to_string(), "error".to_string()),
                ("opentelemetry".to_string(), "error".to_string()),
                ("opentelemetry-otlp".to_string(), "error".to_string()),

--- a/tests/lmcache/deploy-baseline-dynamo-disag.sh
+++ b/tests/lmcache/deploy-baseline-dynamo-disag.sh
@@ -24,12 +24,6 @@ echo "🚀 Starting dynamo disaggregated serving setup without LMCache:"
 echo "   Model: $MODEL_URL"
 echo "   Port: 8000"
 echo "   Mode: Disaggregated (prefill + decode workers)"
-
-# Kill any existing dynamo processes
-echo "🧹 Cleaning up any existing dynamo processes..."
-pkill -f "dynamo-run" || true
-sleep 2
-
 echo "🔧 Starting dynamo disaggregated serving without LMCache..."

 python -m dynamo.frontend &

--- a/tests/lmcache/deploy-baseline-dynamo.sh
+++ b/tests/lmcache/deploy-baseline-dynamo.sh
@@ -22,12 +22,6 @@ fi
 echo "🚀 Starting dynamo setup without LMCache:"
 echo "   Model: $MODEL_URL"
 echo "   Port: 8000"
-
-# Kill any existing dynamo processes
-echo "🧹 Cleaning up any existing dynamo processes..."
-pkill -f "dynamo-run" || true
-sleep 2
-
 echo "🔧 Starting dynamo worker without LMCache..."

 python -m dynamo.frontend &

--- a/tests/lmcache/deploy-lmcache_enabled-dynamo-disag.sh
+++ b/tests/lmcache/deploy-lmcache_enabled-dynamo-disag.sh
@@ -25,12 +25,6 @@ echo "   Model: $MODEL_URL"
 echo "   Port: 8000"
 echo "   Mode: Disaggregated (prefill + decode workers) + LMCache"
 echo "   !! Remember to kill the old dynamo processes otherwise the port will be busy !!"
-
-# Kill any existing dynamo processes
-echo "🧹 Cleaning up any existing dynamo processes..."
-pkill -f "dynamo-run" || true
-sleep 2
-
 echo "🔧 Starting dynamo disaggregated serving with LMCache enabled..."

 python -m dynamo.frontend &

--- a/tests/lmcache/deploy-lmcache_enabled-dynamo.sh
+++ b/tests/lmcache/deploy-lmcache_enabled-dynamo.sh
@@ -23,12 +23,6 @@ echo "🚀 Starting dynamo setup with LMCache:"
 echo "   Model: $MODEL_URL"
 echo "   Port: 8000"
 echo "   !! Remmber to kill the old dynamo processes other wise the port will be busy !! "
-
-# Kill any existing dynamo processes
-echo "🧹 Cleaning up any existing dynamo processes..."
-pkill -f "dynamo-run" || true
-sleep 2
-
 echo "🔧 Starting dynamo worker with LMCache enabled..."

 python -m dynamo.frontend &

--- a/tests/lmcache/run_test.sh
+++ b/tests/lmcache/run_test.sh
@@ -17,10 +17,6 @@ echo ""
 cleanup() {
    echo "🧹 Cleaning up running processes..."

-    # Kill any remaining dynamo processes
-    pkill -f "dynamo-run" || true
-    pkill -f "components/main.py" || true
-
    # Stop docker services
    docker compose -f ../../deploy/docker-compose.yml down 2>/dev/null || true