Commit 84985d3f authored by Ryan Olson's avatar Ryan Olson Committed by GitHub
Browse files

refactor: migrate engines to standalone crates (#453)



Moved all of `lib/llm/src/engines` to their own crates as e.g. `lib/engines/mistralrs`. This will allow publishing of the `dynamo-llm` crate as it won't have any github dependencies.

The only engines in dynamo-llm will be the demo `echo` ones.
Co-authored-by: Graham King <grahamk@nvidia.com>
parent 6eb10540
......@@ -4,7 +4,6 @@
.helix
[Bb]inlog/
[Bb][Uu][Ii][Ll][Dd]/
[Cc][Mm][Aa][Kk][Ee]/
[Oo][Bb][Jj]/
[Oo][Uu][Tt]/
[Tt][Mm][Pp]/
......
......@@ -1500,6 +1500,120 @@ dependencies = [
"bytemuck",
]
[[package]]
name = "dynamo-engine-llamacpp"
version = "0.1.0"
dependencies = [
"async-stream",
"dynamo-llm",
"dynamo-runtime",
"llama-cpp-2",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-mistralrs"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"candle-core 0.8.4",
"dynamo-llm",
"dynamo-runtime",
"either",
"indexmap 2.8.0",
"mistralrs",
"serde_json",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-python"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"serde",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tokio-stream",
"tracing",
]
[[package]]
name = "dynamo-engine-sglang"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"libc",
"pyo3",
"regex",
"serde_json",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-trtllm"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"bindgen 0.70.1",
"cmake",
"derive_builder",
"dynamo-llm",
"dynamo-runtime",
"futures",
"serde",
"serde_json",
"serde_repr",
"thiserror 2.0.12",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "dynamo-engine-vllm"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"regex",
"serde-pickle",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-llm"
version = "0.1.0"
......@@ -1511,14 +1625,12 @@ dependencies = [
"async-trait",
"async_zmq",
"axum 0.8.1",
"bindgen 0.70.1",
"blake3",
"bs62",
"bytemuck",
"bytes",
"candle-core 0.8.4",
"chrono",
"cmake",
"cudarc 0.13.9 (git+https://github.com/coreylowman/cudarc.git?rev=8c52e735b55bf8e979e1a16bd85e3dfe4f87c9fe)",
"derive-getters",
"derive_builder",
......@@ -1529,33 +1641,23 @@ dependencies = [
"galil-seiferas",
"ggus",
"hf-hub",
"indexmap 2.8.0",
"insta",
"itertools 0.14.0",
"libc",
"llama-cpp-2",
"memmap2",
"minijinja",
"minijinja-contrib",
"mistralrs",
"ndarray",
"prometheus",
"proptest",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"rand 0.9.0",
"rayon",
"regex",
"reqwest",
"rstest 0.18.2",
"rstest_reuse",
"semver",
"sentencepiece",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum 0.27.1",
"tempfile",
"thiserror 2.0.12",
......@@ -1582,6 +1684,12 @@ dependencies = [
"async-trait",
"clap",
"dialoguer",
"dynamo-engine-llamacpp",
"dynamo-engine-mistralrs",
"dynamo-engine-python",
"dynamo-engine-sglang",
"dynamo-engine-trtllm",
"dynamo-engine-vllm",
"dynamo-llm",
"dynamo-runtime",
"futures",
......@@ -5539,9 +5647,6 @@ name = "semver"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"
dependencies = [
"serde",
]
[[package]]
name = "sentencepiece"
......
......@@ -20,6 +20,7 @@ members = [
"lib/llm",
"lib/runtime",
"lib/bindings/c",
"lib/engines/*",
]
resolver = "2"
......
......@@ -19,17 +19,18 @@
[licenses]
confidence-threshold = 0.93
allow = [
"MIT-0",
"MIT",
"Apache-2.0",
"Apache-2.0 WITH LLVM-exception",
"ISC",
"0BSD",
"BSD-2-Clause",
"BSD-3-Clause",
"OpenSSL",
"Unicode-3.0",
"BSL-1.0",
"MPL-2.0",
"MIT-0"
"MPL-2.0"
]
# TODO exceptions
......
......@@ -26,22 +26,30 @@ description = "Dynamo Run CLI"
[features]
# Build with `--no-default-features` to disable these defaults
default = ["mistralrs", "vllm", "sglang"]
mistralrs = ["dynamo-llm/mistralrs"]
sglang = ["dynamo-llm/sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
vllm = ["dynamo-llm/vllm", "dep:netlink-packet-route", "dep:rtnetlink"]
# We don't include llamacpp by default until we figure out when it needs external libraries
llamacpp = ["dynamo-llm/llamacpp"]
trtllm = ["dynamo-llm/trtllm"]
python = ["dynamo-llm/python"]
cuda = ["dynamo-llm/cuda"]
metal = ["dynamo-llm/metal"]
vulkan = ["dynamo-llm/vulkan"]
default = ["mistralrs", "vllm", "sglang"]
mistralrs = ["dep:dynamo-engine-mistralrs"]
llamacpp = ["dep:dynamo-engine-llamacpp"]
vllm = ["dep:dynamo-engine-vllm", "dep:netlink-packet-route", "dep:rtnetlink"]
sglang = ["dep:dynamo-engine-sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
trtllm = ["dep:dynamo-engine-trtllm"]
python = ["dep:dynamo-engine-python"]
cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"]
metal = ["dynamo-engine-llamacpp/metal", "dynamo-engine-mistralrs/metal"]
vulkan = ["dynamo-engine-llamacpp/vulkan"]
[dependencies]
dynamo-llm = { workspace = true }
dynamo-runtime = { workspace = true }
dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true }
dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true }
dynamo-engine-sglang = { path = "../../lib/engines/sglang", optional = true }
dynamo-engine-vllm = { path = "../../lib/engines/vllm", optional = true }
dynamo-engine-trtllm = { path = "../../lib/engines/trtllm", optional = true }
dynamo-engine-python = { path = "../../lib/engines/python", optional = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
......
......@@ -30,7 +30,6 @@ mod input;
#[cfg(any(feature = "vllm", feature = "sglang"))]
mod net;
mod opt;
mod output;
pub use opt::{Input, Output};
/// How we identify a namespace/component/endpoint URL.
......@@ -185,7 +184,7 @@ pub async fn run(
};
EngineConfig::StaticFull {
service_name: model_name,
engine: output::echo_full::make_engine_full(),
engine: dynamo_llm::engines::make_engine_full(),
}
}
Output::EchoCore => {
......@@ -197,7 +196,7 @@ pub async fn run(
card.requires_preprocessing = true;
EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine: output::echo_core::make_engine_core(),
engine: dynamo_llm::engines::make_engine_core(),
card: Box::new(card),
}
}
......@@ -215,12 +214,12 @@ pub async fn run(
};
EngineConfig::StaticFull {
service_name: model_name,
engine: dynamo_llm::engines::mistralrs::make_engine(&model_path).await?,
engine: dynamo_engine_mistralrs::make_engine(&model_path).await?,
}
}
#[cfg(feature = "sglang")]
Output::SgLang => {
use dynamo_llm::engines::sglang;
use dynamo_engine_sglang;
let Some(model_path) = model_path else {
anyhow::bail!("out=sglang requires flag --model-path=<full-path-to-model-dir>");
};
......@@ -250,7 +249,7 @@ pub async fn run(
}
}
let (engine, sglang_process) = sglang::make_engine(
let (engine, sglang_process) = dynamo_engine_sglang::make_engine(
cancel_token.clone(),
&model_path,
&sock_prefix,
......@@ -271,7 +270,6 @@ pub async fn run(
}
#[cfg(feature = "vllm")]
Output::Vllm => {
use dynamo_llm::engines::vllm;
if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
......@@ -305,7 +303,7 @@ pub async fn run(
}
if node_conf.node_rank == 0 {
// vllm multi-node only the leader runs vllm
let (engine, vllm_future) = vllm::make_leader_engine(
let (engine, vllm_future) = dynamo_engine_vllm::make_leader_engine(
cancel_token.clone(),
&model_path,
&sock_prefix,
......@@ -324,14 +322,15 @@ pub async fn run(
}
} else {
// Nodes rank > 0 only run 'ray'
let stop_future = vllm::start_follower(cancel_token.clone(), node_conf).await?;
let stop_future =
dynamo_engine_vllm::start_follower(cancel_token.clone(), node_conf).await?;
extra = Some(Box::pin(stop_future));
EngineConfig::None
}
}
#[cfg(feature = "llamacpp")]
Output::LlamaCpp => {
use dynamo_llm::engines::llamacpp;
use dynamo_engine_llamacpp;
let Some(model_path) = model_path else {
anyhow::bail!("out=llamacpp requires flag --model-path=<full-path-to-model-gguf>");
};
......@@ -343,7 +342,8 @@ pub async fn run(
"Pass --model-config so we can find the tokenizer, should be an HF checkout."
);
};
let engine = llamacpp::make_engine(cancel_token.clone(), &model_path).await?;
let engine =
dynamo_engine_llamacpp::make_engine(cancel_token.clone(), &model_path).await?;
EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine,
......@@ -352,7 +352,6 @@ pub async fn run(
}
#[cfg(feature = "trtllm")]
Output::TrtLLM => {
use dynamo_llm::engines::trtllm;
let Some(model_path) = model_path else {
anyhow::bail!("out=trtllm requires flag --model-path=<full-path-to-model-dir>");
};
......@@ -363,7 +362,10 @@ pub async fn run(
}
// Safety: Earlier we build maybe_card from model_path, which we checked right above
let card = maybe_card.clone().unwrap();
let engine = trtllm::make_engine(model_path.display(), flags.tensor_parallel_size)?;
let engine = dynamo_engine_trtllm::make_engine(
model_path.display(),
flags.tensor_parallel_size,
)?;
EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine,
......@@ -372,13 +374,13 @@ pub async fn run(
}
#[cfg(feature = "python")]
Output::PythonStr(path_str) => {
use dynamo_llm::engines::python;
let Some(model_name) = model_name else {
anyhow::bail!("Provide model service name as `--model-name <this>`");
};
let py_args = flags.as_vec(&path_str, &model_name);
let p = std::path::PathBuf::from(path_str);
let engine = python::make_string_engine(cancel_token.clone(), &p, py_args).await?;
let engine =
dynamo_engine_python::make_string_engine(cancel_token.clone(), &p, py_args).await?;
EngineConfig::StaticFull {
service_name: model_name,
engine,
......@@ -386,7 +388,6 @@ pub async fn run(
}
#[cfg(feature = "python")]
Output::PythonTok(path_str) => {
use dynamo_llm::engines::python;
let Some(card) = maybe_card.clone() else {
anyhow::bail!("Could not find tokenizer. Pass flag --model-path <path>");
};
......@@ -395,7 +396,8 @@ pub async fn run(
};
let py_args = flags.as_vec(&path_str, &model_name);
let p = std::path::PathBuf::from(path_str);
let engine = python::make_token_engine(cancel_token.clone(), &p, py_args).await?;
let engine =
dynamo_engine_python::make_token_engine(cancel_token.clone(), &p, py_args).await?;
EngineConfig::StaticCore {
service_name: model_name.clone(),
engine,
......
......@@ -51,8 +51,7 @@ fn main() -> anyhow::Result<()> {
if cfg!(feature = "sglang") {
#[cfg(feature = "sglang")]
{
use dynamo_llm::engines::sglang;
let gpu_config = sglang::MultiGPUConfig {
let gpu_config = dynamo_engine_sglang::MultiGPUConfig {
tp_size: flags.tensor_parallel_size,
tp_rank: sglang_flags.tp_rank,
gpu_id: sglang_flags.gpu_id,
......@@ -62,7 +61,7 @@ fn main() -> anyhow::Result<()> {
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return sglang::run_subprocess(
return dynamo_engine_sglang::run_subprocess(
ZMQ_SOCKET_PREFIX,
model_path,
sglang_flags.pipe_fd as std::os::fd::RawFd,
......@@ -84,13 +83,12 @@ fn main() -> anyhow::Result<()> {
if cfg!(feature = "vllm") {
#[cfg(feature = "vllm")]
{
use dynamo_llm::engines::vllm;
let node_config = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return vllm::run_subprocess(
return dynamo_engine_vllm::run_subprocess(
ZMQ_SOCKET_PREFIX,
&model_path,
node_config,
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::env;
use std::sync::LazyLock;
use std::time::Duration;
/// Pause inserted between successive echoed tokens.
///
/// Defaults to 10ms, i.e. roughly 100 tokens/second. Override via the
/// `DYN_TOKEN_ECHO_DELAY_MS` environment variable (value in milliseconds).
pub static TOKEN_ECHO_DELAY: LazyLock<Duration> = LazyLock::new(|| {
    const DEFAULT_DELAY_MS: u64 = 10;
    // An unset or unparseable environment variable silently falls back to
    // the default, matching best-effort configuration behavior.
    let millis = match env::var("DYN_TOKEN_ECHO_DELAY_MS") {
        Ok(raw) => raw.parse::<u64>().unwrap_or(DEFAULT_DELAY_MS),
        Err(_) => DEFAULT_DELAY_MS,
    };
    Duration::from_millis(millis)
});
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use async_stream::stream;
use async_trait::async_trait;
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::preprocessor::BackendInput;
use dynamo_llm::protocols::common::llm_backend::LLMEngineOutput;
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use super::common::TOKEN_ECHO_DELAY;
/// Engine that accepts pre-processed requests and echos the tokens back as the response
/// The response will include the full prompt template.
/// Useful for testing pre-processing.
struct EchoEngineCore {}

/// Build the token-level echo engine, wrapped in an `Arc` so it can be
/// shared as an `ExecutionContext`.
pub fn make_engine_core() -> ExecutionContext {
    Arc::new(EchoEngineCore {})
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
    for EchoEngineCore
{
    /// Echo every input token back as its own response delta, pacing the
    /// stream by [`TOKEN_ECHO_DELAY`], then emit a final stop message.
    async fn generate(
        &self,
        incoming_request: SingleIn<BackendInput>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
        let (request, context) = incoming_request.into_parts();
        let engine_ctx = context.context();

        // Lazily yield one delta per input token; the consumer drives pacing.
        let token_stream = stream! {
            for token_id in request.token_ids {
                tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
                yield delta_core(token_id);
            }
            // Terminate the stream with an explicit stop marker.
            yield Annotated::from_data(LLMEngineOutput::stop());
        };

        Ok(ResponseStream::new(Box::pin(token_stream), engine_ctx))
    }
}
/// Wrap a single token id in an [`LLMEngineOutput`] delta that carries no
/// decoded text, log-probs, or finish reason.
fn delta_core(tok: u32) -> Annotated<LLMEngineOutput> {
    Annotated::from_data(LLMEngineOutput {
        token_ids: vec![tok],
        tokens: None,
        text: None,
        cum_log_probs: None,
        log_probs: None,
        finish_reason: None,
    })
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use async_stream::stream;
use async_trait::async_trait;
use dynamo_llm::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
};
use dynamo_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use super::common::TOKEN_ECHO_DELAY;
/// Engine that accepts un-preprocessed requests and echos the prompt back as the response
/// Useful for testing ingress such as service-http.
struct EchoEngineFull {}

/// Build the full-request echo engine: it streams the final user message
/// back one character per delta.
pub fn make_engine_full() -> OpenAIChatCompletionsStreamingEngine {
    Arc::new(EchoEngineFull {})
}
#[async_trait]
impl
    AsyncEngine<
        SingleIn<NvCreateChatCompletionRequest>,
        ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
        Error,
    > for EchoEngineFull
{
    /// Echo the last user message back, one character per streamed delta,
    /// paced by [`TOKEN_ECHO_DELAY`], then emit a final chunk with
    /// `finish_reason = Stop`.
    ///
    /// Errors if the request has no messages, the last message is not a
    /// `User` message, or its content is not plain text.
    async fn generate(
        &self,
        incoming_request: SingleIn<NvCreateChatCompletionRequest>,
    ) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
        let (request, context) = incoming_request.transfer(());
        let deltas = request.response_generator();
        let ctx = context.context();
        // Fix: previously `.unwrap()` here panicked on an empty `messages`
        // array; return a proper error instead.
        let Some(req) = request.inner.messages.into_iter().last() else {
            anyhow::bail!("Invalid request, no messages");
        };
        let prompt = match req {
            async_openai::types::ChatCompletionRequestMessage::User(user_msg) => {
                match user_msg.content {
                    async_openai::types::ChatCompletionRequestUserMessageContent::Text(prompt) => {
                        prompt
                    }
                    _ => anyhow::bail!("Invalid request content field, expected Content::Text"),
                }
            }
            _ => anyhow::bail!("Invalid request type, expected User message"),
        };
        let output = stream! {
            // Chunk ids start at 1 and increment per delta; the final stop
            // chunk reuses the next id in sequence.
            let mut id = 1;
            for c in prompt.chars() {
                // we are returning characters not tokens, so there will be some postprocessing overhead
                tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
                let inner = deltas.create_choice(0, Some(c.to_string()), None, None);
                let response = NvCreateChatCompletionStreamResponse {
                    inner,
                };
                yield Annotated{ id: Some(id.to_string()), data: Some(response), event: None, comment: None };
                id += 1;
            }
            // Terminal chunk: no content, finish_reason = Stop.
            let inner = deltas.create_choice(0, None, Some(async_openai::types::FinishReason::Stop), None);
            let response = NvCreateChatCompletionStreamResponse {
                inner,
            };
            yield Annotated { id: Some(id.to_string()), data: Some(response), event: None, comment: None };
        };
        Ok(ResponseStream::new(Box::pin(output), ctx))
    }
}
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#
# Locate library `lib` and expose it as an IMPORTED target `target_name`
# of kind `libtype` (e.g. SHARED/STATIC), searching `hints` first.
#
# The three find_library() calls form a fallback chain: find_library caches
# its result in ${lib}_LIB_PATH, and a call whose cache variable is already
# set is a no-op. Order of preference is therefore:
#   1. Debug builds only: `${lib}${TRT_DEBUG_POSTFIX}` in `hints`.
#   2. `${lib}` in `hints` (NO_DEFAULT_PATH).
#   3. `${lib}` in the default system search paths.
macro(find_library_create_target target_name lib libtype hints)
message(
STATUS
"========================= Importing and creating target ${target_name} =========================="
)
message(STATUS "Looking for library ${lib}")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
find_library(
${lib}_LIB_PATH ${lib}${TRT_DEBUG_POSTFIX}
HINTS ${hints}
NO_DEFAULT_PATH)
endif()
find_library(${lib}_LIB_PATH ${lib} HINTS ${hints} NO_DEFAULT_PATH)
find_library(${lib}_LIB_PATH ${lib})
message(STATUS "Library that was found ${${lib}_LIB_PATH}")
add_library(${target_name} ${libtype} IMPORTED)
# IMPORTED_IMPLIB is only meaningful on Windows (import library for a DLL);
# setting it unconditionally is harmless elsewhere.
set_target_properties(
${target_name} PROPERTIES IMPORTED_LOCATION ${${lib}_LIB_PATH}
IMPORTED_IMPLIB ${${lib}_LIB_PATH})
message(
STATUS
"=========================================================================================="
)
endmacro()
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#
# Assign `value` to `variable` in the caller's scope only if `variable`
# is not already defined (mirrors a C preprocessor #ifndef default).
function(set_ifndef variable value)
if(NOT DEFINED ${variable})
set(${variable}
${value}
# PARENT_SCOPE so the assignment survives this function's scope.
PARENT_SCOPE)
endif()
endfunction()
......@@ -419,26 +419,6 @@ version = "1.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89e25b6adfb930f02d1981565a6e5d9c547ac15a96606256d3b59040e5cd4ca3"
[[package]]
name = "bindgen"
version = "0.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
dependencies = [
"bitflags 2.9.0",
"cexpr",
"clang-sys",
"itertools 0.13.0",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
"rustc-hash 1.1.0",
"shlex",
"syn 2.0.100",
]
[[package]]
name = "bit-set"
version = "0.8.0"
......@@ -573,15 +553,6 @@ dependencies = [
"shlex",
]
[[package]]
name = "cexpr"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
dependencies = [
"nom",
]
[[package]]
name = "cfg-expr"
version = "0.15.8"
......@@ -623,17 +594,6 @@ dependencies = [
"windows-link",
]
[[package]]
name = "clang-sys"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
dependencies = [
"glob",
"libc",
"libloading",
]
[[package]]
name = "clap"
version = "4.5.32"
......@@ -661,15 +621,6 @@ version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
[[package]]
name = "cmake"
version = "0.1.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0"
dependencies = [
"cc",
]
[[package]]
name = "colorchoice"
version = "1.0.3"
......@@ -1054,6 +1005,27 @@ dependencies = [
"bytemuck",
]
[[package]]
name = "dynamo-engine-python"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"serde",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tokio-stream",
"tracing",
]
[[package]]
name = "dynamo-llm"
version = "0.1.0"
......@@ -1065,14 +1037,12 @@ dependencies = [
"async-trait",
"async_zmq",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytemuck",
"bytes",
"candle-core",
"chrono",
"cmake",
"derive-getters",
"derive_builder",
"dynamo-runtime",
......@@ -1081,24 +1051,16 @@ dependencies = [
"futures",
"galil-seiferas",
"ggus",
"indexmap 2.8.0",
"itertools 0.14.0",
"libc",
"memmap2",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"rand 0.9.0",
"rayon",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.12",
"tokenizers",
......@@ -1118,6 +1080,7 @@ dependencies = [
name = "dynamo-py3"
version = "0.1.0"
dependencies = [
"dynamo-engine-python",
"dynamo-llm",
"dynamo-runtime",
"futures",
......@@ -1838,12 +1801,6 @@ version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "glob"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
[[package]]
name = "h2"
version = "0.4.8"
......@@ -2297,12 +2254,6 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "iter-read"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071ed4cc1afd86650602c7b11aa2e1ce30762a1c27193201cb5cee9c6ebb1294"
[[package]]
name = "itertools"
version = "0.11.0"
......@@ -3367,7 +3318,7 @@ dependencies = [
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash 2.1.1",
"rustc-hash",
"rustls",
"socket2",
"thiserror 2.0.12",
......@@ -3386,7 +3337,7 @@ dependencies = [
"getrandom 0.3.2",
"rand 0.9.0",
"ring",
"rustc-hash 2.1.1",
"rustc-hash",
"rustls",
"rustls-pki-types",
"slab",
......@@ -3697,12 +3648,6 @@ version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc-hash"
version = "2.1.1"
......@@ -3913,9 +3858,6 @@ name = "semver"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"
dependencies = [
"serde",
]
[[package]]
name = "seq-macro"
......@@ -3932,19 +3874,6 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde-pickle"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b641fdc8bcf2781ee78b30c599700d64ad4f412976143e4c5d0b9df906bb4843"
dependencies = [
"byteorder",
"iter-read",
"num-bigint",
"num-traits",
"serde",
]
[[package]]
name = "serde_derive"
version = "1.0.219"
......
......@@ -35,8 +35,9 @@ crate-type = ["cdylib"]
[dependencies]
dynamo-llm = { path = "../../llm", features = ["python"] }
dynamo-llm = { path = "../../llm" }
dynamo-runtime = { path = "../../runtime" }
dynamo-engine-python = { path = "../../engines/python" }
futures = { version = "0.3" }
once_cell = { version = "1.20.3" }
......
......@@ -15,7 +15,7 @@
use std::sync::Arc;
use dynamo_llm::engines::python::PythonServerStreamingEngine;
use dynamo_engine_python::PythonServerStreamingEngine;
use dynamo_runtime::CancellationToken;
pub use dynamo_runtime::{
pipeline::{async_trait, AsyncEngine, Data, ManyOut, SingleIn},
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-llamacpp"
# All package metadata is inherited from the workspace root.
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true

[features]
# CPU-only by default; enable exactly one accelerator feature below.
default = []
# Each feature forwards to the matching llama-cpp-2 build flag.
cuda = ["llama-cpp-2/cuda"]
metal = ["llama-cpp-2/metal"]
vulkan = ["llama-cpp-2/vulkan"]

[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
async-stream = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
# Pinned here rather than in the workspace: only this crate builds llama.cpp.
# Building it requires clang for bindgen (see this crate's README).
llama-cpp-2 = { version = "0.1.102" }
# llamacpp engine for dynamo
The [`llama-cpp-2`](https://crates.io/crates/llama-cpp-2) crate, used to build this Dynamo backend,
requires clang to be installed on your system. See the [bindgen user guide](https://rust-lang.github.io/rust-bindgen/requirements.html)
for more details.
......@@ -19,14 +19,12 @@ use std::{
sync::{Arc, Mutex, OnceLock},
};
use anyhow::Context;
use async_stream::stream;
use async_trait::async_trait;
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::error as pipeline_error;
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::pipeline::{async_trait, Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::CancellationToken;
use dynamo_runtime::{CancellationToken, ErrorContext, Result};
use llama_cpp_2::{
context::{params::LlamaContextParams, LlamaContext},
llama_backend::LlamaBackend,
......@@ -36,9 +34,9 @@ use llama_cpp_2::{
token::LlamaToken,
};
use crate::backend::ExecutionContext;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use crate::protocols::common::preprocessor::PreprocessedRequest;
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_llm::protocols::common::preprocessor::PreprocessedRequest;
/// If user does not provide a max_tokens limit prompt+output to this many
const DEFAULT_MAX_TOKENS: u32 = 8192;
......@@ -113,7 +111,7 @@ impl LlamacppEngine {
}
}
fn load_model(backend: &LlamaBackend, model_path: &Path) -> anyhow::Result<LlamaModel> {
fn load_model(backend: &LlamaBackend, model_path: &Path) -> Result<LlamaModel> {
let model_params = {
if cfg!(any(feature = "cuda", feature = "vulkan")) {
LlamaModelParams::default().with_n_gpu_layers(1000)
......@@ -212,7 +210,7 @@ fn run_request(
cancel_token: CancellationToken,
work_request: WorkRequest,
llama_context: &mut ContextWrapper,
) -> anyhow::Result<()> {
) -> Result<()> {
let tokens_list: Vec<LlamaToken> = work_request
.request
.token_ids
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-mistralrs"
# All package metadata is inherited from the workspace root.
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true

[features]
# CPU-only by default; enable one accelerator feature below.
default = []
# CUDA must be enabled on both mistralrs and candle-core so the two agree
# on the device backend.
cuda = ["mistralrs/cuda", "candle-core/cuda"]
metal = ["mistralrs/metal"]

[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-openai = "0.27.2"
async-stream = { workspace = true }
async-trait = { workspace = true }
candle-core = { version = "0.8.0" }
either = { workspace = true }
indexmap = { version = "2.6" }
# Git dependency pinned to a fixed rev for reproducible builds. Note that a
# git dependency prevents publishing this crate to crates.io.
mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", rev = "aaafc2ef" }
serde_json = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
......@@ -34,10 +34,10 @@ use dynamo_runtime::pipeline::error as pipeline_error;
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use crate::protocols::openai::chat_completions::{
use dynamo_llm::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
};
use crate::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
use dynamo_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
/// How many requests mistral will run at once in the paged attention scheduler.
/// It actually runs 1 fewer than this.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment