Commit 84985d3f authored by Ryan Olson, committed by GitHub
Browse files

refactor: migrate engines to standalone crates (#453)



Moved all of `lib/llm/src/engines` to their own crates as e.g. `lib/engines/mistralrs`. This will allow publishing of the `dynamo-llm` crate as it won't have any github dependencies.

The only engines in dynamo-llm will be the demo `echo` ones.
Co-authored-by: Graham King <grahamk@nvidia.com>
parent 6eb10540
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-python"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
pyo3 = { version = "0.23.3", default-features = false, features = [
"macros",
"experimental-async",
"experimental-inspect",
"py-clone",
] }
pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = [
"attributes",
"testing",
"tokio-runtime",
"unstable-streams",
] }
pythonize = { version = "0.23" }
......@@ -36,8 +36,8 @@ use tokio::sync::mpsc;
use tokio::sync::oneshot::Sender;
use tokio_stream::{wrappers::ReceiverStream, StreamExt};
use crate::backend::ExecutionContext;
use crate::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
/// Python snippet to import a file as a module
const PY_IMPORT: &CStr = cr#"
......@@ -78,7 +78,7 @@ pub async fn make_string_engine(
pyo3::prepare_freethreaded_python();
if let Ok(venv) = env::var("VIRTUAL_ENV") {
Python::with_gil(|py| {
if let Err(e) = super::fix_venv(venv, py) {
if let Err(e) = fix_venv(venv, py) {
tracing::warn!("failed to fix venv: {}", e);
}
});
......@@ -98,7 +98,7 @@ pub async fn make_token_engine(
pyo3::prepare_freethreaded_python();
if let Ok(venv) = env::var("VIRTUAL_ENV") {
Python::with_gil(|py| {
if let Err(e) = super::fix_venv(venv, py) {
if let Err(e) = fix_venv(venv, py) {
tracing::warn!("failed to fix venv: {}", e);
}
});
......@@ -360,3 +360,23 @@ where
Ok(response)
}
/// On Mac, embedded Python interpreters do not pick up the virtual env.
///
/// Computes the venv's `site-packages` directory from the interpreter's
/// major/minor version and prepends it to `sys.path` so packages installed
/// in the venv shadow any globally installed copies.
///
/// # Errors
/// Returns an error if the `sys` module cannot be imported or `sys.path`
/// cannot be mutated.
#[cfg(target_os = "macos")]
fn fix_venv(venv: String, py: Python<'_>) -> anyhow::Result<()> {
    let version_info = py.version_info();
    let sys: PyObject = py.import("sys")?.into();
    let sys_path = sys.getattr(py, "path")?;
    let venv_path = format!(
        "{venv}/lib/python{}.{}/site-packages",
        version_info.major, version_info.minor
    );
    // Insert at the front so the venv's packages take precedence over the
    // system site-packages (previously appended, which let global packages
    // shadow the venv's).
    sys_path.call_method1(py, "insert", (0, venv_path))?;
    Ok(())
}

/// No-op on non-macOS platforms, where no `sys.path` adjustment is needed.
#[cfg(not(target_os = "macos"))]
fn fix_venv(_venv: String, _py: Python<'_>) -> anyhow::Result<()> {
    Ok(())
}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-sglang"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
async_zmq = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
libc = "0.2"
pyo3 = { version = "0.23.3", default-features = false, features = [
"macros",
"experimental-async",
"experimental-inspect",
"py-clone",
] }
regex = "1"
......@@ -18,14 +18,14 @@ use std::path::{Path, PathBuf};
use async_stream::stream;
use async_trait::async_trait;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::runtime::CancellationToken;
use crate::engines::MultiNodeConfig;
pub struct SgLangEngine {
cancel_token: CancellationToken,
worker: super::worker::SgLangWorker,
......
......@@ -16,10 +16,12 @@
use std::path::{Path, PathBuf};
use std::sync::Arc;
use crate::backend::ExecutionContext;
use dynamo_llm::backend::ExecutionContext;
use dynamo_runtime::pipeline::error as pipeline_error;
use dynamo_runtime::CancellationToken;
use pyo3::prelude::*;
mod worker;
mod engine;
......@@ -35,7 +37,7 @@ pub async fn make_engine(
// Unique string to name zmq sockets
sock_code: &str,
// Multi node settings
node_conf: super::MultiNodeConfig,
node_conf: dynamo_llm::engines::MultiNodeConfig,
// How many GPUs to use
tensor_parallel_size: u32,
// The base GPU ID to start allocating GPUs from
......@@ -77,3 +79,22 @@ impl Default for MultiGPUConfig {
}
}
}
/// On Mac, embedded Python interpreters do not pick up the virtual env;
/// prepend the venv's `site-packages` to `sys.path` so its packages shadow
/// globally installed copies.
///
/// # Errors
/// Returns an error if the `sys` module cannot be imported or `sys.path`
/// cannot be mutated.
#[cfg(target_os = "macos")]
fn fix_venv(venv: String, py: Python<'_>) -> anyhow::Result<()> {
    let version_info = py.version_info();
    let sys: PyObject = py.import("sys")?.into();
    let sys_path = sys.getattr(py, "path")?;
    let venv_path = format!(
        "{venv}/lib/python{}.{}/site-packages",
        version_info.major, version_info.minor
    );
    // Insert at the front so the venv's packages take precedence over the
    // system site-packages (previously appended, which let global packages
    // shadow the venv's).
    sys_path.call_method1(py, "insert", (0, venv_path))?;
    Ok(())
}

/// No-op on non-macOS platforms, where no `sys.path` adjustment is needed.
#[cfg(not(target_os = "macos"))]
fn fix_venv(_venv: String, _py: Python<'_>) -> anyhow::Result<()> {
    Ok(())
}
......@@ -21,7 +21,7 @@ use std::{
path::{Path, PathBuf},
};
use crate::engines::MultiNodeConfig;
use dynamo_llm::engines::MultiNodeConfig;
const PY_START_ENGINE: &str = include_str!("sglang_inc.py");
......@@ -44,7 +44,7 @@ pub fn run_subprocess(
) -> anyhow::Result<()> {
pyo3::prepare_freethreaded_python(); // or enable feature "auto-initialize"
if let Ok(venv) = env::var("VIRTUAL_ENV") {
let _ = Python::with_gil(|py| crate::engines::fix_venv(venv, py));
let _ = Python::with_gil(|py| crate::fix_venv(venv, py));
}
let dir = model_path.display().to_string();
let extra_engine_args_str = &extra_engine_args
......
......@@ -40,12 +40,13 @@ use tokio::{io::AsyncReadExt as _, task::JoinHandle};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::runtime::CancellationToken;
use crate::engines::sglang::MultiGPUConfig;
use crate::engines::MultiNodeConfig;
use crate::protocols::common::llm_backend::LLMEngineOutput;
use crate::protocols::common::preprocessor::PreprocessedRequest;
use crate::protocols::common::FinishReason;
use crate::protocols::TokenIdType;
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::protocols::common::llm_backend::LLMEngineOutput;
use dynamo_llm::protocols::common::preprocessor::PreprocessedRequest;
use dynamo_llm::protocols::common::FinishReason;
use dynamo_llm::protocols::TokenIdType;
use crate::MultiGPUConfig;
/// Wait this long for the sglang sub-process to stop after we send it a KILL
const SGLANG_STOP_TIMEOUT: Duration = Duration::from_millis(1500);
......@@ -293,7 +294,7 @@ pub async fn start(
) -> anyhow::Result<SgLangWorker> {
pyo3::prepare_freethreaded_python();
if let Ok(venv) = env::var("VIRTUAL_ENV") {
let _ = Python::with_gil(|py| crate::engines::fix_venv(venv, py));
let _ = Python::with_gil(|py| crate::fix_venv(venv, py));
}
let Sockets {
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-trtllm"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
derive_builder = {workspace = true }
futures = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio-util = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
serde_repr = "0.1"
[build-dependencies]
bindgen = "0.70"
cmake = "0.1"
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
fn main() {
extern crate bindgen;
use cmake::Config;
use std::env;
use std::path::PathBuf;
let installed_headers = "/usr/local/include/nvidia/nvllm/nvllm_trt.h";
let local_headers = "../../bindings/cpp/nvllm-trt/include/nvidia/nvllm/nvllm_trt.h";
let headers_path;
if PathBuf::from(installed_headers).exists() {
headers_path = installed_headers;
println!("cargo:warning=nvllm found. Building with installed version...");
println!("cargo:rustc-link-search=native=/usr/local/lib");
println!("cargo:rustc-link-search=native=/opt/tensorrt_llm/lib");
println!("cargo:rustc-link-lib=dylib=nvllm_trt");
println!("cargo:rustc-link-lib=dylib=tensorrt_llm");
println!("cargo:rustc-link-lib=dylib=tensorrt_llm_nvrtc_wrapper");
println!("cargo:rustc-link-lib=dylib=nvinfer_plugin_tensorrt_llm");
println!("cargo:rustc-link-lib=dylib=decoder_attention");
println!("cargo:rerun-if-changed=/usr/local/lib");
} else if PathBuf::from(local_headers).exists() {
headers_path = local_headers;
println!("cargo:warning=nvllm not found. Building stub version...");
let dst = Config::new("../../bindings/cpp/nvllm-trt")
.define("USE_STUBS", "ON")
.no_build_target(true)
.build();
println!("cargo:warning=building stubs in {}", dst.display());
let dst = dst.canonicalize().unwrap();
println!("cargo:rustc-link-search=native={}/build", dst.display());
println!("cargo:rustc-link-lib=dylib=nvllm_trt");
println!("cargo:rustc-link-lib=dylib=tensorrt_llm");
println!("cargo:rerun-if-changed=../bindings/cpp/nvllm-trt");
} else {
panic!("nvllm_trt.h not found");
}
// generate bindings for the trtllm c api
let bindings = bindgen::Builder::default()
.header(headers_path)
.generate()
.expect("Unable to generate bindings");
// Write the bindings to a file
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
bindings
.write_to_file(out_path.join("bindings.rs"))
.expect("Could not write bindings!");
// // Build protobuf
// tonic_build::configure()
// .build_server(false)
// .compile_protos(&["../../proto/trtllm.proto"], &["../../proto"])
// .expect("Failed to compile protos");
}
......@@ -20,7 +20,7 @@ use std::ffi::CString;
use std::ptr::NonNull;
use super::protocols;
use crate::kv_router::protocols::{ForwardPassMetrics, KvCacheEvents};
use dynamo_llm::kv_router::protocols::{ForwardPassMetrics, KvCacheEvents};
mod bindings {
#![allow(warnings, missing_docs)]
......
......@@ -22,8 +22,9 @@ use futures::stream;
use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use super::Executor;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
struct State {
request_id: String,
......
......@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::kv_router::protocols::ForwardPassMetrics;
use dynamo_llm::kv_router::protocols::ForwardPassMetrics;
use std::{
sync::{
atomic::{AtomicBool, Ordering},
......
......@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::kv_router::protocols::KvCacheEvents;
use dynamo_llm::kv_router::protocols::KvCacheEvents;
use std::{
sync::{
atomic::{AtomicBool, Ordering},
......
......@@ -17,7 +17,7 @@ use std::thread;
use tokio::sync::mpsc;
use super::*;
use crate::engines::trtllm::executor::ResponseQueues;
use crate::executor::ResponseQueues;
pub struct ResponseProcessor {
handle: thread::JoinHandle<()>,
......
......@@ -158,8 +158,8 @@ impl Request {
}
// todo convert to a TryFrom
impl From<crate::protocols::common::llm_backend::BackendInput> for Request {
fn from(input: crate::protocols::common::llm_backend::BackendInput) -> Self {
impl From<dynamo_llm::protocols::common::llm_backend::BackendInput> for Request {
fn from(input: dynamo_llm::protocols::common::llm_backend::BackendInput) -> Self {
let request = RequestBuilder::default()
.input_token_ids(input.token_ids)
.max_tokens(input.stop_conditions.max_tokens.unwrap_or(16))
......
......@@ -13,4 +13,4 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub use crate::kv_router::protocols::ForwardPassMetrics;
pub use dynamo_llm::kv_router::protocols::ForwardPassMetrics;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment