Commit 84985d3f authored by Ryan Olson, committed by GitHub
Browse files

refactor: migrate engines to standalone crates (#453)



Moved all of `lib/llm/src/engines` to their own crates as e.g. `lib/engines/mistralrs`. This will allow publishing of the `dynamo-llm` crate as it won't have any github dependencies.

The only engines in dynamo-llm will be the demo `echo` ones.
Co-authored-by: Graham King <grahamk@nvidia.com>
parent 6eb10540
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-python"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
pyo3 = { version = "0.23.3", default-features = false, features = [
"macros",
"experimental-async",
"experimental-inspect",
"py-clone",
] }
pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = [
"attributes",
"testing",
"tokio-runtime",
"unstable-streams",
] }
pythonize = { version = "0.23" }
......@@ -36,8 +36,8 @@ use tokio::sync::mpsc;
use tokio::sync::oneshot::Sender;
use tokio_stream::{wrappers::ReceiverStream, StreamExt};
use crate::backend::ExecutionContext;
use crate::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
/// Python snippet to import a file as a module
const PY_IMPORT: &CStr = cr#"
......@@ -78,7 +78,7 @@ pub async fn make_string_engine(
pyo3::prepare_freethreaded_python();
if let Ok(venv) = env::var("VIRTUAL_ENV") {
Python::with_gil(|py| {
if let Err(e) = super::fix_venv(venv, py) {
if let Err(e) = fix_venv(venv, py) {
tracing::warn!("failed to fix venv: {}", e);
}
});
......@@ -98,7 +98,7 @@ pub async fn make_token_engine(
pyo3::prepare_freethreaded_python();
if let Ok(venv) = env::var("VIRTUAL_ENV") {
Python::with_gil(|py| {
if let Err(e) = super::fix_venv(venv, py) {
if let Err(e) = fix_venv(venv, py) {
tracing::warn!("failed to fix venv: {}", e);
}
});
......@@ -360,3 +360,23 @@ where
Ok(response)
}
/// On Mac, embedded Python interpreters do not pick up the virtual env.
///
/// Computes the venv's `site-packages` directory from the interpreter's
/// major/minor version and prepends it to `sys.path` so packages installed
/// in the venv shadow any globally installed copies.
///
/// # Errors
/// Returns an error if the `sys` module cannot be imported or `sys.path`
/// cannot be mutated.
#[cfg(target_os = "macos")]
fn fix_venv(venv: String, py: Python<'_>) -> anyhow::Result<()> {
    let version_info = py.version_info();
    let sys: PyObject = py.import("sys")?.into();
    let sys_path = sys.getattr(py, "path")?;
    let venv_path = format!(
        "{venv}/lib/python{}.{}/site-packages",
        version_info.major, version_info.minor
    );
    // Insert at the front so the venv's packages take precedence over the
    // system site-packages (previously appended, which let global packages
    // shadow the venv's).
    sys_path.call_method1(py, "insert", (0, venv_path))?;
    Ok(())
}

/// No-op on non-macOS platforms, where no `sys.path` adjustment is needed.
#[cfg(not(target_os = "macos"))]
fn fix_venv(_venv: String, _py: Python<'_>) -> anyhow::Result<()> {
    Ok(())
}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-sglang"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
async_zmq = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
libc = "0.2"
pyo3 = { version = "0.23.3", default-features = false, features = [
"macros",
"experimental-async",
"experimental-inspect",
"py-clone",
] }
regex = "1"
......@@ -18,14 +18,14 @@ use std::path::{Path, PathBuf};
use async_stream::stream;
use async_trait::async_trait;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::runtime::CancellationToken;
use crate::engines::MultiNodeConfig;
pub struct SgLangEngine {
cancel_token: CancellationToken,
worker: super::worker::SgLangWorker,
......
......@@ -16,10 +16,12 @@
use std::path::{Path, PathBuf};
use std::sync::Arc;
use crate::backend::ExecutionContext;
use dynamo_llm::backend::ExecutionContext;
use dynamo_runtime::pipeline::error as pipeline_error;
use dynamo_runtime::CancellationToken;
use pyo3::prelude::*;
mod worker;
mod engine;
......@@ -35,7 +37,7 @@ pub async fn make_engine(
// Unique string to name zmq sockets
sock_code: &str,
// Multi node settings
node_conf: super::MultiNodeConfig,
node_conf: dynamo_llm::engines::MultiNodeConfig,
// How many GPUs to use
tensor_parallel_size: u32,
// The base GPU ID to start allocating GPUs from
......@@ -77,3 +79,22 @@ impl Default for MultiGPUConfig {
}
}
}
/// On Mac, embedded Python interpreters do not pick up the virtual env;
/// prepend the venv's `site-packages` to `sys.path` so its packages shadow
/// globally installed copies.
///
/// # Errors
/// Returns an error if the `sys` module cannot be imported or `sys.path`
/// cannot be mutated.
#[cfg(target_os = "macos")]
fn fix_venv(venv: String, py: Python<'_>) -> anyhow::Result<()> {
    let version_info = py.version_info();
    let sys: PyObject = py.import("sys")?.into();
    let sys_path = sys.getattr(py, "path")?;
    let venv_path = format!(
        "{venv}/lib/python{}.{}/site-packages",
        version_info.major, version_info.minor
    );
    // Insert at the front so the venv's packages take precedence over the
    // system site-packages (previously appended, which let global packages
    // shadow the venv's).
    sys_path.call_method1(py, "insert", (0, venv_path))?;
    Ok(())
}

/// No-op on non-macOS platforms, where no `sys.path` adjustment is needed.
#[cfg(not(target_os = "macos"))]
fn fix_venv(_venv: String, _py: Python<'_>) -> anyhow::Result<()> {
    Ok(())
}
......@@ -21,7 +21,7 @@ use std::{
path::{Path, PathBuf},
};
use crate::engines::MultiNodeConfig;
use dynamo_llm::engines::MultiNodeConfig;
const PY_START_ENGINE: &str = include_str!("sglang_inc.py");
......@@ -44,7 +44,7 @@ pub fn run_subprocess(
) -> anyhow::Result<()> {
pyo3::prepare_freethreaded_python(); // or enable feature "auto-initialize"
if let Ok(venv) = env::var("VIRTUAL_ENV") {
let _ = Python::with_gil(|py| crate::engines::fix_venv(venv, py));
let _ = Python::with_gil(|py| crate::fix_venv(venv, py));
}
let dir = model_path.display().to_string();
let extra_engine_args_str = &extra_engine_args
......
......@@ -40,12 +40,13 @@ use tokio::{io::AsyncReadExt as _, task::JoinHandle};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::runtime::CancellationToken;
use crate::engines::sglang::MultiGPUConfig;
use crate::engines::MultiNodeConfig;
use crate::protocols::common::llm_backend::LLMEngineOutput;
use crate::protocols::common::preprocessor::PreprocessedRequest;
use crate::protocols::common::FinishReason;
use crate::protocols::TokenIdType;
use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::protocols::common::llm_backend::LLMEngineOutput;
use dynamo_llm::protocols::common::preprocessor::PreprocessedRequest;
use dynamo_llm::protocols::common::FinishReason;
use dynamo_llm::protocols::TokenIdType;
use crate::MultiGPUConfig;
/// Wait this long for the sglang sub-process to stop after we send it a KILL
const SGLANG_STOP_TIMEOUT: Duration = Duration::from_millis(1500);
......@@ -293,7 +294,7 @@ pub async fn start(
) -> anyhow::Result<SgLangWorker> {
pyo3::prepare_freethreaded_python();
if let Ok(venv) = env::var("VIRTUAL_ENV") {
let _ = Python::with_gil(|py| crate::engines::fix_venv(venv, py));
let _ = Python::with_gil(|py| crate::fix_venv(venv, py));
}
let Sockets {
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-trtllm"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
derive_builder = {workspace = true }
futures = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tokio-util = { workspace = true }
tracing = { workspace = true }
async-openai = "0.27.2"
serde_repr = "0.1"
[build-dependencies]
bindgen = "0.70"
cmake = "0.1"
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
fn main() {
extern crate bindgen;
use cmake::Config;
use std::env;
use std::path::PathBuf;
let installed_headers = "/usr/local/include/nvidia/nvllm/nvllm_trt.h";
let local_headers = "../../bindings/cpp/nvllm-trt/include/nvidia/nvllm/nvllm_trt.h";
let headers_path;
if PathBuf::from(installed_headers).exists() {
headers_path = installed_headers;
println!("cargo:warning=nvllm found. Building with installed version...");
println!("cargo:rustc-link-search=native=/usr/local/lib");
println!("cargo:rustc-link-search=native=/opt/tensorrt_llm/lib");
println!("cargo:rustc-link-lib=dylib=nvllm_trt");
println!("cargo:rustc-link-lib=dylib=tensorrt_llm");
println!("cargo:rustc-link-lib=dylib=tensorrt_llm_nvrtc_wrapper");
println!("cargo:rustc-link-lib=dylib=nvinfer_plugin_tensorrt_llm");
println!("cargo:rustc-link-lib=dylib=decoder_attention");
println!("cargo:rerun-if-changed=/usr/local/lib");
} else if PathBuf::from(local_headers).exists() {
headers_path = local_headers;
println!("cargo:warning=nvllm not found. Building stub version...");
let dst = Config::new("../../bindings/cpp/nvllm-trt")
.define("USE_STUBS", "ON")
.no_build_target(true)
.build();
println!("cargo:warning=building stubs in {}", dst.display());
let dst = dst.canonicalize().unwrap();
println!("cargo:rustc-link-search=native={}/build", dst.display());
println!("cargo:rustc-link-lib=dylib=nvllm_trt");
println!("cargo:rustc-link-lib=dylib=tensorrt_llm");
println!("cargo:rerun-if-changed=../bindings/cpp/nvllm-trt");
} else {
panic!("nvllm_trt.h not found");
}
// generate bindings for the trtllm c api
let bindings = bindgen::Builder::default()
.header(headers_path)
.generate()
.expect("Unable to generate bindings");
// Write the bindings to a file
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
bindings
.write_to_file(out_path.join("bindings.rs"))
.expect("Could not write bindings!");
// // Build protobuf
// tonic_build::configure()
// .build_server(false)
// .compile_protos(&["../../proto/trtllm.proto"], &["../../proto"])
// .expect("Failed to compile protos");
}
......@@ -20,7 +20,7 @@ use std::ffi::CString;
use std::ptr::NonNull;
use super::protocols;
use crate::kv_router::protocols::{ForwardPassMetrics, KvCacheEvents};
use dynamo_llm::kv_router::protocols::{ForwardPassMetrics, KvCacheEvents};
mod bindings {
#![allow(warnings, missing_docs)]
......
......@@ -22,8 +22,9 @@ use futures::stream;
use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use super::Executor;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
struct State {
request_id: String,
......
......@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::kv_router::protocols::ForwardPassMetrics;
use dynamo_llm::kv_router::protocols::ForwardPassMetrics;
use std::{
sync::{
atomic::{AtomicBool, Ordering},
......
......@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::kv_router::protocols::KvCacheEvents;
use dynamo_llm::kv_router::protocols::KvCacheEvents;
use std::{
sync::{
atomic::{AtomicBool, Ordering},
......
......@@ -17,7 +17,7 @@ use std::thread;
use tokio::sync::mpsc;
use super::*;
use crate::engines::trtllm::executor::ResponseQueues;
use crate::executor::ResponseQueues;
pub struct ResponseProcessor {
handle: thread::JoinHandle<()>,
......
......@@ -158,8 +158,8 @@ impl Request {
}
// todo convert to a TryFrom
impl From<crate::protocols::common::llm_backend::BackendInput> for Request {
fn from(input: crate::protocols::common::llm_backend::BackendInput) -> Self {
impl From<dynamo_llm::protocols::common::llm_backend::BackendInput> for Request {
fn from(input: dynamo_llm::protocols::common::llm_backend::BackendInput) -> Self {
let request = RequestBuilder::default()
.input_token_ids(input.token_ids)
.max_tokens(input.stop_conditions.max_tokens.unwrap_or(16))
......
......@@ -13,4 +13,4 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub use crate::kv_router::protocols::ForwardPassMetrics;
pub use dynamo_llm::kv_router::protocols::ForwardPassMetrics;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment