Commit 84985d3f authored by Ryan Olson's avatar Ryan Olson Committed by GitHub
Browse files

refactor: migrate engines to standalone crates (#453)



Moved all of `lib/llm/src/engines` to their own crates as e.g. `lib/engines/mistralrs`. This will allow publishing of the `dynamo-llm` crate as it won't have any github dependencies.

The only engines in dynamo-llm will be the demo `echo` ones.
Co-authored-by: Graham King <grahamk@nvidia.com>
parent 6eb10540
......@@ -4,7 +4,6 @@
.helix
[Bb]inlog/
[Bb][Uu][Ii][Ll][Dd]/
[Cc][Mm][Aa][Kk][Ee]/
[Oo][Bb][Jj]/
[Oo][Uu][Tt]/
[Tt][Mm][Pp]/
......
......@@ -1500,6 +1500,120 @@ dependencies = [
"bytemuck",
]
[[package]]
name = "dynamo-engine-llamacpp"
version = "0.1.0"
dependencies = [
"async-stream",
"dynamo-llm",
"dynamo-runtime",
"llama-cpp-2",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-mistralrs"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"candle-core 0.8.4",
"dynamo-llm",
"dynamo-runtime",
"either",
"indexmap 2.8.0",
"mistralrs",
"serde_json",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-python"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"serde",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tokio-stream",
"tracing",
]
[[package]]
name = "dynamo-engine-sglang"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"libc",
"pyo3",
"regex",
"serde_json",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-engine-trtllm"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"bindgen 0.70.1",
"cmake",
"derive_builder",
"dynamo-llm",
"dynamo-runtime",
"futures",
"serde",
"serde_json",
"serde_repr",
"thiserror 2.0.12",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "dynamo-engine-vllm"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"async_zmq",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"regex",
"serde-pickle",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tracing",
]
[[package]]
name = "dynamo-llm"
version = "0.1.0"
......@@ -1511,14 +1625,12 @@ dependencies = [
"async-trait",
"async_zmq",
"axum 0.8.1",
"bindgen 0.70.1",
"blake3",
"bs62",
"bytemuck",
"bytes",
"candle-core 0.8.4",
"chrono",
"cmake",
"cudarc 0.13.9 (git+https://github.com/coreylowman/cudarc.git?rev=8c52e735b55bf8e979e1a16bd85e3dfe4f87c9fe)",
"derive-getters",
"derive_builder",
......@@ -1529,33 +1641,23 @@ dependencies = [
"galil-seiferas",
"ggus",
"hf-hub",
"indexmap 2.8.0",
"insta",
"itertools 0.14.0",
"libc",
"llama-cpp-2",
"memmap2",
"minijinja",
"minijinja-contrib",
"mistralrs",
"ndarray",
"prometheus",
"proptest",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"rand 0.9.0",
"rayon",
"regex",
"reqwest",
"rstest 0.18.2",
"rstest_reuse",
"semver",
"sentencepiece",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum 0.27.1",
"tempfile",
"thiserror 2.0.12",
......@@ -1582,6 +1684,12 @@ dependencies = [
"async-trait",
"clap",
"dialoguer",
"dynamo-engine-llamacpp",
"dynamo-engine-mistralrs",
"dynamo-engine-python",
"dynamo-engine-sglang",
"dynamo-engine-trtllm",
"dynamo-engine-vllm",
"dynamo-llm",
"dynamo-runtime",
"futures",
......@@ -5539,9 +5647,6 @@ name = "semver"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"
dependencies = [
"serde",
]
[[package]]
name = "sentencepiece"
......
......@@ -20,6 +20,7 @@ members = [
"lib/llm",
"lib/runtime",
"lib/bindings/c",
"lib/engines/*",
]
resolver = "2"
......
......@@ -19,17 +19,18 @@
[licenses]
confidence-threshold = 0.93
allow = [
"MIT-0",
"MIT",
"Apache-2.0",
"Apache-2.0 WITH LLVM-exception",
"ISC",
"0BSD",
"BSD-2-Clause",
"BSD-3-Clause",
"OpenSSL",
"Unicode-3.0",
"BSL-1.0",
"MPL-2.0",
"MIT-0"
"MPL-2.0"
]
# TODO exceptions
......
......@@ -26,22 +26,30 @@ description = "Dynamo Run CLI"
[features]
# Build with `--no-default-features` to disable these defaults
default = ["mistralrs", "vllm", "sglang"]
mistralrs = ["dynamo-llm/mistralrs"]
sglang = ["dynamo-llm/sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
vllm = ["dynamo-llm/vllm", "dep:netlink-packet-route", "dep:rtnetlink"]
# We don't include llamacpp by default until we figure out when it needs external libraries
llamacpp = ["dynamo-llm/llamacpp"]
trtllm = ["dynamo-llm/trtllm"]
python = ["dynamo-llm/python"]
cuda = ["dynamo-llm/cuda"]
metal = ["dynamo-llm/metal"]
vulkan = ["dynamo-llm/vulkan"]
default = ["mistralrs", "vllm", "sglang"]
mistralrs = ["dep:dynamo-engine-mistralrs"]
llamacpp = ["dep:dynamo-engine-llamacpp"]
vllm = ["dep:dynamo-engine-vllm", "dep:netlink-packet-route", "dep:rtnetlink"]
sglang = ["dep:dynamo-engine-sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
trtllm = ["dep:dynamo-engine-trtllm"]
python = ["dep:dynamo-engine-python"]
cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"]
metal = ["dynamo-engine-llamacpp/metal", "dynamo-engine-mistralrs/metal"]
vulkan = ["dynamo-engine-llamacpp/vulkan"]
[dependencies]
dynamo-llm = { workspace = true }
dynamo-runtime = { workspace = true }
dynamo-engine-llamacpp = { path = "../../lib/engines/llamacpp", optional = true }
dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true }
dynamo-engine-sglang = { path = "../../lib/engines/sglang", optional = true }
dynamo-engine-vllm = { path = "../../lib/engines/vllm", optional = true }
dynamo-engine-trtllm = { path = "../../lib/engines/trtllm", optional = true }
dynamo-engine-python = { path = "../../lib/engines/python", optional = true }
anyhow = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
......
......@@ -30,7 +30,6 @@ mod input;
#[cfg(any(feature = "vllm", feature = "sglang"))]
mod net;
mod opt;
mod output;
pub use opt::{Input, Output};
/// How we identify a namespace/component/endpoint URL.
......@@ -185,7 +184,7 @@ pub async fn run(
};
EngineConfig::StaticFull {
service_name: model_name,
engine: output::echo_full::make_engine_full(),
engine: dynamo_llm::engines::make_engine_full(),
}
}
Output::EchoCore => {
......@@ -197,7 +196,7 @@ pub async fn run(
card.requires_preprocessing = true;
EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine: output::echo_core::make_engine_core(),
engine: dynamo_llm::engines::make_engine_core(),
card: Box::new(card),
}
}
......@@ -215,12 +214,12 @@ pub async fn run(
};
EngineConfig::StaticFull {
service_name: model_name,
engine: dynamo_llm::engines::mistralrs::make_engine(&model_path).await?,
engine: dynamo_engine_mistralrs::make_engine(&model_path).await?,
}
}
#[cfg(feature = "sglang")]
Output::SgLang => {
use dynamo_llm::engines::sglang;
use dynamo_engine_sglang;
let Some(model_path) = model_path else {
anyhow::bail!("out=sglang requires flag --model-path=<full-path-to-model-dir>");
};
......@@ -250,7 +249,7 @@ pub async fn run(
}
}
let (engine, sglang_process) = sglang::make_engine(
let (engine, sglang_process) = dynamo_engine_sglang::make_engine(
cancel_token.clone(),
&model_path,
&sock_prefix,
......@@ -271,7 +270,6 @@ pub async fn run(
}
#[cfg(feature = "vllm")]
Output::Vllm => {
use dynamo_llm::engines::vllm;
if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
......@@ -305,7 +303,7 @@ pub async fn run(
}
if node_conf.node_rank == 0 {
// vllm multi-node only the leader runs vllm
let (engine, vllm_future) = vllm::make_leader_engine(
let (engine, vllm_future) = dynamo_engine_vllm::make_leader_engine(
cancel_token.clone(),
&model_path,
&sock_prefix,
......@@ -324,14 +322,15 @@ pub async fn run(
}
} else {
// Nodes rank > 0 only run 'ray'
let stop_future = vllm::start_follower(cancel_token.clone(), node_conf).await?;
let stop_future =
dynamo_engine_vllm::start_follower(cancel_token.clone(), node_conf).await?;
extra = Some(Box::pin(stop_future));
EngineConfig::None
}
}
#[cfg(feature = "llamacpp")]
Output::LlamaCpp => {
use dynamo_llm::engines::llamacpp;
use dynamo_engine_llamacpp;
let Some(model_path) = model_path else {
anyhow::bail!("out=llamacpp requires flag --model-path=<full-path-to-model-gguf>");
};
......@@ -343,7 +342,8 @@ pub async fn run(
"Pass --model-config so we can find the tokenizer, should be an HF checkout."
);
};
let engine = llamacpp::make_engine(cancel_token.clone(), &model_path).await?;
let engine =
dynamo_engine_llamacpp::make_engine(cancel_token.clone(), &model_path).await?;
EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine,
......@@ -352,7 +352,6 @@ pub async fn run(
}
#[cfg(feature = "trtllm")]
Output::TrtLLM => {
use dynamo_llm::engines::trtllm;
let Some(model_path) = model_path else {
anyhow::bail!("out=trtllm requires flag --model-path=<full-path-to-model-dir>");
};
......@@ -363,7 +362,10 @@ pub async fn run(
}
// Safety: Earlier we build maybe_card from model_path, which we checked right above
let card = maybe_card.clone().unwrap();
let engine = trtllm::make_engine(model_path.display(), flags.tensor_parallel_size)?;
let engine = dynamo_engine_trtllm::make_engine(
model_path.display(),
flags.tensor_parallel_size,
)?;
EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine,
......@@ -372,13 +374,13 @@ pub async fn run(
}
#[cfg(feature = "python")]
Output::PythonStr(path_str) => {
use dynamo_llm::engines::python;
let Some(model_name) = model_name else {
anyhow::bail!("Provide model service name as `--model-name <this>`");
};
let py_args = flags.as_vec(&path_str, &model_name);
let p = std::path::PathBuf::from(path_str);
let engine = python::make_string_engine(cancel_token.clone(), &p, py_args).await?;
let engine =
dynamo_engine_python::make_string_engine(cancel_token.clone(), &p, py_args).await?;
EngineConfig::StaticFull {
service_name: model_name,
engine,
......@@ -386,7 +388,6 @@ pub async fn run(
}
#[cfg(feature = "python")]
Output::PythonTok(path_str) => {
use dynamo_llm::engines::python;
let Some(card) = maybe_card.clone() else {
anyhow::bail!("Could not find tokenizer. Pass flag --model-path <path>");
};
......@@ -395,7 +396,8 @@ pub async fn run(
};
let py_args = flags.as_vec(&path_str, &model_name);
let p = std::path::PathBuf::from(path_str);
let engine = python::make_token_engine(cancel_token.clone(), &p, py_args).await?;
let engine =
dynamo_engine_python::make_token_engine(cancel_token.clone(), &p, py_args).await?;
EngineConfig::StaticCore {
service_name: model_name.clone(),
engine,
......
......@@ -51,8 +51,7 @@ fn main() -> anyhow::Result<()> {
if cfg!(feature = "sglang") {
#[cfg(feature = "sglang")]
{
use dynamo_llm::engines::sglang;
let gpu_config = sglang::MultiGPUConfig {
let gpu_config = dynamo_engine_sglang::MultiGPUConfig {
tp_size: flags.tensor_parallel_size,
tp_rank: sglang_flags.tp_rank,
gpu_id: sglang_flags.gpu_id,
......@@ -62,7 +61,7 @@ fn main() -> anyhow::Result<()> {
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return sglang::run_subprocess(
return dynamo_engine_sglang::run_subprocess(
ZMQ_SOCKET_PREFIX,
model_path,
sglang_flags.pipe_fd as std::os::fd::RawFd,
......@@ -84,13 +83,12 @@ fn main() -> anyhow::Result<()> {
if cfg!(feature = "vllm") {
#[cfg(feature = "vllm")]
{
use dynamo_llm::engines::vllm;
let node_config = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
};
return vllm::run_subprocess(
return dynamo_engine_vllm::run_subprocess(
ZMQ_SOCKET_PREFIX,
&model_path,
node_config,
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::env;
use std::sync::LazyLock;
use std::time::Duration;
/// Pause inserted between successive echoed tokens.
///
/// Defaults to 10ms, i.e. roughly 100 tokens/second. Override via the
/// `DYN_TOKEN_ECHO_DELAY_MS` environment variable (value in milliseconds).
pub static TOKEN_ECHO_DELAY: LazyLock<Duration> = LazyLock::new(|| {
    const DEFAULT_DELAY_MS: u64 = 10;
    // An unset or unparseable environment variable silently falls back to
    // the default, matching best-effort configuration behavior.
    let millis = match env::var("DYN_TOKEN_ECHO_DELAY_MS") {
        Ok(raw) => raw.parse::<u64>().unwrap_or(DEFAULT_DELAY_MS),
        Err(_) => DEFAULT_DELAY_MS,
    };
    Duration::from_millis(millis)
});
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use async_stream::stream;
use async_trait::async_trait;
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::preprocessor::BackendInput;
use dynamo_llm::protocols::common::llm_backend::LLMEngineOutput;
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use super::common::TOKEN_ECHO_DELAY;
/// Engine that accepts pre-processed requests and echos the tokens back as the response
/// The response will include the full prompt template.
/// Useful for testing pre-processing.
struct EchoEngineCore {}

/// Build the token-level echo engine, wrapped in an `Arc` so it can be
/// shared as an `ExecutionContext`.
pub fn make_engine_core() -> ExecutionContext {
    Arc::new(EchoEngineCore {})
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
    for EchoEngineCore
{
    /// Echo every input token back as its own response delta, pacing the
    /// stream by [`TOKEN_ECHO_DELAY`], then emit a final stop message.
    async fn generate(
        &self,
        incoming_request: SingleIn<BackendInput>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
        let (request, context) = incoming_request.into_parts();
        let engine_ctx = context.context();

        // Lazily yield one delta per input token; the consumer drives pacing.
        let token_stream = stream! {
            for token_id in request.token_ids {
                tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
                yield delta_core(token_id);
            }
            // Terminate the stream with an explicit stop marker.
            yield Annotated::from_data(LLMEngineOutput::stop());
        };

        Ok(ResponseStream::new(Box::pin(token_stream), engine_ctx))
    }
}
/// Wrap a single token id in an [`LLMEngineOutput`] delta that carries no
/// decoded text, log-probs, or finish reason.
fn delta_core(tok: u32) -> Annotated<LLMEngineOutput> {
    Annotated::from_data(LLMEngineOutput {
        token_ids: vec![tok],
        tokens: None,
        text: None,
        cum_log_probs: None,
        log_probs: None,
        finish_reason: None,
    })
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use async_stream::stream;
use async_trait::async_trait;
use dynamo_llm::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
};
use dynamo_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use super::common::TOKEN_ECHO_DELAY;
/// Engine that accepts un-preprocessed requests and echos the prompt back as the response
/// Useful for testing ingress such as service-http.
struct EchoEngineFull {}

/// Build the full-request echo engine: it streams the final user message
/// back one character per delta.
pub fn make_engine_full() -> OpenAIChatCompletionsStreamingEngine {
    Arc::new(EchoEngineFull {})
}
#[async_trait]
impl
    AsyncEngine<
        SingleIn<NvCreateChatCompletionRequest>,
        ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
        Error,
    > for EchoEngineFull
{
    /// Echo the last user message back, one character per streamed delta,
    /// paced by [`TOKEN_ECHO_DELAY`], then emit a final chunk with
    /// `finish_reason = Stop`.
    ///
    /// Errors if the request has no messages, the last message is not a
    /// `User` message, or its content is not plain text.
    async fn generate(
        &self,
        incoming_request: SingleIn<NvCreateChatCompletionRequest>,
    ) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
        let (request, context) = incoming_request.transfer(());
        let deltas = request.response_generator();
        let ctx = context.context();
        // Fix: previously `.unwrap()` here panicked on an empty `messages`
        // array; return a proper error instead.
        let Some(req) = request.inner.messages.into_iter().last() else {
            anyhow::bail!("Invalid request, no messages");
        };
        let prompt = match req {
            async_openai::types::ChatCompletionRequestMessage::User(user_msg) => {
                match user_msg.content {
                    async_openai::types::ChatCompletionRequestUserMessageContent::Text(prompt) => {
                        prompt
                    }
                    _ => anyhow::bail!("Invalid request content field, expected Content::Text"),
                }
            }
            _ => anyhow::bail!("Invalid request type, expected User message"),
        };
        let output = stream! {
            // Chunk ids start at 1 and increment per delta; the final stop
            // chunk reuses the next id in sequence.
            let mut id = 1;
            for c in prompt.chars() {
                // we are returning characters not tokens, so there will be some postprocessing overhead
                tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
                let inner = deltas.create_choice(0, Some(c.to_string()), None, None);
                let response = NvCreateChatCompletionStreamResponse {
                    inner,
                };
                yield Annotated{ id: Some(id.to_string()), data: Some(response), event: None, comment: None };
                id += 1;
            }
            // Terminal chunk: no content, finish_reason = Stop.
            let inner = deltas.create_choice(0, None, Some(async_openai::types::FinishReason::Stop), None);
            let response = NvCreateChatCompletionStreamResponse {
                inner,
            };
            yield Annotated { id: Some(id.to_string()), data: Some(response), event: None, comment: None };
        };
        Ok(ResponseStream::new(Box::pin(output), ctx))
    }
}
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#
# Locate library `lib` and expose it as an IMPORTED target `target_name`
# of kind `libtype` (e.g. SHARED/STATIC), searching `hints` first.
#
# The three find_library() calls form a fallback chain: find_library caches
# its result in ${lib}_LIB_PATH, and a call whose cache variable is already
# set is a no-op. Order of preference is therefore:
#   1. Debug builds only: `${lib}${TRT_DEBUG_POSTFIX}` in `hints`.
#   2. `${lib}` in `hints` (NO_DEFAULT_PATH).
#   3. `${lib}` in the default system search paths.
macro(find_library_create_target target_name lib libtype hints)
message(
STATUS
"========================= Importing and creating target ${target_name} =========================="
)
message(STATUS "Looking for library ${lib}")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
find_library(
${lib}_LIB_PATH ${lib}${TRT_DEBUG_POSTFIX}
HINTS ${hints}
NO_DEFAULT_PATH)
endif()
find_library(${lib}_LIB_PATH ${lib} HINTS ${hints} NO_DEFAULT_PATH)
find_library(${lib}_LIB_PATH ${lib})
message(STATUS "Library that was found ${${lib}_LIB_PATH}")
add_library(${target_name} ${libtype} IMPORTED)
# IMPORTED_IMPLIB is only meaningful on Windows (import library for a DLL);
# setting it unconditionally is harmless elsewhere.
set_target_properties(
${target_name} PROPERTIES IMPORTED_LOCATION ${${lib}_LIB_PATH}
IMPORTED_IMPLIB ${${lib}_LIB_PATH})
message(
STATUS
"=========================================================================================="
)
endmacro()
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#
# Assign `value` to `variable` in the caller's scope only if `variable`
# is not already defined (mirrors a C preprocessor #ifndef default).
function(set_ifndef variable value)
if(NOT DEFINED ${variable})
set(${variable}
${value}
# PARENT_SCOPE so the assignment survives this function's scope.
PARENT_SCOPE)
endif()
endfunction()
......@@ -419,26 +419,6 @@ version = "1.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89e25b6adfb930f02d1981565a6e5d9c547ac15a96606256d3b59040e5cd4ca3"
[[package]]
name = "bindgen"
version = "0.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
dependencies = [
"bitflags 2.9.0",
"cexpr",
"clang-sys",
"itertools 0.13.0",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
"rustc-hash 1.1.0",
"shlex",
"syn 2.0.100",
]
[[package]]
name = "bit-set"
version = "0.8.0"
......@@ -573,15 +553,6 @@ dependencies = [
"shlex",
]
[[package]]
name = "cexpr"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
dependencies = [
"nom",
]
[[package]]
name = "cfg-expr"
version = "0.15.8"
......@@ -623,17 +594,6 @@ dependencies = [
"windows-link",
]
[[package]]
name = "clang-sys"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
dependencies = [
"glob",
"libc",
"libloading",
]
[[package]]
name = "clap"
version = "4.5.32"
......@@ -661,15 +621,6 @@ version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
[[package]]
name = "cmake"
version = "0.1.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0"
dependencies = [
"cc",
]
[[package]]
name = "colorchoice"
version = "1.0.3"
......@@ -1054,6 +1005,27 @@ dependencies = [
"bytemuck",
]
[[package]]
name = "dynamo-engine-python"
version = "0.1.0"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"dynamo-llm",
"dynamo-runtime",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"serde",
"serde_json",
"thiserror 2.0.12",
"tokio",
"tokio-stream",
"tracing",
]
[[package]]
name = "dynamo-llm"
version = "0.1.0"
......@@ -1065,14 +1037,12 @@ dependencies = [
"async-trait",
"async_zmq",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytemuck",
"bytes",
"candle-core",
"chrono",
"cmake",
"derive-getters",
"derive_builder",
"dynamo-runtime",
......@@ -1081,24 +1051,16 @@ dependencies = [
"futures",
"galil-seiferas",
"ggus",
"indexmap 2.8.0",
"itertools 0.14.0",
"libc",
"memmap2",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"rand 0.9.0",
"rayon",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.12",
"tokenizers",
......@@ -1118,6 +1080,7 @@ dependencies = [
name = "dynamo-py3"
version = "0.1.0"
dependencies = [
"dynamo-engine-python",
"dynamo-llm",
"dynamo-runtime",
"futures",
......@@ -1838,12 +1801,6 @@ version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "glob"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
[[package]]
name = "h2"
version = "0.4.8"
......@@ -2297,12 +2254,6 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "iter-read"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071ed4cc1afd86650602c7b11aa2e1ce30762a1c27193201cb5cee9c6ebb1294"
[[package]]
name = "itertools"
version = "0.11.0"
......@@ -3367,7 +3318,7 @@ dependencies = [
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash 2.1.1",
"rustc-hash",
"rustls",
"socket2",
"thiserror 2.0.12",
......@@ -3386,7 +3337,7 @@ dependencies = [
"getrandom 0.3.2",
"rand 0.9.0",
"ring",
"rustc-hash 2.1.1",
"rustc-hash",
"rustls",
"rustls-pki-types",
"slab",
......@@ -3697,12 +3648,6 @@ version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc-hash"
version = "2.1.1"
......@@ -3913,9 +3858,6 @@ name = "semver"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"
dependencies = [
"serde",
]
[[package]]
name = "seq-macro"
......@@ -3932,19 +3874,6 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde-pickle"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b641fdc8bcf2781ee78b30c599700d64ad4f412976143e4c5d0b9df906bb4843"
dependencies = [
"byteorder",
"iter-read",
"num-bigint",
"num-traits",
"serde",
]
[[package]]
name = "serde_derive"
version = "1.0.219"
......
......@@ -35,8 +35,9 @@ crate-type = ["cdylib"]
[dependencies]
dynamo-llm = { path = "../../llm", features = ["python"] }
dynamo-llm = { path = "../../llm" }
dynamo-runtime = { path = "../../runtime" }
dynamo-engine-python = { path = "../../engines/python" }
futures = { version = "0.3" }
once_cell = { version = "1.20.3" }
......
......@@ -15,7 +15,7 @@
use std::sync::Arc;
use dynamo_llm::engines::python::PythonServerStreamingEngine;
use dynamo_engine_python::PythonServerStreamingEngine;
use dynamo_runtime::CancellationToken;
pub use dynamo_runtime::{
pipeline::{async_trait, AsyncEngine, Data, ManyOut, SingleIn},
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-llamacpp"
# All package metadata is inherited from the workspace root.
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true

[features]
# CPU-only by default; enable exactly one accelerator feature below.
default = []
# Each feature forwards to the matching llama-cpp-2 build flag.
cuda = ["llama-cpp-2/cuda"]
metal = ["llama-cpp-2/metal"]
vulkan = ["llama-cpp-2/vulkan"]

[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
async-stream = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
# Pinned here rather than in the workspace: only this crate builds llama.cpp.
# Building it requires clang for bindgen (see this crate's README).
llama-cpp-2 = { version = "0.1.102" }
# llamacpp engine for dynamo
The [`llama-cpp-2`](https://crates.io/crates/llama-cpp-2) crate, used to build this Dynamo backend,
requires clang to be installed on your system. See the [bindgen user guide](https://rust-lang.github.io/rust-bindgen/requirements.html)
for more details.
......@@ -19,14 +19,12 @@ use std::{
sync::{Arc, Mutex, OnceLock},
};
use anyhow::Context;
use async_stream::stream;
use async_trait::async_trait;
use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynamo_runtime::pipeline::error as pipeline_error;
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::pipeline::{async_trait, Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use dynamo_runtime::CancellationToken;
use dynamo_runtime::{CancellationToken, ErrorContext, Result};
use llama_cpp_2::{
context::{params::LlamaContextParams, LlamaContext},
llama_backend::LlamaBackend,
......@@ -36,9 +34,9 @@ use llama_cpp_2::{
token::LlamaToken,
};
use crate::backend::ExecutionContext;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use crate::protocols::common::preprocessor::PreprocessedRequest;
use dynamo_llm::backend::ExecutionContext;
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_llm::protocols::common::preprocessor::PreprocessedRequest;
/// If user does not provide a max_tokens limit prompt+output to this many
const DEFAULT_MAX_TOKENS: u32 = 8192;
......@@ -113,7 +111,7 @@ impl LlamacppEngine {
}
}
fn load_model(backend: &LlamaBackend, model_path: &Path) -> anyhow::Result<LlamaModel> {
fn load_model(backend: &LlamaBackend, model_path: &Path) -> Result<LlamaModel> {
let model_params = {
if cfg!(any(feature = "cuda", feature = "vulkan")) {
LlamaModelParams::default().with_n_gpu_layers(1000)
......@@ -212,7 +210,7 @@ fn run_request(
cancel_token: CancellationToken,
work_request: WorkRequest,
llama_context: &mut ContextWrapper,
) -> anyhow::Result<()> {
) -> Result<()> {
let tokens_list: Vec<LlamaToken> = work_request
.request
.token_ids
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "dynamo-engine-mistralrs"
# All package metadata is inherited from the workspace root.
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true

[features]
# CPU-only by default; enable one accelerator feature below.
default = []
# CUDA must be enabled on both mistralrs and candle-core so the two agree
# on the device backend.
cuda = ["mistralrs/cuda", "candle-core/cuda"]
metal = ["mistralrs/metal"]

[dependencies]
dynamo-runtime = { workspace = true }
dynamo-llm = { workspace = true }
anyhow = { workspace = true }
async-openai = "0.27.2"
async-stream = { workspace = true }
async-trait = { workspace = true }
candle-core = { version = "0.8.0" }
either = { workspace = true }
indexmap = { version = "2.6" }
# Git dependency pinned to a fixed rev for reproducible builds. Note that a
# git dependency prevents publishing this crate to crates.io.
mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", rev = "aaafc2ef" }
serde_json = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
......@@ -34,10 +34,10 @@ use dynamo_runtime::pipeline::error as pipeline_error;
use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use crate::protocols::openai::chat_completions::{
use dynamo_llm::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
};
use crate::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
use dynamo_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
/// How many requests mistral will run at once in the paged attention scheduler.
/// It actually runs 1 fewer than this.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment