Commit 1af7433b authored by Neelay Shah, committed by GitHub
Browse files

refactor: rename triton_distributed to dynemo (#22)


Co-authored-by: Graham King <grahamk@nvidia.com>
parent ee4ef06b
...@@ -13,12 +13,7 @@ ...@@ -13,12 +13,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use futures::StreamExt; use dynemo_llm::{
use std::{
io::{ErrorKind, Read, Write},
sync::Arc,
};
use triton_distributed_llm::{
backend::Backend, backend::Backend,
preprocessor::OpenAIPreprocessor, preprocessor::OpenAIPreprocessor,
types::{ types::{
...@@ -29,10 +24,15 @@ use triton_distributed_llm::{ ...@@ -29,10 +24,15 @@ use triton_distributed_llm::{
Annotated, Annotated,
}, },
}; };
use triton_distributed_runtime::{ use dynemo_runtime::{
pipeline::{Context, ManyOut, Operator, ServiceBackend, ServiceFrontend, SingleIn, Source}, pipeline::{Context, ManyOut, Operator, ServiceBackend, ServiceFrontend, SingleIn, Source},
runtime::CancellationToken, runtime::CancellationToken,
}; };
use futures::StreamExt;
use std::{
io::{ErrorKind, Read, Write},
sync::Arc,
};
use crate::EngineConfig; use crate::EngineConfig;
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#[cfg(any(feature = "vllm", feature = "sglang"))] #[cfg(any(feature = "vllm", feature = "sglang"))]
use std::{future::Future, pin::Pin}; use std::{future::Future, pin::Pin};
use triton_distributed_llm::{ use dynemo_llm::{
backend::ExecutionContext, backend::ExecutionContext,
model_card::model::ModelDeploymentCard, model_card::model::ModelDeploymentCard,
types::{ types::{
...@@ -27,7 +27,7 @@ use triton_distributed_llm::{ ...@@ -27,7 +27,7 @@ use triton_distributed_llm::{
Annotated, Annotated,
}, },
}; };
use triton_distributed_runtime::{component::Client, protocols::Endpoint, DistributedRuntime}; use dynemo_runtime::{component::Client, protocols::Endpoint, DistributedRuntime};
mod flags; mod flags;
pub use flags::Flags; pub use flags::Flags;
...@@ -67,7 +67,7 @@ pub enum EngineConfig { ...@@ -67,7 +67,7 @@ pub enum EngineConfig {
#[allow(unused_mut)] #[allow(unused_mut)]
pub async fn run( pub async fn run(
runtime: triton_distributed_runtime::Runtime, runtime: dynemo_runtime::Runtime,
mut in_opt: Input, // mut because vllm and sglang multi-node can change it mut in_opt: Input, // mut because vllm and sglang multi-node can change it
out_opt: Output, out_opt: Output,
flags: Flags, flags: Flags,
...@@ -173,13 +173,12 @@ pub async fn run( ...@@ -173,13 +173,12 @@ pub async fn run(
}; };
EngineConfig::StaticFull { EngineConfig::StaticFull {
service_name: model_name, service_name: model_name,
engine: triton_distributed_llm::engines::mistralrs::make_engine(&model_path) engine: dynemo_llm::engines::mistralrs::make_engine(&model_path).await?,
.await?,
} }
} }
#[cfg(feature = "sglang")] #[cfg(feature = "sglang")]
Output::SgLang => { Output::SgLang => {
use triton_distributed_llm::engines::sglang; use dynemo_llm::engines::sglang;
let Some(model_path) = model_path else { let Some(model_path) = model_path else {
anyhow::bail!("out=sglang requires flag --model-path=<full-path-to-model-dir>"); anyhow::bail!("out=sglang requires flag --model-path=<full-path-to-model-dir>");
}; };
...@@ -191,7 +190,7 @@ pub async fn run( ...@@ -191,7 +190,7 @@ pub async fn run(
let Some(sock_prefix) = zmq_socket_prefix else { let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("sglang requires zmq_socket_prefix"); anyhow::bail!("sglang requires zmq_socket_prefix");
}; };
let node_conf = triton_distributed_llm::engines::MultiNodeConfig { let node_conf = dynemo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes, num_nodes: flags.num_nodes,
node_rank: flags.node_rank, node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(), leader_addr: flags.leader_addr.unwrap_or_default(),
...@@ -229,7 +228,7 @@ pub async fn run( ...@@ -229,7 +228,7 @@ pub async fn run(
} }
#[cfg(feature = "vllm")] #[cfg(feature = "vllm")]
Output::Vllm => { Output::Vllm => {
use triton_distributed_llm::engines::vllm; use dynemo_llm::engines::vllm;
if flags.base_gpu_id != 0 { if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead."); anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
} }
...@@ -253,7 +252,7 @@ pub async fn run( ...@@ -253,7 +252,7 @@ pub async fn run(
let Some(sock_prefix) = zmq_socket_prefix else { let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("vllm requires zmq_socket_prefix"); anyhow::bail!("vllm requires zmq_socket_prefix");
}; };
let node_conf = triton_distributed_llm::engines::MultiNodeConfig { let node_conf = dynemo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes, num_nodes: flags.num_nodes,
node_rank: flags.node_rank, node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(), leader_addr: flags.leader_addr.unwrap_or_default(),
...@@ -296,7 +295,7 @@ pub async fn run( ...@@ -296,7 +295,7 @@ pub async fn run(
} }
#[cfg(feature = "llamacpp")] #[cfg(feature = "llamacpp")]
Output::LlamaCpp => { Output::LlamaCpp => {
use triton_distributed_llm::engines::llamacpp; use dynemo_llm::engines::llamacpp;
let Some(model_path) = model_path else { let Some(model_path) = model_path else {
anyhow::bail!("out=llamacpp requires flag --model-path=<full-path-to-model-gguf>"); anyhow::bail!("out=llamacpp requires flag --model-path=<full-path-to-model-gguf>");
}; };
...@@ -317,7 +316,7 @@ pub async fn run( ...@@ -317,7 +316,7 @@ pub async fn run(
} }
#[cfg(feature = "trtllm")] #[cfg(feature = "trtllm")]
Output::TrtLLM => { Output::TrtLLM => {
use triton_distributed_llm::engines::trtllm; use dynemo_llm::engines::trtllm;
let Some(model_path) = model_path else { let Some(model_path) = model_path else {
anyhow::bail!("out=trtllm requires flag --model-path=<full-path-to-model-dir>"); anyhow::bail!("out=trtllm requires flag --model-path=<full-path-to-model-dir>");
}; };
......
...@@ -18,7 +18,7 @@ use std::env; ...@@ -18,7 +18,7 @@ use std::env;
use clap::Parser; use clap::Parser;
use dynemo_run::{Input, Output}; use dynemo_run::{Input, Output};
use triton_distributed_runtime::logging; use dynemo_runtime::logging;
const HELP: &str = r#" const HELP: &str = r#"
dynemo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynemo locally. dynemo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynemo locally.
...@@ -60,13 +60,13 @@ fn main() -> anyhow::Result<()> { ...@@ -60,13 +60,13 @@ fn main() -> anyhow::Result<()> {
if cfg!(feature = "sglang") { if cfg!(feature = "sglang") {
#[cfg(feature = "sglang")] #[cfg(feature = "sglang")]
{ {
use triton_distributed_llm::engines::sglang; use dynemo_llm::engines::sglang;
let gpu_config = sglang::MultiGPUConfig { let gpu_config = sglang::MultiGPUConfig {
tp_size: flags.tensor_parallel_size, tp_size: flags.tensor_parallel_size,
tp_rank: sglang_flags.tp_rank, tp_rank: sglang_flags.tp_rank,
gpu_id: sglang_flags.gpu_id, gpu_id: sglang_flags.gpu_id,
}; };
let node_config = triton_distributed_llm::engines::MultiNodeConfig { let node_config = dynemo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes, num_nodes: flags.num_nodes,
node_rank: flags.node_rank, node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(), leader_addr: flags.leader_addr.unwrap_or_default(),
...@@ -98,8 +98,8 @@ fn main() -> anyhow::Result<()> { ...@@ -98,8 +98,8 @@ fn main() -> anyhow::Result<()> {
if cfg!(feature = "vllm") { if cfg!(feature = "vllm") {
#[cfg(feature = "vllm")] #[cfg(feature = "vllm")]
{ {
use triton_distributed_llm::engines::vllm; use dynemo_llm::engines::vllm;
let node_config = triton_distributed_llm::engines::MultiNodeConfig { let node_config = dynemo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes, num_nodes: flags.num_nodes,
node_rank: flags.node_rank, node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(), leader_addr: flags.leader_addr.unwrap_or_default(),
...@@ -119,15 +119,15 @@ fn main() -> anyhow::Result<()> { ...@@ -119,15 +119,15 @@ fn main() -> anyhow::Result<()> {
} }
// max_worker_threads and max_blocking_threads from env vars or config file. // max_worker_threads and max_blocking_threads from env vars or config file.
let rt_config = triton_distributed_runtime::RuntimeConfig::from_settings()?; let rt_config = dynemo_runtime::RuntimeConfig::from_settings()?;
// One per process. Wraps a Runtime which holds two tokio runtimes. // One per process. Wraps a Runtime which holds two tokio runtimes.
let worker = triton_distributed_runtime::Worker::from_config(rt_config)?; let worker = dynemo_runtime::Worker::from_config(rt_config)?;
worker.execute(wrapper) worker.execute(wrapper)
} }
async fn wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> { async fn wrapper(runtime: dynemo_runtime::Runtime) -> anyhow::Result<()> {
let mut in_opt = None; let mut in_opt = None;
let mut out_opt = None; let mut out_opt = None;
let args: Vec<String> = env::args().skip(1).collect(); let args: Vec<String> = env::args().skip(1).collect();
......
...@@ -18,12 +18,12 @@ use std::{sync::Arc, time::Duration}; ...@@ -18,12 +18,12 @@ use std::{sync::Arc, time::Duration};
use async_stream::stream; use async_stream::stream;
use async_trait::async_trait; use async_trait::async_trait;
use triton_distributed_llm::backend::ExecutionContext; use dynemo_llm::backend::ExecutionContext;
use triton_distributed_llm::preprocessor::BackendInput; use dynemo_llm::preprocessor::BackendInput;
use triton_distributed_llm::protocols::common::llm_backend::LLMEngineOutput; use dynemo_llm::protocols::common::llm_backend::LLMEngineOutput;
use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream}; use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn}; use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn};
use triton_distributed_runtime::protocols::annotated::Annotated; use dynemo_runtime::protocols::annotated::Annotated;
/// How long to sleep between echoed tokens. /// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s. /// 50ms gives us 20 tok/s.
......
...@@ -18,13 +18,13 @@ use std::{sync::Arc, time::Duration}; ...@@ -18,13 +18,13 @@ use std::{sync::Arc, time::Duration};
use async_stream::stream; use async_stream::stream;
use async_trait::async_trait; use async_trait::async_trait;
use triton_distributed_llm::protocols::openai::chat_completions::{ use dynemo_llm::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse, NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
}; };
use triton_distributed_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine; use dynemo_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream}; use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn}; use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn};
use triton_distributed_runtime::protocols::annotated::Annotated; use dynemo_runtime::protocols::annotated::Annotated;
/// How long to sleep between echoed tokens. /// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s. /// 50ms gives us 20 tok/s.
......
...@@ -954,6 +954,99 @@ dependencies = [ ...@@ -954,6 +954,99 @@ dependencies = [
"syn 2.0.96", "syn 2.0.96",
] ]
[[package]]
name = "dynemo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"dynemo-runtime",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.0",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "dynemo-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]] [[package]]
name = "ed25519" name = "ed25519"
version = "2.2.3" version = "2.2.3"
...@@ -1853,6 +1946,27 @@ version = "0.2.169" ...@@ -1853,6 +1946,27 @@ version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "libdynemo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-once-cell",
"cbindgen",
"dynemo-llm",
"dynemo-runtime",
"futures",
"libc",
"once_cell",
"serde",
"serde_json",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
"uuid",
]
[[package]] [[package]]
name = "libloading" name = "libloading"
version = "0.8.6" version = "0.8.6"
...@@ -1873,27 +1987,6 @@ dependencies = [ ...@@ -1873,27 +1987,6 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "libtriton-distributed-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-once-cell",
"cbindgen",
"futures",
"libc",
"once_cell",
"serde",
"serde_json",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
"triton-distributed-llm",
"triton-distributed-runtime",
"uuid",
]
[[package]] [[package]]
name = "linux-raw-sys" name = "linux-raw-sys"
version = "0.4.15" version = "0.4.15"
...@@ -3955,99 +4048,6 @@ dependencies = [ ...@@ -3955,99 +4048,6 @@ dependencies = [
"tracing-serde", "tracing-serde",
] ]
[[package]]
name = "triton-distributed-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.0",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"triton-distributed-runtime",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "triton-distributed-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]] [[package]]
name = "try-lock" name = "try-lock"
version = "0.2.5" version = "0.2.5"
......
...@@ -14,24 +14,24 @@ ...@@ -14,24 +14,24 @@
# limitations under the License. # limitations under the License.
[package] [package]
name = "libtriton-distributed-llm" name = "libdynemo-llm"
version = "0.2.1" version = "0.2.1"
edition = "2021" edition = "2021"
authors = ["NVIDIA"] authors = ["NVIDIA"]
license = "Apache-2.0" license = "Apache-2.0"
homepage = "https://github.com/triton-inference-server/triton_distributed" homepage = "https://github.com/dynemo-ai/dynemo"
repository = "https://github.com/triton-inference-server/triton_distributed" repository = "https://github.com/dynemo-ai/dynemo.git"
[lib] [lib]
name = "triton_distributed_llm_capi" name = "dynemo_llm_capi"
crate-type = ["cdylib"] crate-type = ["cdylib"]
[build-dependencies] [build-dependencies]
cbindgen = "0.27" cbindgen = "0.27"
[dependencies] [dependencies]
triton-distributed-llm = { path = "../../llm" } dynemo-llm = { path = "../../llm" }
triton-distributed-runtime = { path = "../../runtime" } dynemo-runtime = { path = "../../runtime" }
anyhow = { version = "1" } anyhow = { version = "1" }
futures = "0.3" futures = "0.3"
......
...@@ -22,7 +22,7 @@ fn main() { ...@@ -22,7 +22,7 @@ fn main() {
let header_path = Path::new(&crate_dir) let header_path = Path::new(&crate_dir)
.join("include") .join("include")
.join("nvidia") .join("nvidia")
.join("triton_llm") .join("dynemo_llm")
.join("llm_engine.h"); .join("llm_engine.h");
cbindgen::generate(crate_dir) cbindgen::generate(crate_dir)
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
language = "C++" language = "C++"
cpp_compat = true cpp_compat = true
include_guard = "__NVIDIA_TRITON_LLM_API__" include_guard = "__NVIDIA_DYNEMO_LLM_API__"
[enum] [enum]
...@@ -25,7 +25,7 @@ enum_class = false ...@@ -25,7 +25,7 @@ enum_class = false
[export] [export]
include = ["TritonLlmResult", "triton_llm_init", "triton_llm_shutdown"] include = ["DynemoLlmResult", "dynemo_llm_init", "dynemo_llm_shutdown"]
[export.rename] [export.rename]
"TritonLlmResult" = "triton_llm_result_t" "DynemoLlmResult" = "dynemo_llm_result_t"
...@@ -19,10 +19,10 @@ use once_cell::sync::OnceCell; ...@@ -19,10 +19,10 @@ use once_cell::sync::OnceCell;
use std::ffi::CStr; use std::ffi::CStr;
use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::atomic::{AtomicU32, Ordering};
use triton_distributed_llm::kv_router::{ use dynemo_llm::kv_router::{
indexer::compute_block_hash_for_seq, protocols::*, publisher::KvEventPublisher, indexer::compute_block_hash_for_seq, protocols::*, publisher::KvEventPublisher,
}; };
use triton_distributed_runtime::{DistributedRuntime, Worker}; use dynemo_runtime::{DistributedRuntime, Worker};
static WK: OnceCell<Worker> = OnceCell::new(); static WK: OnceCell<Worker> = OnceCell::new();
static DRT: AsyncOnceCell<DistributedRuntime> = AsyncOnceCell::new(); static DRT: AsyncOnceCell<DistributedRuntime> = AsyncOnceCell::new();
// [FIXME] shouldn't the publisher be an instance passed between API calls? // [FIXME] shouldn't the publisher be an instance passed between API calls?
...@@ -41,7 +41,7 @@ fn initialize_tracing() { ...@@ -41,7 +41,7 @@ fn initialize_tracing() {
} }
#[repr(u32)] #[repr(u32)]
pub enum TritonLlmResult { pub enum DynemoLlmResult {
OK = 0, OK = 0,
ERR = 1, ERR = 1,
} }
...@@ -49,17 +49,17 @@ pub enum TritonLlmResult { ...@@ -49,17 +49,17 @@ pub enum TritonLlmResult {
/// # Safety /// # Safety
/// the namespace_c_str and component_c_str are passed as pointers to C strings /// the namespace_c_str and component_c_str are passed as pointers to C strings
#[no_mangle] #[no_mangle]
pub unsafe extern "C" fn triton_llm_init( pub unsafe extern "C" fn dynemo_llm_init(
namespace_c_str: *const c_char, namespace_c_str: *const c_char,
component_c_str: *const c_char, component_c_str: *const c_char,
worker_id: i64, worker_id: i64,
) -> TritonLlmResult { ) -> DynemoLlmResult {
initialize_tracing(); initialize_tracing();
let wk = match WK.get_or_try_init(Worker::from_settings) { let wk = match WK.get_or_try_init(Worker::from_settings) {
Ok(wk) => wk.clone(), Ok(wk) => wk.clone(),
Err(e) => { Err(e) => {
eprintln!("Failed to initialize runtime: {:?}", e); eprintln!("Failed to initialize runtime: {:?}", e);
return TritonLlmResult::ERR; return DynemoLlmResult::ERR;
} }
}; };
let rt = wk.runtime(); let rt = wk.runtime();
...@@ -73,7 +73,7 @@ pub unsafe extern "C" fn triton_llm_init( ...@@ -73,7 +73,7 @@ pub unsafe extern "C" fn triton_llm_init(
Ok(_) => Ok(()), Ok(_) => Ok(()),
Err(e) => { Err(e) => {
eprintln!("Failed to initialize distributed runtime: {:?}", e); eprintln!("Failed to initialize distributed runtime: {:?}", e);
Err(TritonLlmResult::ERR) Err(DynemoLlmResult::ERR)
} }
} }
}); });
...@@ -81,7 +81,7 @@ pub unsafe extern "C" fn triton_llm_init( ...@@ -81,7 +81,7 @@ pub unsafe extern "C" fn triton_llm_init(
Ok(s) => s.to_string(), Ok(s) => s.to_string(),
Err(e) => { Err(e) => {
eprintln!("Failed to convert C string to Rust string: {:?}", e); eprintln!("Failed to convert C string to Rust string: {:?}", e);
return TritonLlmResult::ERR; return DynemoLlmResult::ERR;
} }
}; };
...@@ -89,18 +89,18 @@ pub unsafe extern "C" fn triton_llm_init( ...@@ -89,18 +89,18 @@ pub unsafe extern "C" fn triton_llm_init(
Ok(s) => s.to_string(), Ok(s) => s.to_string(),
Err(e) => { Err(e) => {
eprintln!("Failed to convert C string to Rust string: {:?}", e); eprintln!("Failed to convert C string to Rust string: {:?}", e);
return TritonLlmResult::ERR; return DynemoLlmResult::ERR;
} }
}; };
match result { match result {
Ok(_) => match KV_PUB Ok(_) => match KV_PUB
.get_or_try_init(move || triton_create_kv_publisher(namespace, component, worker_id)) .get_or_try_init(move || dynemo_create_kv_publisher(namespace, component, worker_id))
{ {
Ok(_) => TritonLlmResult::OK, Ok(_) => DynemoLlmResult::OK,
Err(e) => { Err(e) => {
eprintln!("Failed to initialize distributed runtime: {:?}", e); eprintln!("Failed to initialize distributed runtime: {:?}", e);
TritonLlmResult::ERR DynemoLlmResult::ERR
} }
}, },
Err(e) => e, Err(e) => e,
...@@ -108,33 +108,33 @@ pub unsafe extern "C" fn triton_llm_init( ...@@ -108,33 +108,33 @@ pub unsafe extern "C" fn triton_llm_init(
} }
#[no_mangle] #[no_mangle]
pub extern "C" fn triton_llm_shutdown() -> TritonLlmResult { pub extern "C" fn dynemo_llm_shutdown() -> DynemoLlmResult {
let wk = match WK.get() { let wk = match WK.get() {
Some(wk) => wk, Some(wk) => wk,
None => { None => {
eprintln!("Runtime not initialized"); eprintln!("Runtime not initialized");
return TritonLlmResult::ERR; return DynemoLlmResult::ERR;
} }
}; };
wk.runtime().shutdown(); wk.runtime().shutdown();
TritonLlmResult::OK DynemoLlmResult::OK
} }
#[no_mangle] #[no_mangle]
pub extern "C" fn triton_llm_load_publisher_create() -> TritonLlmResult { pub extern "C" fn dynemo_llm_load_publisher_create() -> DynemoLlmResult {
TritonLlmResult::OK DynemoLlmResult::OK
} }
// instantiate a kv publisher // instantiate a kv publisher
// this will bring up the task to publish and the channels to await publishing events // this will bring up the task to publish and the channels to await publishing events
// the [`triton_kv_publish_store_event`] call will use a handle to the publisher to send events // the [`dynemo_kv_publish_store_event`] call will use a handle to the publisher to send events
// store and the [`triton_kv_event_create_removed`] will create remove events // store and the [`dynemo_kv_event_create_removed`] will create remove events
// these calls must be driven by external c++ threads that are consuming the kv events from the // these calls must be driven by external c++ threads that are consuming the kv events from the
// c++ executor api // c++ executor api
fn triton_create_kv_publisher( fn dynemo_create_kv_publisher(
namespace: String, namespace: String,
component: String, component: String,
worker_id: i64, worker_id: i64,
...@@ -238,7 +238,7 @@ fn kv_event_create_removed_from_parts( ...@@ -238,7 +238,7 @@ fn kv_event_create_removed_from_parts(
/// parent_hash is passed as pointer to indicate whether the blocks /// parent_hash is passed as pointer to indicate whether the blocks
/// has a parent hash or not. nullptr is used to represent no parent hash /// has a parent hash or not. nullptr is used to represent no parent hash
#[no_mangle] #[no_mangle]
pub unsafe extern "C" fn triton_kv_event_publish_stored( pub unsafe extern "C" fn dynemo_kv_event_publish_stored(
event_id: u64, event_id: u64,
token_ids: *const u32, token_ids: *const u32,
num_block_tokens: *const usize, num_block_tokens: *const usize,
...@@ -246,7 +246,7 @@ pub unsafe extern "C" fn triton_kv_event_publish_stored( ...@@ -246,7 +246,7 @@ pub unsafe extern "C" fn triton_kv_event_publish_stored(
num_blocks: usize, num_blocks: usize,
parent_hash: *const u64, parent_hash: *const u64,
lora_id: u64, lora_id: u64,
) -> TritonLlmResult { ) -> DynemoLlmResult {
let publisher = KV_PUB.get().unwrap(); let publisher = KV_PUB.get().unwrap();
let parent_hash = { let parent_hash = {
if parent_hash.is_null() { if parent_hash.is_null() {
...@@ -265,40 +265,40 @@ pub unsafe extern "C" fn triton_kv_event_publish_stored( ...@@ -265,40 +265,40 @@ pub unsafe extern "C" fn triton_kv_event_publish_stored(
lora_id, lora_id,
); );
match publisher.publish(event) { match publisher.publish(event) {
Ok(_) => TritonLlmResult::OK, Ok(_) => DynemoLlmResult::OK,
Err(e) => { Err(e) => {
eprintln!("Error publishing stored kv event {:?}", e); eprintln!("Error publishing stored kv event {:?}", e);
TritonLlmResult::ERR DynemoLlmResult::ERR
} }
} }
} }
#[no_mangle] #[no_mangle]
pub extern "C" fn triton_kv_event_publish_removed( pub extern "C" fn dynemo_kv_event_publish_removed(
event_id: u64, event_id: u64,
block_ids: *const u64, block_ids: *const u64,
num_blocks: usize, num_blocks: usize,
) -> TritonLlmResult { ) -> DynemoLlmResult {
let publisher = KV_PUB.get().unwrap(); let publisher = KV_PUB.get().unwrap();
let event = kv_event_create_removed_from_parts(event_id, block_ids, num_blocks); let event = kv_event_create_removed_from_parts(event_id, block_ids, num_blocks);
match publisher.publish(event) { match publisher.publish(event) {
Ok(_) => TritonLlmResult::OK, Ok(_) => DynemoLlmResult::OK,
Err(e) => { Err(e) => {
eprintln!("Error publishing removed kv event {:?}", e); eprintln!("Error publishing removed kv event {:?}", e);
TritonLlmResult::ERR DynemoLlmResult::ERR
} }
} }
} }
// #[no_mangle] // #[no_mangle]
// pub extern "C" fn triton_kv_publish_store_event( // pub extern "C" fn dynemo_kv_publish_store_event(
// event_id: u64, // event_id: u64,
// token_ids: *const u32, // token_ids: *const u32,
// num_tokens: usize, // num_tokens: usize,
// lora_id: u64, // lora_id: u64,
// ) -> TritonLlmResult { // ) -> DynemoLlmResult {
// // if event.is_null() || token_ids.is_null() { // // if event.is_null() || token_ids.is_null() {
// // return tritonKvErrorType::INVALID_TOKEN_IDS; // // return dynemoKvErrorType::INVALID_TOKEN_IDS;
// // } // // }
// // let tokens = unsafe { std::slice::from_raw_parts(token_ids, num_tokens) }.to_vec(); // // let tokens = unsafe { std::slice::from_raw_parts(token_ids, num_tokens) }.to_vec();
...@@ -311,15 +311,15 @@ pub extern "C" fn triton_kv_event_publish_removed( ...@@ -311,15 +311,15 @@ pub extern "C" fn triton_kv_event_publish_removed(
// // unsafe { *event = Box::into_raw(new_event) }; // // unsafe { *event = Box::into_raw(new_event) };
// TritonLlmResult::OK // DynemoLlmResult::OK
// } // }
// #[no_mangle] // #[no_mangle]
// pub extern "C" fn triton_kv_event_create_removed( // pub extern "C" fn dynemo_kv_event_create_removed(
// event_id: u64, // event_id: u64,
// block_hashes: *const u64, // block_hashes: *const u64,
// num_hashes: usize, // num_hashes: usize,
// ) -> TritonLlmResult { // ) -> DynemoLlmResult {
// // if event.is_null() || block_hashes.is_null() { // // if event.is_null() || block_hashes.is_null() {
// // return -1; // // return -1;
// // } // // }
...@@ -334,19 +334,19 @@ pub extern "C" fn triton_kv_event_publish_removed( ...@@ -334,19 +334,19 @@ pub extern "C" fn triton_kv_event_publish_removed(
// // unsafe { *event = Box::into_raw(new_event) }; // // unsafe { *event = Box::into_raw(new_event) };
// // 0 // // 0
// TritonLlmResult::OK // DynemoLlmResult::OK
// } // }
// /// create load publisher object and return a handle // /// create load publisher object and return a handle
// /// load publisher will instantiate the nats service and tie its stats handler to // /// load publisher will instantiate the nats service and tie its stats handler to
// /// a watch channel receiver. the watch channel sender will be attached to the // /// a watch channel receiver. the watch channel sender will be attached to the
// /// handle and calls to [`triton_load_stats_publish`] issue the stats to the watch t // /// handle and calls to [`dynemo_load_stats_publish`] issue the stats to the watch t
// pub extern "C" fn triton_load_publisher_create() -> *mut LoadPublisher { // pub extern "C" fn dynemo_load_publisher_create() -> *mut LoadPublisher {
// // let publisher = Box::new(LoadPublisher::new()); // // let publisher = Box::new(LoadPublisher::new());
// // Box::into_raw(publisher) // // Box::into_raw(publisher)
// } // }
// pub extern "C" fn triton_load_stats_publish( // pub extern "C" fn dynemo_load_stats_publish(
// publisher: *mut LoadPublisher, // publisher: *mut LoadPublisher,
// active_slots: u64, // active_slots: u64,
// total_slots: u64, // total_slots: u64,
......
/target /target
python/triton_distributed/*.so python/dynemo/.*.so
...@@ -956,6 +956,119 @@ dependencies = [ ...@@ -956,6 +956,119 @@ dependencies = [
"syn 2.0.98", "syn 2.0.98",
] ]
[[package]]
name = "dynemo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"dynemo-runtime",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "dynemo-py3"
version = "0.2.1"
dependencies = [
"dynemo-llm",
"dynemo-runtime",
"futures",
"once_cell",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"serde",
"serde_json",
"thiserror 2.0.11",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
]
[[package]]
name = "dynemo-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]] [[package]]
name = "ed25519" name = "ed25519"
version = "2.2.3" version = "2.2.3"
...@@ -4004,119 +4117,6 @@ dependencies = [ ...@@ -4004,119 +4117,6 @@ dependencies = [
"tracing-serde", "tracing-serde",
] ]
[[package]]
name = "triton-distributed-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"triton-distributed-runtime",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "triton-distributed-py3"
version = "0.2.1"
dependencies = [
"futures",
"once_cell",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"serde",
"serde_json",
"thiserror 2.0.11",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
"triton-distributed-llm",
"triton-distributed-runtime",
]
[[package]]
name = "triton-distributed-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]] [[package]]
name = "try-lock" name = "try-lock"
version = "0.2.5" version = "0.2.5"
......
...@@ -14,13 +14,13 @@ ...@@ -14,13 +14,13 @@
# limitations under the License. # limitations under the License.
[package] [package]
name = "triton-distributed-py3" name = "dynemo-py3"
version = "0.2.1" version = "0.2.1"
edition = "2021" edition = "2021"
authors = ["NVIDIA"] authors = ["NVIDIA"]
license = "Apache-2.0" license = "Apache-2.0"
homepage = "https://github.com/triton-inference-server/triton_distributed" homepage = "https://github.com/dynemo-ai/dynemo"
repository = "https://github.com/triton-inference-server/triton_distributed" repository = "https://github.com/dynemo-ai/dynemo.git"
[lib] [lib]
path = "rust/lib.rs" path = "rust/lib.rs"
...@@ -30,8 +30,8 @@ crate-type = ["cdylib"] ...@@ -30,8 +30,8 @@ crate-type = ["cdylib"]
[dependencies] [dependencies]
triton-distributed-llm = { path = "../../llm" } dynemo-llm = { path = "../../llm" }
triton-distributed-runtime = { path = "../../runtime" } dynemo-runtime = { path = "../../runtime" }
futures = "0.3" futures = "0.3"
once_cell = "1.20.3" once_cell = "1.20.3"
......
...@@ -41,7 +41,7 @@ source .venv/bin/activate ...@@ -41,7 +41,7 @@ source .venv/bin/activate
uv pip install maturin uv pip install maturin
``` ```
4. Build and install triton_distributed wheel 4. Build and install dynemo wheel
``` ```
maturin develop --uv maturin develop --uv
``` ```
......
...@@ -17,7 +17,7 @@ import asyncio ...@@ -17,7 +17,7 @@ import asyncio
import uvloop import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker from dynemo.runtime import DistributedRuntime, dynemo_worker
uvloop.install() uvloop.install()
...@@ -29,7 +29,7 @@ class RequestHandler: ...@@ -29,7 +29,7 @@ class RequestHandler:
yield char yield char
@triton_worker() @dynemo_worker()
async def worker(runtime: DistributedRuntime): async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/bls").component("bar") component = runtime.namespace("examples/bls").component("bar")
await component.create_service() await component.create_service()
......
...@@ -17,12 +17,12 @@ import asyncio ...@@ -17,12 +17,12 @@ import asyncio
import uvloop import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker from dynemo.runtime import DistributedRuntime, dynemo_worker
uvloop.install() uvloop.install()
@triton_worker() @dynemo_worker()
async def worker(runtime: DistributedRuntime): async def worker(runtime: DistributedRuntime):
foo = ( foo = (
await runtime.namespace("examples/bls") await runtime.namespace("examples/bls")
......
...@@ -17,7 +17,7 @@ import asyncio ...@@ -17,7 +17,7 @@ import asyncio
import uvloop import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker from dynemo.runtime import DistributedRuntime, dynemo_worker
uvloop.install() uvloop.install()
...@@ -28,7 +28,7 @@ class RequestHandler: ...@@ -28,7 +28,7 @@ class RequestHandler:
yield char yield char
@triton_worker() @dynemo_worker()
async def worker(runtime: DistributedRuntime): async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/bls").component("foo") component = runtime.namespace("examples/bls").component("foo")
await component.create_service() await component.create_service()
......
...@@ -17,12 +17,12 @@ import asyncio ...@@ -17,12 +17,12 @@ import asyncio
import uvloop import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker from dynemo.runtime import DistributedRuntime, dynemo_worker
@triton_worker() @dynemo_worker()
async def worker(runtime: DistributedRuntime): async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init") await init(runtime, "dynemo")
async def init(runtime: DistributedRuntime, ns: str): async def init(runtime: DistributedRuntime, ns: str):
......
...@@ -21,7 +21,7 @@ import uvloop ...@@ -21,7 +21,7 @@ import uvloop
from client import init as client_init from client import init as client_init
from server import init as server_init from server import init as server_init
from triton_distributed.runtime import DistributedRuntime, triton_worker from dynemo.runtime import DistributedRuntime, dynemo_worker
def random_string(length=10): def random_string(length=10):
...@@ -29,7 +29,7 @@ def random_string(length=10): ...@@ -29,7 +29,7 @@ def random_string(length=10):
return "".join(random.choices(chars, k=length)) return "".join(random.choices(chars, k=length))
@triton_worker() @dynemo_worker()
async def worker(runtime: DistributedRuntime): async def worker(runtime: DistributedRuntime):
ns = random_string() ns = random_string()
task = asyncio.create_task(server_init(runtime, ns)) task = asyncio.create_task(server_init(runtime, ns))
......
...@@ -17,7 +17,7 @@ import asyncio ...@@ -17,7 +17,7 @@ import asyncio
import uvloop import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker from dynemo.runtime import DistributedRuntime, dynemo_worker
class RequestHandler: class RequestHandler:
...@@ -33,9 +33,9 @@ class RequestHandler: ...@@ -33,9 +33,9 @@ class RequestHandler:
yield char yield char
@triton_worker() @dynemo_worker()
async def worker(runtime: DistributedRuntime): async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init") await init(runtime, "dynemo")
async def init(runtime: DistributedRuntime, ns: str): async def init(runtime: DistributedRuntime, ns: str):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment