Commit 1af7433b authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

refactor: rename triton_distributed to dynemo (#22)


Co-authored-by: Graham King <grahamk@nvidia.com>
parent ee4ef06b
......@@ -13,12 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use futures::StreamExt;
use std::{
io::{ErrorKind, Read, Write},
sync::Arc,
};
use triton_distributed_llm::{
use dynemo_llm::{
backend::Backend,
preprocessor::OpenAIPreprocessor,
types::{
......@@ -29,10 +24,15 @@ use triton_distributed_llm::{
Annotated,
},
};
use triton_distributed_runtime::{
use dynemo_runtime::{
pipeline::{Context, ManyOut, Operator, ServiceBackend, ServiceFrontend, SingleIn, Source},
runtime::CancellationToken,
};
use futures::StreamExt;
use std::{
io::{ErrorKind, Read, Write},
sync::Arc,
};
use crate::EngineConfig;
......
......@@ -16,7 +16,7 @@
#[cfg(any(feature = "vllm", feature = "sglang"))]
use std::{future::Future, pin::Pin};
use triton_distributed_llm::{
use dynemo_llm::{
backend::ExecutionContext,
model_card::model::ModelDeploymentCard,
types::{
......@@ -27,7 +27,7 @@ use triton_distributed_llm::{
Annotated,
},
};
use triton_distributed_runtime::{component::Client, protocols::Endpoint, DistributedRuntime};
use dynemo_runtime::{component::Client, protocols::Endpoint, DistributedRuntime};
mod flags;
pub use flags::Flags;
......@@ -67,7 +67,7 @@ pub enum EngineConfig {
#[allow(unused_mut)]
pub async fn run(
runtime: triton_distributed_runtime::Runtime,
runtime: dynemo_runtime::Runtime,
mut in_opt: Input, // mut because vllm and sglang multi-node can change it
out_opt: Output,
flags: Flags,
......@@ -173,13 +173,12 @@ pub async fn run(
};
EngineConfig::StaticFull {
service_name: model_name,
engine: triton_distributed_llm::engines::mistralrs::make_engine(&model_path)
.await?,
engine: dynemo_llm::engines::mistralrs::make_engine(&model_path).await?,
}
}
#[cfg(feature = "sglang")]
Output::SgLang => {
use triton_distributed_llm::engines::sglang;
use dynemo_llm::engines::sglang;
let Some(model_path) = model_path else {
anyhow::bail!("out=sglang requires flag --model-path=<full-path-to-model-dir>");
};
......@@ -191,7 +190,7 @@ pub async fn run(
let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("sglang requires zmq_socket_prefix");
};
let node_conf = triton_distributed_llm::engines::MultiNodeConfig {
let node_conf = dynemo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
......@@ -229,7 +228,7 @@ pub async fn run(
}
#[cfg(feature = "vllm")]
Output::Vllm => {
use triton_distributed_llm::engines::vllm;
use dynemo_llm::engines::vllm;
if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
}
......@@ -253,7 +252,7 @@ pub async fn run(
let Some(sock_prefix) = zmq_socket_prefix else {
anyhow::bail!("vllm requires zmq_socket_prefix");
};
let node_conf = triton_distributed_llm::engines::MultiNodeConfig {
let node_conf = dynemo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
......@@ -296,7 +295,7 @@ pub async fn run(
}
#[cfg(feature = "llamacpp")]
Output::LlamaCpp => {
use triton_distributed_llm::engines::llamacpp;
use dynemo_llm::engines::llamacpp;
let Some(model_path) = model_path else {
anyhow::bail!("out=llamacpp requires flag --model-path=<full-path-to-model-gguf>");
};
......@@ -317,7 +316,7 @@ pub async fn run(
}
#[cfg(feature = "trtllm")]
Output::TrtLLM => {
use triton_distributed_llm::engines::trtllm;
use dynemo_llm::engines::trtllm;
let Some(model_path) = model_path else {
anyhow::bail!("out=trtllm requires flag --model-path=<full-path-to-model-dir>");
};
......
......@@ -18,7 +18,7 @@ use std::env;
use clap::Parser;
use dynemo_run::{Input, Output};
use triton_distributed_runtime::logging;
use dynemo_runtime::logging;
const HELP: &str = r#"
dynemo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynemo locally.
......@@ -60,13 +60,13 @@ fn main() -> anyhow::Result<()> {
if cfg!(feature = "sglang") {
#[cfg(feature = "sglang")]
{
use triton_distributed_llm::engines::sglang;
use dynemo_llm::engines::sglang;
let gpu_config = sglang::MultiGPUConfig {
tp_size: flags.tensor_parallel_size,
tp_rank: sglang_flags.tp_rank,
gpu_id: sglang_flags.gpu_id,
};
let node_config = triton_distributed_llm::engines::MultiNodeConfig {
let node_config = dynemo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
......@@ -98,8 +98,8 @@ fn main() -> anyhow::Result<()> {
if cfg!(feature = "vllm") {
#[cfg(feature = "vllm")]
{
use triton_distributed_llm::engines::vllm;
let node_config = triton_distributed_llm::engines::MultiNodeConfig {
use dynemo_llm::engines::vllm;
let node_config = dynemo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes,
node_rank: flags.node_rank,
leader_addr: flags.leader_addr.unwrap_or_default(),
......@@ -119,15 +119,15 @@ fn main() -> anyhow::Result<()> {
}
// max_worker_threads and max_blocking_threads from env vars or config file.
let rt_config = triton_distributed_runtime::RuntimeConfig::from_settings()?;
let rt_config = dynemo_runtime::RuntimeConfig::from_settings()?;
// One per process. Wraps a Runtime with holds two tokio runtimes.
let worker = triton_distributed_runtime::Worker::from_config(rt_config)?;
let worker = dynemo_runtime::Worker::from_config(rt_config)?;
worker.execute(wrapper)
}
async fn wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
async fn wrapper(runtime: dynemo_runtime::Runtime) -> anyhow::Result<()> {
let mut in_opt = None;
let mut out_opt = None;
let args: Vec<String> = env::args().skip(1).collect();
......
......@@ -18,12 +18,12 @@ use std::{sync::Arc, time::Duration};
use async_stream::stream;
use async_trait::async_trait;
use triton_distributed_llm::backend::ExecutionContext;
use triton_distributed_llm::preprocessor::BackendInput;
use triton_distributed_llm::protocols::common::llm_backend::LLMEngineOutput;
use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
use triton_distributed_runtime::protocols::annotated::Annotated;
use dynemo_llm::backend::ExecutionContext;
use dynemo_llm::preprocessor::BackendInput;
use dynemo_llm::protocols::common::llm_backend::LLMEngineOutput;
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated;
/// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s.
......
......@@ -18,13 +18,13 @@ use std::{sync::Arc, time::Duration};
use async_stream::stream;
use async_trait::async_trait;
use triton_distributed_llm::protocols::openai::chat_completions::{
use dynemo_llm::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
};
use triton_distributed_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
use triton_distributed_runtime::protocols::annotated::Annotated;
use dynemo_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated;
/// How long to sleep between echoed tokens.
/// 50ms gives us 20 tok/s.
......
......@@ -954,6 +954,99 @@ dependencies = [
"syn 2.0.96",
]
[[package]]
name = "dynemo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"dynemo-runtime",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.0",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "dynemo-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "ed25519"
version = "2.2.3"
......@@ -1853,6 +1946,27 @@ version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "libdynemo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-once-cell",
"cbindgen",
"dynemo-llm",
"dynemo-runtime",
"futures",
"libc",
"once_cell",
"serde",
"serde_json",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
"uuid",
]
[[package]]
name = "libloading"
version = "0.8.6"
......@@ -1873,27 +1987,6 @@ dependencies = [
"libc",
]
[[package]]
name = "libtriton-distributed-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-once-cell",
"cbindgen",
"futures",
"libc",
"once_cell",
"serde",
"serde_json",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
"triton-distributed-llm",
"triton-distributed-runtime",
"uuid",
]
[[package]]
name = "linux-raw-sys"
version = "0.4.15"
......@@ -3955,99 +4048,6 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "triton-distributed-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.0",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"triton-distributed-runtime",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "triton-distributed-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "try-lock"
version = "0.2.5"
......
......@@ -14,24 +14,24 @@
# limitations under the License.
[package]
name = "libtriton-distributed-llm"
name = "libdynemo-llm"
version = "0.2.1"
edition = "2021"
authors = ["NVIDIA"]
license = "Apache-2.0"
homepage = "https://github.com/triton-inference-server/triton_distributed"
repository = "https://github.com/triton-inference-server/triton_distributed"
homepage = "https://github.com/dynemo-ai/dynemo"
repository = "https://github.com/dynemo-ai/dynemo.git"
[lib]
name = "triton_distributed_llm_capi"
name = "dynemo_llm_capi"
crate-type = ["cdylib"]
[build-dependencies]
cbindgen = "0.27"
[dependencies]
triton-distributed-llm = { path = "../../llm" }
triton-distributed-runtime = { path = "../../runtime" }
dynemo-llm = { path = "../../llm" }
dynemo-runtime = { path = "../../runtime" }
anyhow = { version = "1" }
futures = "0.3"
......
......@@ -22,7 +22,7 @@ fn main() {
let header_path = Path::new(&crate_dir)
.join("include")
.join("nvidia")
.join("triton_llm")
.join("dynemo_llm")
.join("llm_engine.h");
cbindgen::generate(crate_dir)
......
......@@ -15,7 +15,7 @@
language = "C++"
cpp_compat = true
include_guard = "__NVIDIA_TRITON_LLM_API__"
include_guard = "__NVIDIA_DYNEMO_LLM_API__"
[enum]
......@@ -25,7 +25,7 @@ enum_class = false
[export]
include = ["TritonLlmResult", "triton_llm_init", "triton_llm_shutdown"]
include = ["DynemoLlmResult", "dynemo_llm_init", "dynemo_llm_shutdown"]
[export.rename]
"TritonLlmResult" = "triton_llm_result_t"
"DynemoLlmResult" = "dynemo_llm_result_t"
......@@ -19,10 +19,10 @@ use once_cell::sync::OnceCell;
use std::ffi::CStr;
use std::sync::atomic::{AtomicU32, Ordering};
use triton_distributed_llm::kv_router::{
use dynemo_llm::kv_router::{
indexer::compute_block_hash_for_seq, protocols::*, publisher::KvEventPublisher,
};
use triton_distributed_runtime::{DistributedRuntime, Worker};
use dynemo_runtime::{DistributedRuntime, Worker};
static WK: OnceCell<Worker> = OnceCell::new();
static DRT: AsyncOnceCell<DistributedRuntime> = AsyncOnceCell::new();
// [FIXME] shouldn't the publisher be instance passing between API calls?
......@@ -41,7 +41,7 @@ fn initialize_tracing() {
}
#[repr(u32)]
pub enum TritonLlmResult {
pub enum DynemoLlmResult {
OK = 0,
ERR = 1,
}
......@@ -49,17 +49,17 @@ pub enum TritonLlmResult {
/// # Safety
/// the namespace_c_str and component_c_str are passed as pointers to C strings
#[no_mangle]
pub unsafe extern "C" fn triton_llm_init(
pub unsafe extern "C" fn dynemo_llm_init(
namespace_c_str: *const c_char,
component_c_str: *const c_char,
worker_id: i64,
) -> TritonLlmResult {
) -> DynemoLlmResult {
initialize_tracing();
let wk = match WK.get_or_try_init(Worker::from_settings) {
Ok(wk) => wk.clone(),
Err(e) => {
eprintln!("Failed to initialize runtime: {:?}", e);
return TritonLlmResult::ERR;
return DynemoLlmResult::ERR;
}
};
let rt = wk.runtime();
......@@ -73,7 +73,7 @@ pub unsafe extern "C" fn triton_llm_init(
Ok(_) => Ok(()),
Err(e) => {
eprintln!("Failed to initialize distributed runtime: {:?}", e);
Err(TritonLlmResult::ERR)
Err(DynemoLlmResult::ERR)
}
}
});
......@@ -81,7 +81,7 @@ pub unsafe extern "C" fn triton_llm_init(
Ok(s) => s.to_string(),
Err(e) => {
eprintln!("Failed to convert C string to Rust string: {:?}", e);
return TritonLlmResult::ERR;
return DynemoLlmResult::ERR;
}
};
......@@ -89,18 +89,18 @@ pub unsafe extern "C" fn triton_llm_init(
Ok(s) => s.to_string(),
Err(e) => {
eprintln!("Failed to convert C string to Rust string: {:?}", e);
return TritonLlmResult::ERR;
return DynemoLlmResult::ERR;
}
};
match result {
Ok(_) => match KV_PUB
.get_or_try_init(move || triton_create_kv_publisher(namespace, component, worker_id))
.get_or_try_init(move || dynemo_create_kv_publisher(namespace, component, worker_id))
{
Ok(_) => TritonLlmResult::OK,
Ok(_) => DynemoLlmResult::OK,
Err(e) => {
eprintln!("Failed to initialize distributed runtime: {:?}", e);
TritonLlmResult::ERR
DynemoLlmResult::ERR
}
},
Err(e) => e,
......@@ -108,33 +108,33 @@ pub unsafe extern "C" fn triton_llm_init(
}
#[no_mangle]
pub extern "C" fn triton_llm_shutdown() -> TritonLlmResult {
pub extern "C" fn dynemo_llm_shutdown() -> DynemoLlmResult {
let wk = match WK.get() {
Some(wk) => wk,
None => {
eprintln!("Runtime not initialized");
return TritonLlmResult::ERR;
return DynemoLlmResult::ERR;
}
};
wk.runtime().shutdown();
TritonLlmResult::OK
DynemoLlmResult::OK
}
#[no_mangle]
pub extern "C" fn triton_llm_load_publisher_create() -> TritonLlmResult {
TritonLlmResult::OK
pub extern "C" fn dynemo_llm_load_publisher_create() -> DynemoLlmResult {
DynemoLlmResult::OK
}
// instantiate a kv publisher
// this will bring up the task to publish and the channels to await publishing events
// the [`triton_kv_publish_store_event`] call will use a handle to the publisher to send events
// store and the [`triton_kv_event_create_removed`] will create remove events
// the [`dynemo_kv_publish_store_event`] call will use a handle to the publisher to send events
// store and the [`dynemo_kv_event_create_removed`] will create remove events
// these calls must be driven by external C++ threads that are consuming the kv events from the
// c++ executor api
fn triton_create_kv_publisher(
fn dynemo_create_kv_publisher(
namespace: String,
component: String,
worker_id: i64,
......@@ -238,7 +238,7 @@ fn kv_event_create_removed_from_parts(
/// parent_hash is passed as pointer to indicate whether the blocks
/// has a parent hash or not. nullptr is used to represent no parent hash
#[no_mangle]
pub unsafe extern "C" fn triton_kv_event_publish_stored(
pub unsafe extern "C" fn dynemo_kv_event_publish_stored(
event_id: u64,
token_ids: *const u32,
num_block_tokens: *const usize,
......@@ -246,7 +246,7 @@ pub unsafe extern "C" fn triton_kv_event_publish_stored(
num_blocks: usize,
parent_hash: *const u64,
lora_id: u64,
) -> TritonLlmResult {
) -> DynemoLlmResult {
let publisher = KV_PUB.get().unwrap();
let parent_hash = {
if parent_hash.is_null() {
......@@ -265,40 +265,40 @@ pub unsafe extern "C" fn triton_kv_event_publish_stored(
lora_id,
);
match publisher.publish(event) {
Ok(_) => TritonLlmResult::OK,
Ok(_) => DynemoLlmResult::OK,
Err(e) => {
eprintln!("Error publishing stored kv event {:?}", e);
TritonLlmResult::ERR
DynemoLlmResult::ERR
}
}
}
#[no_mangle]
pub extern "C" fn triton_kv_event_publish_removed(
pub extern "C" fn dynemo_kv_event_publish_removed(
event_id: u64,
block_ids: *const u64,
num_blocks: usize,
) -> TritonLlmResult {
) -> DynemoLlmResult {
let publisher = KV_PUB.get().unwrap();
let event = kv_event_create_removed_from_parts(event_id, block_ids, num_blocks);
match publisher.publish(event) {
Ok(_) => TritonLlmResult::OK,
Ok(_) => DynemoLlmResult::OK,
Err(e) => {
eprintln!("Error publishing removed kv event {:?}", e);
TritonLlmResult::ERR
DynemoLlmResult::ERR
}
}
}
// #[no_mangle]
// pub extern "C" fn triton_kv_publish_store_event(
// pub extern "C" fn dynemo_kv_publish_store_event(
// event_id: u64,
// token_ids: *const u32,
// num_tokens: usize,
// lora_id: u64,
// ) -> TritonLlmResult {
// ) -> DynemoLlmResult {
// // if event.is_null() || token_ids.is_null() {
// // return tritonKvErrorType::INVALID_TOKEN_IDS;
// // return dynemoKvErrorType::INVALID_TOKEN_IDS;
// // }
// // let tokens = unsafe { std::slice::from_raw_parts(token_ids, num_tokens) }.to_vec();
......@@ -311,15 +311,15 @@ pub extern "C" fn triton_kv_event_publish_removed(
// // unsafe { *event = Box::into_raw(new_event) };
// TritonLlmResult::OK
// DynemoLlmResult::OK
// }
// #[no_mangle]
// pub extern "C" fn triton_kv_event_create_removed(
// pub extern "C" fn dynemo_kv_event_create_removed(
// event_id: u64,
// block_hashes: *const u64,
// num_hashes: usize,
// ) -> TritonLlmResult {
// ) -> DynemoLlmResult {
// // if event.is_null() || block_hashes.is_null() {
// // return -1;
// // }
......@@ -334,19 +334,19 @@ pub extern "C" fn triton_kv_event_publish_removed(
// // unsafe { *event = Box::into_raw(new_event) };
// // 0
// TritonLlmResult::OK
// DynemoLlmResult::OK
// }
// /// create load publisher object and return a handle
// /// load publisher will instantiate the nats service and tie its stats handler to
// /// a watch channel receiver. the watch channel sender will be attach to the
// /// handle and calls to [`triton_load_stats_publish`] issue the stats to the watch t
// pub extern "C" fn triton_load_publisher_create() -> *mut LoadPublisher {
// /// handle and calls to [`dynemo_load_stats_publish`] issue the stats to the watch t
// pub extern "C" fn dynemo_load_publisher_create() -> *mut LoadPublisher {
// // let publisher = Box::new(LoadPublisher::new());
// // Box::into_raw(publisher)
// }
// pub extern "C" fn triton_load_stats_publish(
// pub extern "C" fn dynemo_load_stats_publish(
// publisher: *mut LoadPublisher,
// active_slots: u64,
// total_slots: u64,
......
/target
python/triton_distributed/*.so
python/dynemo/*.so
......@@ -956,6 +956,119 @@ dependencies = [
"syn 2.0.98",
]
[[package]]
name = "dynemo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"dynemo-runtime",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "dynemo-py3"
version = "0.2.1"
dependencies = [
"dynemo-llm",
"dynemo-runtime",
"futures",
"once_cell",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"serde",
"serde_json",
"thiserror 2.0.11",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
]
[[package]]
name = "dynemo-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "ed25519"
version = "2.2.3"
......@@ -4004,119 +4117,6 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "triton-distributed-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"triton-distributed-runtime",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "triton-distributed-py3"
version = "0.2.1"
dependencies = [
"futures",
"once_cell",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"serde",
"serde_json",
"thiserror 2.0.11",
"tokio",
"tokio-stream",
"tracing",
"tracing-subscriber",
"triton-distributed-llm",
"triton-distributed-runtime",
]
[[package]]
name = "triton-distributed-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "try-lock"
version = "0.2.5"
......
......@@ -14,13 +14,13 @@
# limitations under the License.
[package]
name = "triton-distributed-py3"
name = "dynemo-py3"
version = "0.2.1"
edition = "2021"
authors = ["NVIDIA"]
license = "Apache-2.0"
homepage = "https://github.com/triton-inference-server/triton_distributed"
repository = "https://github.com/triton-inference-server/triton_distributed"
homepage = "https://github.com/dynemo-ai/dynemo"
repository = "https://github.com/dynemo-ai/dynemo.git"
[lib]
path = "rust/lib.rs"
......@@ -30,8 +30,8 @@ crate-type = ["cdylib"]
[dependencies]
triton-distributed-llm = { path = "../../llm" }
triton-distributed-runtime = { path = "../../runtime" }
dynemo-llm = { path = "../../llm" }
dynemo-runtime = { path = "../../runtime" }
futures = "0.3"
once_cell = "1.20.3"
......
......@@ -41,7 +41,7 @@ source .venv/bin/activate
uv pip install maturin
```
4. Build and install triton_distributed wheel
4. Build and install dynemo wheel
```
maturin develop --uv
```
......
......@@ -17,7 +17,7 @@ import asyncio
import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker
from dynemo.runtime import DistributedRuntime, dynemo_worker
uvloop.install()
......@@ -29,7 +29,7 @@ class RequestHandler:
yield char
@triton_worker()
@dynemo_worker()
async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/bls").component("bar")
await component.create_service()
......
......@@ -17,12 +17,12 @@ import asyncio
import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker
from dynemo.runtime import DistributedRuntime, dynemo_worker
uvloop.install()
@triton_worker()
@dynemo_worker()
async def worker(runtime: DistributedRuntime):
foo = (
await runtime.namespace("examples/bls")
......
......@@ -17,7 +17,7 @@ import asyncio
import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker
from dynemo.runtime import DistributedRuntime, dynemo_worker
uvloop.install()
......@@ -28,7 +28,7 @@ class RequestHandler:
yield char
@triton_worker()
@dynemo_worker()
async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/bls").component("foo")
await component.create_service()
......
......@@ -17,12 +17,12 @@ import asyncio
import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker
from dynemo.runtime import DistributedRuntime, dynemo_worker
@triton_worker()
@dynemo_worker()
async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init")
await init(runtime, "dynemo")
async def init(runtime: DistributedRuntime, ns: str):
......
......@@ -21,7 +21,7 @@ import uvloop
from client import init as client_init
from server import init as server_init
from triton_distributed.runtime import DistributedRuntime, triton_worker
from dynemo.runtime import DistributedRuntime, dynemo_worker
def random_string(length=10):
......@@ -29,7 +29,7 @@ def random_string(length=10):
return "".join(random.choices(chars, k=length))
@triton_worker()
@dynemo_worker()
async def worker(runtime: DistributedRuntime):
ns = random_string()
task = asyncio.create_task(server_init(runtime, ns))
......
......@@ -17,7 +17,7 @@ import asyncio
import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker
from dynemo.runtime import DistributedRuntime, dynemo_worker
class RequestHandler:
......@@ -33,9 +33,9 @@ class RequestHandler:
yield char
@triton_worker()
@dynemo_worker()
async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init")
await init(runtime, "dynemo")
async def init(runtime: DistributedRuntime, ns: str):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment