refactor: rename triton_distributed to dynemo (#22)

Co-authored-by: Graham King <grahamk@nvidia.com>

refactor: rename triton_distributed to dynemo (#22)
Co-authored-by: Graham King <grahamk@nvidia.com>
1af7433b · Neelay Shah · GitHub · ee4ef06b · 1af7433b · 1af7433b
Commit 1af7433b authored Mar 05, 2025 by Neelay Shah Committed by GitHub Mar 05, 2025
20 changed files
--- a/launch/dynemo-run/src/input/text.rs
+++ b/launch/dynemo-run/src/input/text.rs
@@ -13,12 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-use futures::StreamExt;
+use dynemo_llm::{
-use std::{
-    io::{ErrorKind, Read, Write},
-    sync::Arc,
-};
-use triton_distributed_llm::{
    backend::Backend,
    preprocessor::OpenAIPreprocessor,
    types::{
@@ -29,10 +24,15 @@ use triton_distributed_llm::{
        Annotated,
    },
 };
-use triton_distributed_runtime::{
+use dynemo_runtime::{
    pipeline::{Context, ManyOut, Operator, ServiceBackend, ServiceFrontend, SingleIn, Source},
    runtime::CancellationToken,
 };
+use futures::StreamExt;
+use std::{
+    io::{ErrorKind, Read, Write},
+    sync::Arc,
+};
 use crate::EngineConfig;

--- a/launch/dynemo-run/src/lib.rs
+++ b/launch/dynemo-run/src/lib.rs
@@ -16,7 +16,7 @@
 #[cfg(any(feature = "vllm", feature = "sglang"))]
 use std::{future::Future, pin::Pin};
-use triton_distributed_llm::{
+use dynemo_llm::{
    backend::ExecutionContext,
    model_card::model::ModelDeploymentCard,
    types::{
@@ -27,7 +27,7 @@ use triton_distributed_llm::{
        Annotated,
    },
 };
-use triton_distributed_runtime::{component::Client, protocols::Endpoint, DistributedRuntime};
+use dynemo_runtime::{component::Client, protocols::Endpoint, DistributedRuntime};
 mod flags;
 pub use flags::Flags;
@@ -67,7 +67,7 @@ pub enum EngineConfig {
 #[allow(unused_mut)]
 pub async fn run(
-    runtime: triton_distributed_runtime::Runtime,
+    runtime: dynemo_runtime::Runtime,
    mut in_opt: Input, // mut because vllm and sglang multi-node can change it
    out_opt: Output,
    flags: Flags,
@@ -173,13 +173,12 @@ pub async fn run(
            };
            EngineConfig::StaticFull {
                service_name: model_name,
-                engine: triton_distributed_llm::engines::mistralrs::make_engine(&model_path)
+                engine: dynemo_llm::engines::mistralrs::make_engine(&model_path).await?,
-                    .await?,
            }
        }
        #[cfg(feature = "sglang")]
        Output::SgLang => {
-            use triton_distributed_llm::engines::sglang;
+            use dynemo_llm::engines::sglang;
            let Some(model_path) = model_path else {
                anyhow::bail!("out=sglang requires flag --model-path=<full-path-to-model-dir>");
            };
@@ -191,7 +190,7 @@ pub async fn run(
            let Some(sock_prefix) = zmq_socket_prefix else {
                anyhow::bail!("sglang requires zmq_socket_prefix");
            };
-            let node_conf = triton_distributed_llm::engines::MultiNodeConfig {
+            let node_conf = dynemo_llm::engines::MultiNodeConfig {
                num_nodes: flags.num_nodes,
                node_rank: flags.node_rank,
                leader_addr: flags.leader_addr.unwrap_or_default(),
@@ -229,7 +228,7 @@ pub async fn run(
        }
        #[cfg(feature = "vllm")]
        Output::Vllm => {
-            use triton_distributed_llm::engines::vllm;
+            use dynemo_llm::engines::vllm;
            if flags.base_gpu_id != 0 {
                anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
            }
@@ -253,7 +252,7 @@ pub async fn run(
            let Some(sock_prefix) = zmq_socket_prefix else {
                anyhow::bail!("vllm requires zmq_socket_prefix");
            };
-            let node_conf = triton_distributed_llm::engines::MultiNodeConfig {
+            let node_conf = dynemo_llm::engines::MultiNodeConfig {
                num_nodes: flags.num_nodes,
                node_rank: flags.node_rank,
                leader_addr: flags.leader_addr.unwrap_or_default(),
@@ -296,7 +295,7 @@ pub async fn run(
        }
        #[cfg(feature = "llamacpp")]
        Output::LlamaCpp => {
-            use triton_distributed_llm::engines::llamacpp;
+            use dynemo_llm::engines::llamacpp;
            let Some(model_path) = model_path else {
                anyhow::bail!("out=llamacpp requires flag --model-path=<full-path-to-model-gguf>");
            };
@@ -317,7 +316,7 @@ pub async fn run(
        }
        #[cfg(feature = "trtllm")]
        Output::TrtLLM => {
-            use triton_distributed_llm::engines::trtllm;
+            use dynemo_llm::engines::trtllm;
            let Some(model_path) = model_path else {
                anyhow::bail!("out=trtllm requires flag --model-path=<full-path-to-model-dir>");
            };

--- a/launch/dynemo-run/src/main.rs
+++ b/launch/dynemo-run/src/main.rs
@@ -18,7 +18,7 @@ use std::env;
 use clap::Parser;
 use dynemo_run::{Input, Output};
-use triton_distributed_runtime::logging;
+use dynemo_runtime::logging;
 const HELP: &str = r#"
 dynemo-run is a single binary that wires together the various inputs (http, text, network) and workers (network, engine), that runs the services. It is the simplest way to use dynemo locally.
@@ -60,13 +60,13 @@ fn main() -> anyhow::Result<()> {
            if cfg!(feature = "sglang") {
                #[cfg(feature = "sglang")]
                {
-                    use triton_distributed_llm::engines::sglang;
+                    use dynemo_llm::engines::sglang;
                    let gpu_config = sglang::MultiGPUConfig {
                        tp_size: flags.tensor_parallel_size,
                        tp_rank: sglang_flags.tp_rank,
                        gpu_id: sglang_flags.gpu_id,
                    };
-                    let node_config = triton_distributed_llm::engines::MultiNodeConfig {
+                    let node_config = dynemo_llm::engines::MultiNodeConfig {
                        num_nodes: flags.num_nodes,
                        node_rank: flags.node_rank,
                        leader_addr: flags.leader_addr.unwrap_or_default(),
@@ -98,8 +98,8 @@ fn main() -> anyhow::Result<()> {
            if cfg!(feature = "vllm") {
                #[cfg(feature = "vllm")]
                {
-                    use triton_distributed_llm::engines::vllm;
+                    use dynemo_llm::engines::vllm;
-                    let node_config = triton_distributed_llm::engines::MultiNodeConfig {
+                    let node_config = dynemo_llm::engines::MultiNodeConfig {
                        num_nodes: flags.num_nodes,
                        node_rank: flags.node_rank,
                        leader_addr: flags.leader_addr.unwrap_or_default(),
@@ -119,15 +119,15 @@ fn main() -> anyhow::Result<()> {
    }
    // max_worker_threads and max_blocking_threads from env vars or config file.
-    let rt_config = triton_distributed_runtime::RuntimeConfig::from_settings()?;
+    let rt_config = dynemo_runtime::RuntimeConfig::from_settings()?;
    // One per process. Wraps a Runtime which holds two tokio runtimes.
-    let worker = triton_distributed_runtime::Worker::from_config(rt_config)?;
+    let worker = dynemo_runtime::Worker::from_config(rt_config)?;
    worker.execute(wrapper)
 }
-async fn wrapper(runtime: triton_distributed_runtime::Runtime) -> anyhow::Result<()> {
+async fn wrapper(runtime: dynemo_runtime::Runtime) -> anyhow::Result<()> {
    let mut in_opt = None;
    let mut out_opt = None;
    let args: Vec<String> = env::args().skip(1).collect();

--- a/launch/dynemo-run/src/output/echo_core.rs
+++ b/launch/dynemo-run/src/output/echo_core.rs
@@ -18,12 +18,12 @@ use std::{sync::Arc, time::Duration};
 use async_stream::stream;
 use async_trait::async_trait;
-use triton_distributed_llm::backend::ExecutionContext;
+use dynemo_llm::backend::ExecutionContext;
-use triton_distributed_llm::preprocessor::BackendInput;
+use dynemo_llm::preprocessor::BackendInput;
-use triton_distributed_llm::protocols::common::llm_backend::LLMEngineOutput;
+use dynemo_llm::protocols::common::llm_backend::LLMEngineOutput;
-use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
+use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
-use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
+use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn};
-use triton_distributed_runtime::protocols::annotated::Annotated;
+use dynemo_runtime::protocols::annotated::Annotated;
 /// How long to sleep between echoed tokens.
 /// 50ms gives us 20 tok/s.

--- a/launch/dynemo-run/src/output/echo_full.rs
+++ b/launch/dynemo-run/src/output/echo_full.rs
@@ -18,13 +18,13 @@ use std::{sync::Arc, time::Duration};
 use async_stream::stream;
 use async_trait::async_trait;
-use triton_distributed_llm::protocols::openai::chat_completions::{
+use dynemo_llm::protocols::openai::chat_completions::{
    NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
 };
-use triton_distributed_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
+use dynemo_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
-use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
+use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
-use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
+use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn};
-use triton_distributed_runtime::protocols::annotated::Annotated;
+use dynemo_runtime::protocols::annotated::Annotated;
 /// How long to sleep between echoed tokens.
 /// 50ms gives us 20 tok/s.

--- a/lib/bindings/c/Cargo.lock
+++ b/lib/bindings/c/Cargo.lock
@@ -954,6 +954,99 @@ dependencies = [
 "syn 2.0.96",
 ]
+[[package]]
+name = "dynemo-llm"
+version = "0.2.1"
+dependencies = [
+ "anyhow",
+ "async-openai",
+ "async-stream",
+ "async-trait",
+ "axum 0.8.1",
+ "bindgen",
+ "blake3",
+ "bs62",
+ "bytes",
+ "chrono",
+ "cmake",
+ "derive_builder",
+ "dynemo-runtime",
+ "either",
+ "erased-serde",
+ "futures",
+ "galil-seiferas",
+ "indexmap 2.7.0",
+ "itertools 0.14.0",
+ "libc",
+ "minijinja",
+ "minijinja-contrib",
+ "prometheus",
+ "pyo3",
+ "regex",
+ "semver",
+ "serde",
+ "serde-pickle",
+ "serde_json",
+ "serde_repr",
+ "strum",
+ "thiserror 2.0.11",
+ "tokenizers",
+ "tokio",
+ "tokio-stream",
+ "tokio-util",
+ "toktrie",
+ "toktrie_hf_tokenizers",
+ "tracing",
+ "unicode-segmentation",
+ "uuid",
+ "validator",
+ "xxhash-rust",
+]
+[[package]]
+name = "dynemo-runtime"
+version = "0.2.1"
+dependencies = [
+ "anyhow",
+ "async-nats",
+ "async-once-cell",
+ "async-stream",
+ "async-trait",
+ "async_zmq",
+ "blake3",
+ "bytes",
+ "chrono",
+ "derive-getters",
+ "derive_builder",
+ "educe",
+ "either",
+ "etcd-client",
+ "figment",
+ "futures",
+ "humantime",
+ "local-ip-address",
+ "log",
+ "nid",
+ "nix",
+ "nuid",
+ "once_cell",
+ "prometheus",
+ "rand",
+ "regex",
+ "serde",
+ "serde_json",
+ "socket2",
+ "thiserror 1.0.69",
+ "tokio",
+ "tokio-stream",
+ "tokio-util",
+ "tracing",
+ "tracing-subscriber",
+ "uuid",
+ "validator",
+ "xxhash-rust",
+]
 [[package]]
 name = "ed25519"
 version = "2.2.3"
@@ -1853,6 +1946,27 @@ version = "0.2.169"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
+[[package]]
+name = "libdynemo-llm"
+version = "0.2.1"
+dependencies = [
+ "anyhow",
+ "async-once-cell",
+ "cbindgen",
+ "dynemo-llm",
+ "dynemo-runtime",
+ "futures",
+ "libc",
+ "once_cell",
+ "serde",
+ "serde_json",
+ "tokio",
+ "tokio-stream",
+ "tracing",
+ "tracing-subscriber",
+ "uuid",
+]
 [[package]]
 name = "libloading"
 version = "0.8.6"
@@ -1873,27 +1987,6 @@ dependencies = [
 "libc",
 ]
-[[package]]
-name = "libtriton-distributed-llm"
-version = "0.2.1"
-dependencies = [
- "anyhow",
- "async-once-cell",
- "cbindgen",
- "futures",
- "libc",
- "once_cell",
- "serde",
- "serde_json",
- "tokio",
- "tokio-stream",
- "tracing",
- "tracing-subscriber",
- "triton-distributed-llm",
- "triton-distributed-runtime",
- "uuid",
-]
 [[package]]
 name = "linux-raw-sys"
 version = "0.4.15"
@@ -3955,99 +4048,6 @@ dependencies = [
 "tracing-serde",
 ]
-[[package]]
-name = "triton-distributed-llm"
-version = "0.2.1"
-dependencies = [
- "anyhow",
- "async-openai",
- "async-stream",
- "async-trait",
- "axum 0.8.1",
- "bindgen",
- "blake3",
- "bs62",
- "bytes",
- "chrono",
- "cmake",
- "derive_builder",
- "either",
- "erased-serde",
- "futures",
- "galil-seiferas",
- "indexmap 2.7.0",
- "itertools 0.14.0",
- "libc",
- "minijinja",
- "minijinja-contrib",
- "prometheus",
- "pyo3",
- "regex",
- "semver",
- "serde",
- "serde-pickle",
- "serde_json",
- "serde_repr",
- "strum",
- "thiserror 2.0.11",
- "tokenizers",
- "tokio",
- "tokio-stream",
- "tokio-util",
- "toktrie",
- "toktrie_hf_tokenizers",
- "tracing",
- "triton-distributed-runtime",
- "unicode-segmentation",
- "uuid",
- "validator",
- "xxhash-rust",
-]
-[[package]]
-name = "triton-distributed-runtime"
-version = "0.2.1"
-dependencies = [
- "anyhow",
- "async-nats",
- "async-once-cell",
- "async-stream",
- "async-trait",
- "async_zmq",
- "blake3",
- "bytes",
- "chrono",
- "derive-getters",
- "derive_builder",
- "educe",
- "either",
- "etcd-client",
- "figment",
- "futures",
- "humantime",
- "local-ip-address",
- "log",
- "nid",
- "nix",
- "nuid",
- "once_cell",
- "prometheus",
- "rand",
- "regex",
- "serde",
- "serde_json",
- "socket2",
- "thiserror 1.0.69",
- "tokio",
- "tokio-stream",
- "tokio-util",
- "tracing",
- "tracing-subscriber",
- "uuid",
- "validator",
- "xxhash-rust",
-]
 [[package]]
 name = "try-lock"
 version = "0.2.5"

--- a/lib/bindings/c/Cargo.toml
+++ b/lib/bindings/c/Cargo.toml
@@ -14,24 +14,24 @@
 # limitations under the License.
 [package]
-name = "libtriton-distributed-llm"
+name = "libdynemo-llm"
 version = "0.2.1"
 edition = "2021"
 authors = ["NVIDIA"]
 license = "Apache-2.0"
-homepage = "https://github.com/triton-inference-server/triton_distributed"
+homepage = "https://github.com/dynemo-ai/dynemo"
-repository = "https://github.com/triton-inference-server/triton_distributed"
+repository = "https://github.com/dynemo-ai/dynemo.git"
 [lib]
-name = "triton_distributed_llm_capi"
+name = "dynemo_llm_capi"
 crate-type = ["cdylib"]
 [build-dependencies]
 cbindgen = "0.27"
 [dependencies]
-triton-distributed-llm = { path = "../../llm" }
+dynemo-llm = { path = "../../llm" }
-triton-distributed-runtime = { path = "../../runtime" }
+dynemo-runtime = { path = "../../runtime" }
 anyhow = { version = "1" }
 futures = "0.3"

--- a/lib/bindings/c/build.rs
+++ b/lib/bindings/c/build.rs
@@ -22,7 +22,7 @@ fn main() {
    let header_path = Path::new(&crate_dir)
        .join("include")
        .join("nvidia")
-        .join("triton_llm")
+        .join("dynemo_llm")
        .join("llm_engine.h");
    cbindgen::generate(crate_dir)

--- a/lib/bindings/c/cbindgen.toml
+++ b/lib/bindings/c/cbindgen.toml
@@ -15,7 +15,7 @@
 language = "C++"
 cpp_compat = true
-include_guard = "__NVIDIA_TRITON_LLM_API__"
+include_guard = "__NVIDIA_DYNEMO_LLM_API__"
 [enum]
@@ -25,7 +25,7 @@ enum_class = false
 [export]
-include = ["TritonLlmResult", "triton_llm_init", "triton_llm_shutdown"]
+include = ["DynemoLlmResult", "dynemo_llm_init", "dynemo_llm_shutdown"]
 [export.rename]
-"TritonLlmResult" = "triton_llm_result_t"
+"DynemoLlmResult" = "dynemo_llm_result_t"
--- a/lib/bindings/c/src/lib.rs
+++ b/lib/bindings/c/src/lib.rs
@@ -19,10 +19,10 @@ use once_cell::sync::OnceCell;
 use std::ffi::CStr;
 use std::sync::atomic::{AtomicU32, Ordering};
-use triton_distributed_llm::kv_router::{
+use dynemo_llm::kv_router::{
    indexer::compute_block_hash_for_seq, protocols::*, publisher::KvEventPublisher,
 };
-use triton_distributed_runtime::{DistributedRuntime, Worker};
+use dynemo_runtime::{DistributedRuntime, Worker};
 static WK: OnceCell<Worker> = OnceCell::new();
 static DRT: AsyncOnceCell<DistributedRuntime> = AsyncOnceCell::new();
 // [FIXME] shouldn't the publisher be an instance passed between API calls?
@@ -41,7 +41,7 @@ fn initialize_tracing() {
 }
 #[repr(u32)]
-pub enum TritonLlmResult {
+pub enum DynemoLlmResult {
    OK = 0,
    ERR = 1,
 }
@@ -49,17 +49,17 @@ pub enum TritonLlmResult {
 /// # Safety
 /// the namespace_c_str and component_c_str are passed as pointers to C strings
 #[no_mangle]
-pub unsafe extern "C" fn triton_llm_init(
+pub unsafe extern "C" fn dynemo_llm_init(
    namespace_c_str: *const c_char,
    component_c_str: *const c_char,
    worker_id: i64,
-) -> TritonLlmResult {
+) -> DynemoLlmResult {
    initialize_tracing();
    let wk = match WK.get_or_try_init(Worker::from_settings) {
        Ok(wk) => wk.clone(),
        Err(e) => {
            eprintln!("Failed to initialize runtime: {:?}", e);
-            return TritonLlmResult::ERR;
+            return DynemoLlmResult::ERR;
        }
    };
    let rt = wk.runtime();
@@ -73,7 +73,7 @@ pub unsafe extern "C" fn triton_llm_init(
            Ok(_) => Ok(()),
            Err(e) => {
                eprintln!("Failed to initialize distributed runtime: {:?}", e);
-                Err(TritonLlmResult::ERR)
+                Err(DynemoLlmResult::ERR)
            }
        }
    });
@@ -81,7 +81,7 @@ pub unsafe extern "C" fn triton_llm_init(
        Ok(s) => s.to_string(),
        Err(e) => {
            eprintln!("Failed to convert C string to Rust string: {:?}", e);
-            return TritonLlmResult::ERR;
+            return DynemoLlmResult::ERR;
        }
    };
@@ -89,18 +89,18 @@ pub unsafe extern "C" fn triton_llm_init(
        Ok(s) => s.to_string(),
        Err(e) => {
            eprintln!("Failed to convert C string to Rust string: {:?}", e);
-            return TritonLlmResult::ERR;
+            return DynemoLlmResult::ERR;
        }
    };
    match result {
        Ok(_) => match KV_PUB
-            .get_or_try_init(move || triton_create_kv_publisher(namespace, component, worker_id))
+            .get_or_try_init(move || dynemo_create_kv_publisher(namespace, component, worker_id))
        {
-            Ok(_) => TritonLlmResult::OK,
+            Ok(_) => DynemoLlmResult::OK,
            Err(e) => {
                eprintln!("Failed to initialize distributed runtime: {:?}", e);
-                TritonLlmResult::ERR
+                DynemoLlmResult::ERR
            }
        },
        Err(e) => e,
@@ -108,33 +108,33 @@ pub unsafe extern "C" fn triton_llm_init(
 }
 #[no_mangle]
-pub extern "C" fn triton_llm_shutdown() -> TritonLlmResult {
+pub extern "C" fn dynemo_llm_shutdown() -> DynemoLlmResult {
    let wk = match WK.get() {
        Some(wk) => wk,
        None => {
            eprintln!("Runtime not initialized");
-            return TritonLlmResult::ERR;
+            return DynemoLlmResult::ERR;
        }
    };
    wk.runtime().shutdown();
-    TritonLlmResult::OK
+    DynemoLlmResult::OK
 }
 #[no_mangle]
-pub extern "C" fn triton_llm_load_publisher_create() -> TritonLlmResult {
+pub extern "C" fn dynemo_llm_load_publisher_create() -> DynemoLlmResult {
-    TritonLlmResult::OK
+    DynemoLlmResult::OK
 }
 // instantiate a kv publisher
 // this will bring up the task to publish and the channels to await publishing events
-// the [`triton_kv_publish_store_event`] call will use a handle to the publisher to send events
+// the [`dynemo_kv_publish_store_event`] call will use a handle to the publisher to send events
-// store and the [`triton_kv_event_create_removed`] will create remove events
+// store and the [`dynemo_kv_event_create_removed`] will create remove events
 // these calls must be driven by external C++ threads that are consuming the kv events from the
 // C++ executor API
-fn triton_create_kv_publisher(
+fn dynemo_create_kv_publisher(
    namespace: String,
    component: String,
    worker_id: i64,
@@ -238,7 +238,7 @@ fn kv_event_create_removed_from_parts(
 /// parent_hash is passed as a pointer to indicate whether the blocks
 /// have a parent hash or not. nullptr is used to represent no parent hash
 #[no_mangle]
-pub unsafe extern "C" fn triton_kv_event_publish_stored(
+pub unsafe extern "C" fn dynemo_kv_event_publish_stored(
    event_id: u64,
    token_ids: *const u32,
    num_block_tokens: *const usize,
@@ -246,7 +246,7 @@ pub unsafe extern "C" fn triton_kv_event_publish_stored(
    num_blocks: usize,
    parent_hash: *const u64,
    lora_id: u64,
-) -> TritonLlmResult {
+) -> DynemoLlmResult {
    let publisher = KV_PUB.get().unwrap();
    let parent_hash = {
        if parent_hash.is_null() {
@@ -265,40 +265,40 @@ pub unsafe extern "C" fn triton_kv_event_publish_stored(
        lora_id,
    );
    match publisher.publish(event) {
-        Ok(_) => TritonLlmResult::OK,
+        Ok(_) => DynemoLlmResult::OK,
        Err(e) => {
            eprintln!("Error publishing stored kv event {:?}", e);
-            TritonLlmResult::ERR
+            DynemoLlmResult::ERR
        }
    }
 }
 #[no_mangle]
-pub extern "C" fn triton_kv_event_publish_removed(
+pub extern "C" fn dynemo_kv_event_publish_removed(
    event_id: u64,
    block_ids: *const u64,
    num_blocks: usize,
-) -> TritonLlmResult {
+) -> DynemoLlmResult {
    let publisher = KV_PUB.get().unwrap();
    let event = kv_event_create_removed_from_parts(event_id, block_ids, num_blocks);
    match publisher.publish(event) {
-        Ok(_) => TritonLlmResult::OK,
+        Ok(_) => DynemoLlmResult::OK,
        Err(e) => {
            eprintln!("Error publishing removed kv event {:?}", e);
-            TritonLlmResult::ERR
+            DynemoLlmResult::ERR
        }
    }
 }
 // #[no_mangle]
-// pub extern "C" fn triton_kv_publish_store_event(
+// pub extern "C" fn dynemo_kv_publish_store_event(
 //     event_id: u64,
 //     token_ids: *const u32,
 //     num_tokens: usize,
 //     lora_id: u64,
-// ) -> TritonLlmResult {
+// ) -> DynemoLlmResult {
 //     // if event.is_null() || token_ids.is_null() {
-//     //     return tritonKvErrorType::INVALID_TOKEN_IDS;
+//     //     return dynemoKvErrorType::INVALID_TOKEN_IDS;
 //     // }
 //     // let tokens = unsafe { std::slice::from_raw_parts(token_ids, num_tokens) }.to_vec();
@@ -311,15 +311,15 @@ pub extern "C" fn triton_kv_event_publish_removed(
 //     // unsafe { *event = Box::into_raw(new_event) };
-//     TritonLlmResult::OK
+//     DynemoLlmResult::OK
 // }
 // #[no_mangle]
-// pub extern "C" fn triton_kv_event_create_removed(
+// pub extern "C" fn dynemo_kv_event_create_removed(
 //     event_id: u64,
 //     block_hashes: *const u64,
 //     num_hashes: usize,
-// ) -> TritonLlmResult {
+// ) -> DynemoLlmResult {
 //     // if event.is_null() || block_hashes.is_null() {
 //     //     return -1;
 //     // }
@@ -334,19 +334,19 @@ pub extern "C" fn triton_kv_event_publish_removed(
 //     // unsafe { *event = Box::into_raw(new_event) };
 //     // 0
-//     TritonLlmResult::OK
+//     DynemoLlmResult::OK
 // }
 // /// create load publisher object and return a handle
 // /// load publisher will instantiate the nats service and tie its stats handler to
 // /// a watch channel receiver.  the watch channel sender will be attached to the
-// /// handle and calls to [`triton_load_stats_publish`] issue the stats to the watch t
+// /// handle and calls to [`dynemo_load_stats_publish`] issue the stats to the watch t
-// pub extern "C" fn triton_load_publisher_create() -> *mut LoadPublisher {
+// pub extern "C" fn dynemo_load_publisher_create() -> *mut LoadPublisher {
 //     // let publisher = Box::new(LoadPublisher::new());
 //     // Box::into_raw(publisher)
 // }
-// pub extern "C" fn triton_load_stats_publish(
+// pub extern "C" fn dynemo_load_stats_publish(
 //     publisher: *mut LoadPublisher,
 //     active_slots: u64,
 //     total_slots: u64,

--- a/lib/bindings/python/.gitignore
+++ b/lib/bindings/python/.gitignore
 /target
-python/triton_distributed/*.so
+python/dynemo/.*.so
--- a/lib/bindings/python/Cargo.lock
+++ b/lib/bindings/python/Cargo.lock
@@ -956,6 +956,119 @@ dependencies = [
 "syn 2.0.98",
 ]
+[[package]]
+name = "dynemo-llm"
+version = "0.2.1"
+dependencies = [
+ "anyhow",
+ "async-openai",
+ "async-stream",
+ "async-trait",
+ "axum 0.8.1",
+ "bindgen",
+ "blake3",
+ "bs62",
+ "bytes",
+ "chrono",
+ "cmake",
+ "derive_builder",
+ "dynemo-runtime",
+ "either",
+ "erased-serde",
+ "futures",
+ "galil-seiferas",
+ "indexmap 2.7.1",
+ "itertools 0.14.0",
+ "libc",
+ "minijinja",
+ "minijinja-contrib",
+ "prometheus",
+ "pyo3",
+ "regex",
+ "semver",
+ "serde",
+ "serde-pickle",
+ "serde_json",
+ "serde_repr",
+ "strum",
+ "thiserror 2.0.11",
+ "tokenizers",
+ "tokio",
+ "tokio-stream",
+ "tokio-util",
+ "toktrie",
+ "toktrie_hf_tokenizers",
+ "tracing",
+ "unicode-segmentation",
+ "uuid",
+ "validator",
+ "xxhash-rust",
+]
+[[package]]
+name = "dynemo-py3"
+version = "0.2.1"
+dependencies = [
+ "dynemo-llm",
+ "dynemo-runtime",
+ "futures",
+ "once_cell",
+ "pyo3",
+ "pyo3-async-runtimes",
+ "pythonize",
+ "serde",
+ "serde_json",
+ "thiserror 2.0.11",
+ "tokio",
+ "tokio-stream",
+ "tracing",
+ "tracing-subscriber",
+]
+[[package]]
+name = "dynemo-runtime"
+version = "0.2.1"
+dependencies = [
+ "anyhow",
+ "async-nats",
+ "async-once-cell",
+ "async-stream",
+ "async-trait",
+ "async_zmq",
+ "blake3",
+ "bytes",
+ "chrono",
+ "derive-getters",
+ "derive_builder",
+ "educe",
+ "either",
+ "etcd-client",
+ "figment",
+ "futures",
+ "humantime",
+ "local-ip-address",
+ "log",
+ "nid",
+ "nix",
+ "nuid",
+ "once_cell",
+ "prometheus",
+ "rand",
+ "regex",
+ "serde",
+ "serde_json",
+ "socket2",
+ "thiserror 1.0.69",
+ "tokio",
+ "tokio-stream",
+ "tokio-util",
+ "tracing",
+ "tracing-subscriber",
+ "uuid",
+ "validator",
+ "xxhash-rust",
+]
 [[package]]
 name = "ed25519"
 version = "2.2.3"
@@ -4004,119 +4117,6 @@ dependencies = [
 "tracing-serde",
 ]
-[[package]]
-name = "triton-distributed-llm"
-version = "0.2.1"
-dependencies = [
- "anyhow",
- "async-openai",
- "async-stream",
- "async-trait",
- "axum 0.8.1",
- "bindgen",
- "blake3",
- "bs62",
- "bytes",
- "chrono",
- "cmake",
- "derive_builder",
- "either",
- "erased-serde",
- "futures",
- "galil-seiferas",
- "indexmap 2.7.1",
- "itertools 0.14.0",
- "libc",
- "minijinja",
- "minijinja-contrib",
- "prometheus",
- "pyo3",
- "regex",
- "semver",
- "serde",
- "serde-pickle",
- "serde_json",
- "serde_repr",
- "strum",
- "thiserror 2.0.11",
- "tokenizers",
- "tokio",
- "tokio-stream",
- "tokio-util",
- "toktrie",
- "toktrie_hf_tokenizers",
- "tracing",
- "triton-distributed-runtime",
- "unicode-segmentation",
- "uuid",
- "validator",
- "xxhash-rust",
-]
-[[package]]
-name = "triton-distributed-py3"
-version = "0.2.1"
-dependencies = [
- "futures",
- "once_cell",
- "pyo3",
- "pyo3-async-runtimes",
- "pythonize",
- "serde",
- "serde_json",
- "thiserror 2.0.11",
- "tokio",
- "tokio-stream",
- "tracing",
- "tracing-subscriber",
- "triton-distributed-llm",
- "triton-distributed-runtime",
-]
-[[package]]
-name = "triton-distributed-runtime"
-version = "0.2.1"
-dependencies = [
- "anyhow",
- "async-nats",
- "async-once-cell",
- "async-stream",
- "async-trait",
- "async_zmq",
- "blake3",
- "bytes",
- "chrono",
- "derive-getters",
- "derive_builder",
- "educe",
- "either",
- "etcd-client",
- "figment",
- "futures",
- "humantime",
- "local-ip-address",
- "log",
- "nid",
- "nix",
- "nuid",
- "once_cell",
- "prometheus",
- "rand",
- "regex",
- "serde",
- "serde_json",
- "socket2",
- "thiserror 1.0.69",
- "tokio",
- "tokio-stream",
- "tokio-util",
- "tracing",
- "tracing-subscriber",
- "uuid",
- "validator",
- "xxhash-rust",
-]
 [[package]]
 name = "try-lock"
 version = "0.2.5"

--- a/lib/bindings/python/Cargo.toml
+++ b/lib/bindings/python/Cargo.toml
@@ -14,13 +14,13 @@
 # limitations under the License.
 [package]
-name = "triton-distributed-py3"
+name = "dynemo-py3"
 version = "0.2.1"
 edition = "2021"
 authors = ["NVIDIA"]
 license = "Apache-2.0"
-homepage = "https://github.com/triton-inference-server/triton_distributed"
+homepage = "https://github.com/dynemo-ai/dynemo"
-repository = "https://github.com/triton-inference-server/triton_distributed"
+repository = "https://github.com/dynemo-ai/dynemo.git"
 [lib]
 path = "rust/lib.rs"
@@ -30,8 +30,8 @@ crate-type = ["cdylib"]
 [dependencies]
-triton-distributed-llm = { path = "../../llm" }
+dynemo-llm = { path = "../../llm" }
-triton-distributed-runtime = { path = "../../runtime" }
+dynemo-runtime = { path = "../../runtime" }
 futures = "0.3"
 once_cell = "1.20.3"

--- a/lib/bindings/python/README.md
+++ b/lib/bindings/python/README.md
@@ -41,7 +41,7 @@ source .venv/bin/activate
 uv pip install maturin
 ```
-4. Build and install triton_distributed wheel
+4. Build and install dynemo wheel
 ```
 maturin develop --uv
 ```

--- a/lib/bindings/python/examples/bls/bar.py
+++ b/lib/bindings/python/examples/bls/bar.py
@@ -17,7 +17,7 @@ import asyncio
 import uvloop
-from triton_distributed.runtime import DistributedRuntime, triton_worker
+from dynemo.runtime import DistributedRuntime, dynemo_worker
 uvloop.install()
@@ -29,7 +29,7 @@ class RequestHandler:
            yield char
-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime):
    component = runtime.namespace("examples/bls").component("bar")
    await component.create_service()

--- a/lib/bindings/python/examples/bls/bls.py
+++ b/lib/bindings/python/examples/bls/bls.py
@@ -17,12 +17,12 @@ import asyncio
 import uvloop
-from triton_distributed.runtime import DistributedRuntime, triton_worker
+from dynemo.runtime import DistributedRuntime, dynemo_worker
 uvloop.install()
-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime):
    foo = (
        await runtime.namespace("examples/bls")

--- a/lib/bindings/python/examples/bls/foo.py
+++ b/lib/bindings/python/examples/bls/foo.py
@@ -17,7 +17,7 @@ import asyncio
 import uvloop
-from triton_distributed.runtime import DistributedRuntime, triton_worker
+from dynemo.runtime import DistributedRuntime, dynemo_worker
 uvloop.install()
@@ -28,7 +28,7 @@ class RequestHandler:
            yield char
-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime):
    component = runtime.namespace("examples/bls").component("foo")
    await component.create_service()

--- a/lib/bindings/python/examples/error_handling/client.py
+++ b/lib/bindings/python/examples/error_handling/client.py
@@ -17,12 +17,12 @@ import asyncio
 import uvloop
-from triton_distributed.runtime import DistributedRuntime, triton_worker
+from dynemo.runtime import DistributedRuntime, dynemo_worker
-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime):
-    await init(runtime, "triton-init")
+    await init(runtime, "dynemo")
 async def init(runtime: DistributedRuntime, ns: str):

--- a/lib/bindings/python/examples/error_handling/run.py
+++ b/lib/bindings/python/examples/error_handling/run.py
@@ -21,7 +21,7 @@ import uvloop
 from client import init as client_init
 from server import init as server_init
-from triton_distributed.runtime import DistributedRuntime, triton_worker
+from dynemo.runtime import DistributedRuntime, dynemo_worker
 def random_string(length=10):
@@ -29,7 +29,7 @@ def random_string(length=10):
    return "".join(random.choices(chars, k=length))
-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime):
    ns = random_string()
    task = asyncio.create_task(server_init(runtime, ns))

--- a/lib/bindings/python/examples/error_handling/server.py
+++ b/lib/bindings/python/examples/error_handling/server.py
@@ -17,7 +17,7 @@ import asyncio
 import uvloop
-from triton_distributed.runtime import DistributedRuntime, triton_worker
+from dynemo.runtime import DistributedRuntime, dynemo_worker
 class RequestHandler:
@@ -33,9 +33,9 @@ class RequestHandler:
            yield char
-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime):
-    await init(runtime, "triton-init")
+    await init(runtime, "dynemo")
 async def init(runtime: DistributedRuntime, ns: str):