Commit 1af7433b authored by Neelay Shah and committed by GitHub
Browse files

refactor: rename triton_distributed to dynemo (#22)


Co-authored-by: Graham King <grahamk@nvidia.com>
parent ee4ef06b
......@@ -18,11 +18,11 @@ version = "0.2.1"
edition = "2021"
authors = ["NVIDIA"]
license = "Apache-2.0"
homepage = "https://github.com/triton-inference-server/triton_distributed"
repository = "https://github.com/triton-inference-server/triton_distributed"
homepage = "https://github.com/dynemo-ai/dynemo"
repository = "https://github.com/dynemo-ai/dynemo.git"
[package]
name = "triton-distributed-llm"
name = "dynemo-llm"
version.workspace = true
edition.workspace = true
authors.workspace = true
......@@ -43,7 +43,7 @@ vulkan = ["llama-cpp-2/vulkan"]
[workspace.dependencies]
# local or crates.io
triton-distributed-runtime = { version = "0.2.0", path = "../runtime" }
dynemo-runtime = { version = "0.2.0", path = "../runtime" }
# crates.io
anyhow = { version = "1" }
......@@ -66,7 +66,7 @@ strum = { version = "0.27", features = ["derive"] }
[dependencies]
# repo
triton-distributed-runtime = { workspace = true }
dynemo-runtime = { workspace = true }
# workspace
anyhow = { workspace = true }
......
......@@ -34,7 +34,7 @@ use futures::stream::{self, StreamExt};
use tracing as log;
use crate::model_card::model::{ModelDeploymentCard, TokenizerKind};
use triton_distributed_runtime::{
use dynemo_runtime::{
pipeline::{
async_trait, AsyncEngineContextProvider, ManyOut, Operator, ResponseStream,
ServerStreamingEngine, SingleIn,
......
......@@ -22,6 +22,11 @@ use std::{
use anyhow::Context;
use async_stream::stream;
use async_trait::async_trait;
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::error as pipeline_error;
use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated;
use dynemo_runtime::CancellationToken;
use llama_cpp_2::{
context::{params::LlamaContextParams, LlamaContext},
llama_backend::LlamaBackend,
......@@ -30,11 +35,6 @@ use llama_cpp_2::{
sampling::LlamaSampler,
token::LlamaToken,
};
use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed_runtime::pipeline::error as pipeline_error;
use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
use triton_distributed_runtime::protocols::annotated::Annotated;
use triton_distributed_runtime::CancellationToken;
use crate::backend::ExecutionContext;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
......
......@@ -28,10 +28,10 @@ use mistralrs::{
};
use tokio::sync::mpsc::channel;
use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed_runtime::pipeline::error as pipeline_error;
use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
use triton_distributed_runtime::protocols::annotated::Annotated;
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::error as pipeline_error;
use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated;
use crate::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
......
......@@ -17,8 +17,8 @@ use std::path::Path;
use std::sync::Arc;
use crate::backend::ExecutionContext;
use triton_distributed_runtime::pipeline::error as pipeline_error;
use triton_distributed_runtime::CancellationToken;
use dynemo_runtime::pipeline::error as pipeline_error;
use dynemo_runtime::CancellationToken;
mod worker;
......
......@@ -19,10 +19,10 @@ use async_stream::stream;
use async_trait::async_trait;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
use triton_distributed_runtime::protocols::annotated::Annotated;
use triton_distributed_runtime::runtime::CancellationToken;
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated;
use dynemo_runtime::runtime::CancellationToken;
use crate::engines::MultiNodeConfig;
......
......@@ -37,8 +37,8 @@ use tokio::sync::mpsc::Sender;
use tokio::{io::AsyncBufReadExt, sync::mpsc::error::SendError};
use tokio::{io::AsyncReadExt as _, task::JoinHandle};
use triton_distributed_runtime::protocols::annotated::Annotated;
use triton_distributed_runtime::runtime::CancellationToken;
use dynemo_runtime::protocols::annotated::Annotated;
use dynemo_runtime::runtime::CancellationToken;
use crate::engines::sglang::MultiGPUConfig;
use crate::engines::MultiNodeConfig;
......
......@@ -16,7 +16,7 @@
use std::sync::Arc;
use crate::backend::ExecutionContext;
use triton_distributed_runtime::pipeline::error as pipeline_error;
use dynemo_runtime::pipeline::error as pipeline_error;
pub mod executor;
......
......@@ -15,12 +15,12 @@
use anyhow::{Error, Result};
use async_trait::async_trait;
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::{ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated;
use futures::stream;
use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken;
use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed_runtime::pipeline::{ManyOut, SingleIn};
use triton_distributed_runtime::protocols::annotated::Annotated;
use super::Executor;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
......
......@@ -19,8 +19,8 @@ use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use triton_distributed_runtime::pipeline::error as pipeline_error;
use triton_distributed_runtime::CancellationToken;
use dynemo_runtime::pipeline::error as pipeline_error;
use dynemo_runtime::CancellationToken;
use crate::backend::ExecutionContext;
use crate::engines::MultiNodeConfig;
......
......@@ -21,10 +21,10 @@ use async_trait::async_trait;
use crate::engines::vllm::worker;
use crate::engines::MultiNodeConfig;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
use triton_distributed_runtime::protocols::annotated::Annotated;
use triton_distributed_runtime::runtime::CancellationToken;
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated;
use dynemo_runtime::runtime::CancellationToken;
pub struct VllmEngine {
cancel_token: CancellationToken,
......
......@@ -24,7 +24,7 @@ use tokio::select;
use tokio::time;
use tracing;
use triton_distributed_runtime::CancellationToken;
use dynemo_runtime::CancellationToken;
/// Default is 16 seconds, we make it a bit shorter
const RAY_STOP_TIMEOUT_SECS: u32 = 10;
......
......@@ -19,6 +19,8 @@ use std::{
};
use async_zmq::{SinkExt, StreamExt};
use dynemo_runtime::protocols::annotated::Annotated;
use dynemo_runtime::CancellationToken;
use pyo3::{
prelude::*,
types::{IntoPyDict, PyBytes, PyString},
......@@ -26,8 +28,6 @@ use pyo3::{
use tokio::sync::mpsc::Sender;
use tokio::task::JoinHandle;
use tokio::{io::AsyncBufReadExt, sync::mpsc::error::SendError};
use triton_distributed_runtime::protocols::annotated::Annotated;
use triton_distributed_runtime::CancellationToken;
use crate::engines::MultiNodeConfig;
use crate::protocols::common::llm_backend::LLMEngineOutput;
......
......@@ -18,7 +18,7 @@ use std::sync::Arc;
use serde::{Deserialize, Serialize};
use tokio::sync::mpsc::Receiver;
use triton_distributed_runtime::{
use dynemo_runtime::{
protocols::{self, annotated::Annotated},
raise,
transports::etcd::{KeyValue, WatchEvent},
......
......@@ -48,7 +48,7 @@ use crate::types::{
Annotated,
};
use triton_distributed_runtime::pipeline::{AsyncEngineContext, Context};
use dynemo_runtime::pipeline::{AsyncEngineContext, Context};
#[derive(Serialize, Deserialize)]
pub(crate) struct ErrorResponse {
......@@ -91,7 +91,7 @@ impl ErrorResponse {
)
}
/// The OAI endpoints call an [`triton_distributed_runtime::engine::AsyncEngine`] which are specialized to return
/// The OAI endpoints call an [`dynemo_runtime::engine::AsyncEngine`] which are specialized to return
/// an [`anyhow::Error`]. This method will convert the [`anyhow::Error`] into an [`HttpError`].
/// If successful, it will return the [`HttpError`] as an [`ErrorResponse::internal_server_error`]
/// with the details of the error.
......@@ -516,7 +516,7 @@ pub fn list_models_router(
path: Option<String>,
) -> (Vec<RouteDoc>, Router) {
// TODO: Why do we have this endpoint?
let custom_path = path.unwrap_or("/triton/alpha/list-models".to_string());
let custom_path = path.unwrap_or("/dynemo/alpha/list-models".to_string());
let doc_for_custom = RouteDoc::new(axum::http::Method::GET, &custom_path);
// Standard OpenAI compatible list models endpoint
......
......@@ -14,11 +14,11 @@
// limitations under the License.
use anyhow::Result;
use dynemo_runtime::{component::Component, DistributedRuntime};
use futures::stream::StreamExt;
use std::{sync::Arc, time::Duration};
use tokio_util::sync::CancellationToken;
use tracing;
use triton_distributed_runtime::{component::Component, DistributedRuntime};
pub mod indexer;
pub mod protocols;
......@@ -62,7 +62,7 @@ impl KvRouter {
}
pub async fn new(
nats_client: triton_distributed_runtime::transports::nats::Client,
nats_client: dynemo_runtime::transports::nats::Client,
service_name: String,
kv_subject: String,
) -> Result<Arc<Self>> {
......@@ -135,7 +135,7 @@ impl KvRouter {
}
async fn collect_endpoints(
nats_client: triton_distributed_runtime::transports::nats::Client,
nats_client: dynemo_runtime::transports::nats::Client,
service_name: String,
ep_tx: tokio::sync::mpsc::Sender<ProcessedEndpoints>,
cancel: CancellationToken,
......
......@@ -15,11 +15,7 @@
use crate::kv_router::{indexer::RouterEvent, protocols::*, KV_EVENT_SUBJECT};
use async_trait::async_trait;
use futures::stream;
use std::sync::Arc;
use tokio::sync::mpsc;
use tracing as log;
use triton_distributed_runtime::{
use dynemo_runtime::{
component::Component,
pipeline::{
network::Ingress, AsyncEngine, AsyncEngineContextProvider, ManyOut, ResponseStream,
......@@ -28,6 +24,10 @@ use triton_distributed_runtime::{
protocols::annotated::Annotated,
DistributedRuntime, Error, Result,
};
use futures::stream;
use std::sync::Arc;
use tokio::sync::mpsc;
use tracing as log;
pub struct KvEventPublisher {
tx: mpsc::UnboundedSender<KvCacheEvent>,
......
......@@ -19,12 +19,12 @@ pub use crate::kv_router::protocols::ForwardPassMetrics;
use anyhow::Result;
use derive_builder::Builder;
use triton_distributed_runtime::pipeline::network::{
use dynemo_runtime::pipeline::network::{
ingress::push_endpoint::PushEndpoint,
PushWorkHandler,
};
use triton_distributed_runtime::transports::nats::{self, ServiceExt};
use dynemo_runtime::transports::nats::{self, ServiceExt};
use tokio::sync::watch;
use tokio_util::sync::CancellationToken;
......
......@@ -13,9 +13,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! # Triton LLM
//! # Dynemo LLM
//!
//! The `triton-llm` crate is a Rust library that provides a set of traits and types for building
//! The `dynemo-llm` crate is a Rust library that provides a set of traits and types for building
//! distributed LLM inference solutions.
pub mod backend;
......
......@@ -37,7 +37,7 @@ use std::time::Duration;
use derive_builder::Builder;
use triton_distributed_runtime::slug::Slug;
use dynemo_runtime::slug::Slug;
pub const BUCKET_NAME: &str = "mdc";
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment