Commit 1af7433b authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

refactor: rename triton_distributed to dynemo (#22)


Co-authored-by: default avatarGraham King <grahamk@nvidia.com>
parent ee4ef06b
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
**/*.plan **/*.plan
**/.cache/* **/.cache/*
**/*onnx* **/*onnx*
# Engine must be allowed because code contains triton_distributed_engine.py # Engine must be allowed because code contains dynemo_engine.py
**/*tensorrtllm_engines* **/*tensorrtllm_engines*
**/*tensorrtllm_models* **/*tensorrtllm_models*
**/*tensorrtllm_checkpoints* **/*tensorrtllm_checkpoints*
......
...@@ -22,25 +22,6 @@ on: ...@@ -22,25 +22,6 @@ on:
jobs: jobs:
# icp_validation:
# runs-on: ubuntu-latest
# container:
# image: ghcr.io/triton-inference-server/triton3/python_ci:0.1.9
# env:
# BUILD_NUMBER: ${{ github.job }}
# CUDA_VISIBLE_DEVICES: -1
# PATH: /opt/tritonserver/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/ucx/bin:/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/mpi/bin:/usr/local/sbin
# volumes:
# - ${{ github.workspace }}:/workspace
# permissions:
# contents: read
# packages: read
# steps:
# - uses: actions/checkout@v4
# - run: ./icp/protos/gen_python.sh
# - run: pytest --verbose icp
# timeout-minutes: 3
pre-commit: pre-commit:
runs-on: ubuntu-latest runs-on: ubuntu-latest
permissions: permissions:
...@@ -52,41 +33,3 @@ jobs: ...@@ -52,41 +33,3 @@ jobs:
timeout-minutes: 3 timeout-minutes: 3
# providers_validation:
# runs-on: ubuntu-latest
# container:
# image: ghcr.io/triton-inference-server/triton3/python_ci:0.1.9
# env:
# BUILD_NUMBER: ${{ github.job }}
# CUDA_VISIBLE_DEVICES: -1
# PATH: /opt/tritonserver/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/ucx/bin:/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/mpi/bin:/usr/local/sbin
# PROTO_OUT: /python/icp/protos
# volumes:
# - ${{ github.workspace }}:/workspace
# permissions:
# contents: read
# packages: read
# steps:
# - uses: actions/checkout@v4
# - run: pytest --verbose providers
# worker_validation:
# runs-on: ubuntu-latest
# container:
# image: ghcr.io/triton-inference-server/triton3/python_ci:0.1.9
# env:
# BUILD_NUMBER: ${{ github.job }}
# CUDA_VISIBLE_DEVICES: -1
# PATH: /opt/tritonserver/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/ucx/bin:/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/mpi/bin:/usr/local/sbin
# PROTO_OUT: /python/icp/protos
# volumes:
# - ${{ github.workspace }}:/workspace
# permissions:
# contents: read
# packages: read
# steps:
# - uses: actions/checkout@v4
# - run: ./icp/protos/gen_python.sh
# - run: pytest -p no:warnings --verbose worker/python/tests
# timeout-minutes: 2
...@@ -17,7 +17,7 @@ limitations under the License. ...@@ -17,7 +17,7 @@ limitations under the License.
# Open Source License Attribution # Open Source License Attribution
Triton Distributed uses Open Source components. You can find the details of these open-source projects along with license information below. Dynemo uses Open Source components. You can find the details of these open-source projects along with license information below.
We are grateful to the developers for their contributions to open source and acknowledge these below. We are grateful to the developers for their contributions to open source and acknowledge these below.
## nats-py - [Apache License 2.0](https://github.com/nats-io/nats.py/blob/main/LICENSE) ## nats-py - [Apache License 2.0](https://github.com/nats-io/nats.py/blob/main/LICENSE)
......
...@@ -71,7 +71,7 @@ The run script offers a few common workflows: ...@@ -71,7 +71,7 @@ The run script offers a few common workflows:
1. Running a command in a container and exiting. 1. Running a command in a container and exiting.
``` ```
./container/run.sh -- python3 -c "import triton_distributed.runtime; help(triton_distributed.runtime)" ./container/run.sh -- python3 -c "import dynemo.runtime; help(dynemo.runtime)"
``` ```
2. Starting an interactive shell. 2. Starting an interactive shell.
......
...@@ -737,6 +737,8 @@ version = "0.1.0" ...@@ -737,6 +737,8 @@ version = "0.1.0"
dependencies = [ dependencies = [
"axum 0.6.20", "axum 0.6.20",
"clap", "clap",
"dynemo-llm",
"dynemo-runtime",
"opentelemetry", "opentelemetry",
"opentelemetry-prometheus", "opentelemetry-prometheus",
"prometheus", "prometheus",
...@@ -747,8 +749,6 @@ dependencies = [ ...@@ -747,8 +749,6 @@ dependencies = [
"thiserror 1.0.69", "thiserror 1.0.69",
"tokio", "tokio",
"tracing", "tracing",
"triton-distributed-llm",
"triton-distributed-runtime",
] ]
[[package]] [[package]]
...@@ -1024,6 +1024,99 @@ dependencies = [ ...@@ -1024,6 +1024,99 @@ dependencies = [
"syn 2.0.98", "syn 2.0.98",
] ]
[[package]]
name = "dynemo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"dynemo-runtime",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "dynemo-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]] [[package]]
name = "ed25519" name = "ed25519"
version = "2.2.3" version = "2.2.3"
...@@ -4232,99 +4325,6 @@ dependencies = [ ...@@ -4232,99 +4325,6 @@ dependencies = [
"tracing-serde", "tracing-serde",
] ]
[[package]]
name = "triton-distributed-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"triton-distributed-runtime",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "triton-distributed-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]] [[package]]
name = "try-lock" name = "try-lock"
version = "0.2.5" version = "0.2.5"
......
...@@ -21,8 +21,8 @@ license = "Apache-2.0" ...@@ -21,8 +21,8 @@ license = "Apache-2.0"
[dependencies] [dependencies]
# local # local
triton-distributed-runtime = { path = "../../../lib/runtime" } dynemo-runtime = { path = "../../../lib/runtime" }
triton-distributed-llm = { path = "../../../lib/llm" } dynemo-llm = { path = "../../../lib/llm" }
# workspace - todo # workspace - todo
......
...@@ -8,17 +8,17 @@ the services associated with that endpoint, do some postprocessing on them, ...@@ -8,17 +8,17 @@ the services associated with that endpoint, do some postprocessing on them,
and then publish an event with the postprocessed data. and then publish an event with the postprocessed data.
```bash ```bash
# For more details, try TRD_LOG=debug # For more details, try DYN_LOG=debug
TRD_LOG=info cargo run --bin count -- --namespace triton-init --component backend --endpoint generate DYN_LOG=info cargo run --bin count -- --namespace dynemo --component backend --endpoint generate
# 2025-02-26T18:45:05.467026Z INFO count: Creating unique instance of Count at triton-init/components/count/instance # 2025-02-26T18:45:05.467026Z INFO count: Creating unique instance of Count at dynemo/components/count/instance
# 2025-02-26T18:45:05.472146Z INFO count: Scraping service triton_init_backend_720278f8 and filtering on subject triton_init_backend_720278f8.generate # 2025-02-26T18:45:05.472146Z INFO count: Scraping service dynemo_init_backend_720278f8 and filtering on subject dynemo_init_backend_720278f8.generate
# ... # ...
``` ```
With no matching endpoints running, you should see warnings in the logs: With no matching endpoints running, you should see warnings in the logs:
```bash ```bash
2025-02-26T18:45:06.474161Z WARN count: No endpoints found matching subject triton_init_backend_720278f8.generate 2025-02-26T18:45:06.474161Z WARN count: No endpoints found matching subject dynemo_init_backend_720278f8.generate
``` ```
To see metrics published to a matching endpoint, you can use the To see metrics published to a matching endpoint, you can use the
...@@ -35,7 +35,7 @@ since the endpoint will automatically get discovered. ...@@ -35,7 +35,7 @@ since the endpoint will automatically get discovered.
When stats are found from the target endpoints being listened on, count will When stats are found from the target endpoints being listened on, count will
aggregate and publish some metrics as both an event and to a prometheus web server: aggregate and publish some metrics as both an event and to a prometheus web server:
``` ```
2025-02-28T04:05:58.077901Z INFO count: Aggregated metrics: ProcessedEndpoints { endpoints: [Endpoint { name: "worker-7587884888253033398", subject: "triton_init_backend_720278f8.generate-694d951a80e06bb6", data: ForwardPassMetrics { request_active_slots: 58, request_total_slots: 100, kv_active_blocks: 77, kv_total_blocks: 100 } }, Endpoint { name: "worker-7587884888253033401", subject: "triton_init_backend_720278f8.generate-694d951a80e06bb9", data: ForwardPassMetrics { request_active_slots: 71, request_total_slots: 100, kv_active_blocks: 29, kv_total_blocks: 100 } }], worker_ids: [7587884888253033398, 7587884888253033401], load_avg: 53.0, load_std: 24.0 } 2025-02-28T04:05:58.077901Z INFO count: Aggregated metrics: ProcessedEndpoints { endpoints: [Endpoint { name: "worker-7587884888253033398", subject: "dynemo_init_backend_720278f8.generate-694d951a80e06bb6", data: ForwardPassMetrics { request_active_slots: 58, request_total_slots: 100, kv_active_blocks: 77, kv_total_blocks: 100 } }, Endpoint { name: "worker-7587884888253033401", subject: "dynemo_init_backend_720278f8.generate-694d951a80e06bb9", data: ForwardPassMetrics { request_active_slots: 71, request_total_slots: 100, kv_active_blocks: 29, kv_total_blocks: 100 } }], worker_ids: [7587884888253033398, 7587884888253033401], load_avg: 53.0, load_std: 24.0 }
``` ```
To see the metrics being published in prometheus format, you can run: To see the metrics being published in prometheus format, you can run:
......
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use rand::Rng; use dynemo_llm::kv_router::protocols::ForwardPassMetrics;
use std::sync::Arc; use dynemo_runtime::{
use triton_distributed_llm::kv_router::protocols::ForwardPassMetrics;
use triton_distributed_runtime::{
logging, logging,
pipeline::{ pipeline::{
async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
...@@ -25,6 +23,8 @@ use triton_distributed_runtime::{ ...@@ -25,6 +23,8 @@ use triton_distributed_runtime::{
protocols::annotated::Annotated, protocols::annotated::Annotated,
stream, DistributedRuntime, Result, Runtime, Worker, stream, DistributedRuntime, Result, Runtime, Worker,
}; };
use rand::Rng;
use std::sync::Arc;
fn main() -> Result<()> { fn main() -> Result<()> {
logging::init(); logging::init();
...@@ -69,7 +69,7 @@ async fn backend(runtime: DistributedRuntime) -> Result<()> { ...@@ -69,7 +69,7 @@ async fn backend(runtime: DistributedRuntime) -> Result<()> {
// we must first create a service, then we can attach one or more endpoints // we must first create a service, then we can attach one or more endpoints
runtime runtime
.namespace("triton-init")? .namespace("dynemo")?
.component("backend")? .component("backend")?
.service_builder() .service_builder()
.create() .create()
......
...@@ -20,13 +20,11 @@ use prometheus::register_gauge_vec; ...@@ -20,13 +20,11 @@ use prometheus::register_gauge_vec;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::net::SocketAddr; use std::net::SocketAddr;
use triton_distributed_llm::kv_router::protocols::ForwardPassMetrics; use dynemo_llm::kv_router::protocols::ForwardPassMetrics;
use triton_distributed_llm::kv_router::scheduler::Endpoint; use dynemo_llm::kv_router::scheduler::Endpoint;
use triton_distributed_llm::kv_router::scoring::ProcessedEndpoints; use dynemo_llm::kv_router::scoring::ProcessedEndpoints;
use triton_distributed_runtime::{ use dynemo_runtime::{distributed::Component, service::EndpointInfo, utils::Duration, Result};
distributed::Component, service::EndpointInfo, utils::Duration, Result,
};
/// Configuration for LLM worker load capacity metrics /// Configuration for LLM worker load capacity metrics
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
//! - KV Cache Blocks: [Active, Total] //! - KV Cache Blocks: [Active, Total]
use clap::Parser; use clap::Parser;
use triton_distributed_runtime::{ use dynemo_runtime::{
error, logging, error, logging,
traits::events::EventPublisher, traits::events::EventPublisher,
utils::{Duration, Instant}, utils::{Duration, Instant},
...@@ -50,7 +50,7 @@ struct Args { ...@@ -50,7 +50,7 @@ struct Args {
endpoint: String, endpoint: String,
/// Namespace to operate in /// Namespace to operate in
#[arg(long, env = "TRD_NAMESPACE", default_value = "triton-init")] #[arg(long, env = "DYN_NAMESPACE", default_value = "dynemo")]
namespace: String, namespace: String,
/// Polling interval in seconds (minimum 1 second) /// Polling interval in seconds (minimum 1 second)
...@@ -155,7 +155,7 @@ mod tests { ...@@ -155,7 +155,7 @@ mod tests {
#[test] #[test]
fn test_namespace_from_env() { fn test_namespace_from_env() {
env::set_var("TRD_NAMESPACE", "test-namespace"); env::set_var("DYN_NAMESPACE", "test-namespace");
let args = Args::parse_from(["count", "--component", "comp", "--endpoint", "end"]); let args = Args::parse_from(["count", "--component", "comp", "--endpoint", "end"]);
assert_eq!(args.namespace, "test-namespace"); assert_eq!(args.namespace, "test-namespace");
} }
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
ARG BASE_IMAGE="nvcr.io/nvidia/tritonserver" ARG BASE_IMAGE="nvcr.io/nvidia/tritonserver"
ARG BASE_IMAGE_TAG="25.01-py3" ARG BASE_IMAGE_TAG="25.01-py3"
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS triton-distributed FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dynemo
# TODO: non root user by default # TODO: non root user by default
...@@ -34,7 +34,7 @@ RUN rustup toolchain install 1.85.0-x86_64-unknown-linux-gnu ...@@ -34,7 +34,7 @@ RUN rustup toolchain install 1.85.0-x86_64-unknown-linux-gnu
# Install OpenAI-compatible frontend and its dependencies from triton server # Install OpenAI-compatible frontend and its dependencies from triton server
# repository. These are used to have a consistent interface, schema, and FastAPI # repository. These are used to have a consistent interface, schema, and FastAPI
# app between Triton Core and Triton Distributed implementations. # app between Triton Core and Dynemo implementations.
ARG OPENAI_SERVER_TAG="r25.01" ARG OPENAI_SERVER_TAG="r25.01"
RUN mkdir -p /opt/tritonserver/python && \ RUN mkdir -p /opt/tritonserver/python && \
cd /opt/tritonserver/python && \ cd /opt/tritonserver/python && \
...@@ -78,7 +78,7 @@ ARG TENSORRTLLM_SKIP_CLONE= ...@@ -78,7 +78,7 @@ ARG TENSORRTLLM_SKIP_CLONE=
ENV FRAMEWORK=${FRAMEWORK} ENV FRAMEWORK=${FRAMEWORK}
RUN --mount=type=bind,source=./container/deps/requirements.tensorrtllm.txt,target=/tmp/requirements.txt \ RUN --mount=type=bind,source=./container/deps/requirements.tensorrtllm.txt,target=/tmp/requirements.txt \
--mount=type=bind,source=./container/deps/clone_tensorrtllm.sh,target=/tmp/clone_tensorrtllm.sh \ --mount=type=bind,source=./container/deps/clone_tensorrtllm.sh,target=/tmp/clone_tensorrtllm.sh \
if [[ "$FRAMEWORK" == "TENSORRTLLM" ]] ; then pip install --timeout=2000 -r /tmp/requirements.txt; if [ ${TENSORRTLLM_SKIP_CLONE} -ne 1 ] ; then /tmp/clone_tensorrtllm.sh --tensorrtllm-backend-repo-tag ${TENSORRTLLM_BACKEND_REPO_TAG} --tensorrtllm-backend-rebuild ${TENSORRTLLM_BACKEND_REBUILD} --triton-llm-path /opt/triton/llm_binding ; fi ; fi if [[ "$FRAMEWORK" == "TENSORRTLLM" ]] ; then pip install --timeout=2000 -r /tmp/requirements.txt; if [ ${TENSORRTLLM_SKIP_CLONE} -ne 1 ] ; then /tmp/clone_tensorrtllm.sh --tensorrtllm-backend-repo-tag ${TENSORRTLLM_BACKEND_REPO_TAG} --tensorrtllm-backend-rebuild ${TENSORRTLLM_BACKEND_REBUILD} --triton-llm-path /opt/dynemo/llm_binding ; fi ; fi
RUN --mount=type=bind,source=./container/deps/requirements.standard.txt,target=/tmp/requirements.txt \ RUN --mount=type=bind,source=./container/deps/requirements.standard.txt,target=/tmp/requirements.txt \
...@@ -106,7 +106,7 @@ ENV VLLM_GENERATE_WORKERS=${VLLM_FRAMEWORK:+1} ...@@ -106,7 +106,7 @@ ENV VLLM_GENERATE_WORKERS=${VLLM_FRAMEWORK:+1}
ENV VLLM_BASELINE_TP_SIZE=${VLLM_FRAMEWORK:+1} ENV VLLM_BASELINE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_CONTEXT_TP_SIZE=${VLLM_FRAMEWORK:+1} ENV VLLM_CONTEXT_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_GENERATE_TP_SIZE=${VLLM_FRAMEWORK:+1} ENV VLLM_GENERATE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so" ENV VLLM_KV_CAPI_PATH="/opt/dynemo/llm_binding/lib/libdynemo_llm_capi.so"
ENV PYTHONUNBUFFERED=1 ENV PYTHONUNBUFFERED=1
# Install NATS - pointing toward NATS github instead of binaries.nats.dev due to server instability # Install NATS - pointing toward NATS github instead of binaries.nats.dev due to server instability
...@@ -159,27 +159,27 @@ COPY lib/bindings /workspace/lib/bindings ...@@ -159,27 +159,27 @@ COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c/ && \ RUN cd lib/bindings/c/ && \
cargo build --release --locked && cargo doc --no-deps cargo build --release --locked && cargo doc --no-deps
# Install uv, create virtualenv for general use, and build triton_distributed wheel # Install uv, create virtualenv for general use, and build dynemo wheel
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \ RUN mkdir /opt/dynemo && \
uv venv /opt/triton/venv --python 3.12 && \ uv venv /opt/dynemo/venv --python 3.12 && \
source /opt/triton/venv/bin/activate && \ source /opt/dynemo/venv/bin/activate && \
uv build --wheel --out-dir /workspace/dist && \ uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/triton_distributed*cp312*.whl uv pip install /workspace/dist/dynemo*cp312*.whl
# Package the bindings # Package the bindings
RUN mkdir -p /opt/triton/bindings/wheels && \ RUN mkdir -p /opt/dynemo/bindings/wheels && \
mkdir /opt/triton/bindings/lib && \ mkdir /opt/dynemo/bindings/lib && \
cp dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \ cp dist/dynemo*cp312*.whl /opt/dynemo/bindings/wheels/. && \
cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \ cp lib/bindings/c/target/release/libdynemo_llm_capi.so /opt/dynemo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/triton/bindings/. cp -r lib/bindings/c/include /opt/dynemo/bindings/.
# Install triton_distributed_runtime and triton_distributed_llm wheels globally in container for tests that # Install dynemo.runtime and dynemo.llm wheels globally in container for tests that
# currently run without virtual environment activated. # currently run without virtual environment activated.
# TODO: In future, we may use a virtualenv for everything and remove this. # TODO: In future, we may use a virtualenv for everything and remove this.
RUN pip install /opt/triton/bindings/wheels/triton_distributed*cp312*.whl RUN pip install /opt/dynemo/bindings/wheels/dynemo*cp312*.whl
# Copy everything in after install steps to avoid re-running build/install # Copy everything in after install steps to avoid re-running build/install
# commands on unrelated changes in other dirs. # commands on unrelated changes in other dirs.
COPY . /workspace COPY . /workspace
......
...@@ -24,17 +24,17 @@ ENV PATH=/usr/local/bin/etcd/:$PATH ...@@ -24,17 +24,17 @@ ENV PATH=/usr/local/bin/etcd/:$PATH
# Install uv and create virtualenv # Install uv and create virtualenv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \ RUN mkdir /opt/dynemo && \
uv venv /opt/triton/venv --python 3.12 uv venv /opt/dynemo/venv --python 3.12
# Activate virtual environment # Activate virtual environment
ENV VIRTUAL_ENV=/opt/triton/venv ENV VIRTUAL_ENV=/opt/dynemo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Install patched vllm - keep this early in Dockerfile to avoid # Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes # rebuilds from unrelated source code changes
ARG VLLM_REF="v0.7.2" ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}-triton-kv-disagg-patch.patch" ARG VLLM_PATCH="vllm_${VLLM_REF}-dynemo-kv-disagg-patch.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
...@@ -100,25 +100,25 @@ COPY lib/bindings /workspace/lib/bindings ...@@ -100,25 +100,25 @@ COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c && \ RUN cd lib/bindings/c && \
cargo build --release --locked && cargo doc --no-deps cargo build --release --locked && cargo doc --no-deps
# Build triton_distributed wheel # Build dynemo wheel
RUN source /opt/triton/venv/bin/activate && \ RUN source /opt/dynemo/venv/bin/activate && \
uv build --wheel --out-dir /workspace/dist && \ uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/triton_distributed*cp312*.whl uv pip install /workspace/dist/dynemo*cp312*.whl
# Package the bindings # Package the bindings
RUN mkdir -p /opt/triton/bindings/wheels && \ RUN mkdir -p /opt/dynemo/bindings/wheels && \
mkdir /opt/triton/bindings/lib && \ mkdir /opt/dynemo/bindings/lib && \
cp dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \ cp dist/dynemo*cp312*.whl /opt/dynemo/bindings/wheels/. && \
cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \ cp lib/bindings/c/target/release/libdynemo_llm_capi.so /opt/dynemo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/triton/bindings/. cp -r lib/bindings/c/include /opt/dynemo/bindings/.
# Tell vllm to use the Triton LLM C API for KV Cache Routing # Tell vllm to use the Dynemo LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so" ENV VLLM_KV_CAPI_PATH="/opt/dynemo/bindings/lib/libdynemo_llm_capi.so"
# FIXME: Copy more specific folders in for dev/debug after directory restructure # FIXME: Copy more specific folders in for dev/debug after directory restructure
COPY . /workspace COPY . /workspace
# FIXME: May want a modification with triton-distributed banner on entry # FIXME: May want a modification with dynemo-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD [] CMD []
...@@ -136,10 +136,10 @@ RUN apt update -y && \ ...@@ -136,10 +136,10 @@ RUN apt update -y && \
echo "set -g mouse on" >> /root/.tmux.conf echo "set -g mouse on" >> /root/.tmux.conf
# Set environment variables # Set environment variables
ENV VIRTUAL_ENV=/opt/triton/venv ENV VIRTUAL_ENV=/opt/dynemo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so" ENV VLLM_KV_CAPI_PATH="/opt/dynemo/bindings/lib/libdynemo_llm_capi.so"
# Copy binaries # Copy binaries
COPY --from=dev /usr/local/bin/http /usr/local/bin/http COPY --from=dev /usr/local/bin/http /usr/local/bin/http
...@@ -166,7 +166,7 @@ COPY examples/python_rs/llm/vllm /workspace/examples/python_rs/llm/vllm ...@@ -166,7 +166,7 @@ COPY examples/python_rs/llm/vllm /workspace/examples/python_rs/llm/vllm
WORKDIR /workspace WORKDIR /workspace
# FIXME: May want a modification with triton-distributed banner on entry # FIXME: May want a modification with dynemo-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD [] CMD []
...@@ -150,17 +150,17 @@ ENV PATH=/usr/local/bin/etcd/:$PATH ...@@ -150,17 +150,17 @@ ENV PATH=/usr/local/bin/etcd/:$PATH
# Install uv and create virtualenv # Install uv and create virtualenv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \ RUN mkdir /opt/dynemo && \
uv venv /opt/triton/venv --python 3.12 uv venv /opt/dynemo/venv --python 3.12
# Activate virtual environment # Activate virtual environment
ENV VIRTUAL_ENV=/opt/triton/venv ENV VIRTUAL_ENV=/opt/dynemo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Install patched vllm - keep this early in Dockerfile to avoid # Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes # rebuilds from unrelated source code changes
ARG VLLM_REF="v0.7.2" ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}-triton-kv-disagg-patch.patch" ARG VLLM_PATCH="vllm_${VLLM_REF}-dynemo-kv-disagg-patch.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
...@@ -225,25 +225,25 @@ COPY lib/bindings /workspace/lib/bindings ...@@ -225,25 +225,25 @@ COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c && \ RUN cd lib/bindings/c && \
cargo build --release --locked && cargo doc --no-deps cargo build --release --locked && cargo doc --no-deps
# Build triton_distributed wheel # Build dynemo wheel
RUN source /opt/triton/venv/bin/activate && \ RUN source /opt/dynemo/venv/bin/activate && \
uv build --wheel --out-dir /workspace/dist && \ uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/triton_distributed*cp312*.whl uv pip install /workspace/dist/dynemo*cp312*.whl
# Package the bindings # Package the bindings
RUN mkdir -p /opt/triton/bindings/wheels && \ RUN mkdir -p /opt/dynemo/bindings/wheels && \
mkdir /opt/triton/bindings/lib && \ mkdir /opt/dynemo/bindings/lib && \
cp dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \ cp dist/dynemo*cp312*.whl /opt/dynemo/bindings/wheels/. && \
cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \ cp lib/bindings/c/target/release/libdynemo_llm_capi.so /opt/dynemo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/triton/bindings/. cp -r lib/bindings/c/include /opt/dynemo/bindings/.
# Tell vllm to use the Triton LLM C API for KV Cache Routing # Tell vllm to use the Dynemo LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so" ENV VLLM_KV_CAPI_PATH="/opt/dynemo/bindings/lib/libdynemo_llm_capi.so"
# FIXME: Copy more specific folders in for dev/debug after directory restructure # FIXME: Copy more specific folders in for dev/debug after directory restructure
COPY . /workspace COPY . /workspace
# FIXME: May want a modification with triton-distributed banner on entry # FIXME: May want a modification with dynemo-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD [] CMD []
...@@ -261,10 +261,10 @@ RUN apt update -y && \ ...@@ -261,10 +261,10 @@ RUN apt update -y && \
echo "set -g mouse on" >> /root/.tmux.conf echo "set -g mouse on" >> /root/.tmux.conf
# Set environment variables # Set environment variables
ENV VIRTUAL_ENV=/opt/triton/venv ENV VIRTUAL_ENV=/opt/dynemo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so" ENV VLLM_KV_CAPI_PATH="/opt/dynemo/bindings/lib/libdynemo_llm_capi.so"
# Copy binaries # Copy binaries
COPY --from=dev /usr/local/bin/http /usr/local/bin/http COPY --from=dev /usr/local/bin/http /usr/local/bin/http
...@@ -291,7 +291,7 @@ COPY examples/python_rs/llm/vllm_nixl /workspace/examples/python_rs/llm/vllm_nix ...@@ -291,7 +291,7 @@ COPY examples/python_rs/llm/vllm_nixl /workspace/examples/python_rs/llm/vllm_nix
WORKDIR /workspace WORKDIR /workspace
# FIXME: May want a modification with triton-distributed banner on entry # FIXME: May want a modification with dynemo-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD [] CMD []
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
TENSORRTLLM_BACKEND_REPO_TAG= TENSORRTLLM_BACKEND_REPO_TAG=
TENSORRTLLM_BACKEND_REBUILD= TENSORRTLLM_BACKEND_REBUILD=
TRITON_LLM_PATH= DYNEMO_LLM_PATH=
GIT_TOKEN= GIT_TOKEN=
GIT_REPO= GIT_REPO=
...@@ -43,9 +43,9 @@ get_options() { ...@@ -43,9 +43,9 @@ get_options() {
missing_requirement $1 missing_requirement $1
fi fi
;; ;;
--triton-llm-path) --dynemo-llm-path)
if [ "$2" ]; then if [ "$2" ]; then
TRITON_LLM_PATH=$2 DYNEMO_LLM_PATH=$2
shift shift
else else
missing_requirement $1 missing_requirement $1
...@@ -147,9 +147,9 @@ if [ ! -z ${TENSORRTLLM_BACKEND_REBUILD} ]; then ...@@ -147,9 +147,9 @@ if [ ! -z ${TENSORRTLLM_BACKEND_REBUILD} ]; then
# Build the backend # Build the backend
(cd inflight_batcher_llm/src \ (cd inflight_batcher_llm/src \
&& cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DUSE_CXX11_ABI=1 -DTRITON_LLM_PATH=$TRITON_LLM_PATH .. \ && cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DUSE_CXX11_ABI=1 -DDYNEMO_LLM_PATH=$DYNEMO_LLM_PATH .. \
&& make install \ && make install \
&& cp libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm/ \ && cp libdynemo_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm/ \
&& cp trtllmExecutorWorker /opt/tritonserver/backends/tensorrtllm/ \ && cp trtllmExecutorWorker /opt/tritonserver/backends/tensorrtllm/ \
) )
fi fi
......
...@@ -26,7 +26,8 @@ import typing as t ...@@ -26,7 +26,8 @@ import typing as t
from typing import Any from typing import Any
import click import click
from triton_distributed_rs import DistributedRuntime, triton_endpoint, triton_worker
from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker
logger = logging.getLogger("compoundai.serve.nova") logger = logging.getLogger("compoundai.serve.nova")
...@@ -102,7 +103,7 @@ def main( ...@@ -102,7 +103,7 @@ def main(
server_context.worker_index = worker_id server_context.worker_index = worker_id
class_instance = service.inner() class_instance = service.inner()
@triton_worker() @dynemo_worker()
async def worker(runtime: DistributedRuntime): async def worker(runtime: DistributedRuntime):
if service_name and service_name != service.name: if service_name and service_name != service.name:
server_context.service_type = "service" server_context.service_type = "service"
...@@ -157,12 +158,12 @@ def main( ...@@ -157,12 +158,12 @@ def main(
# Bind an instance of inner to the endpoint # Bind an instance of inner to the endpoint
bound_method = endpoint.func.__get__(class_instance) bound_method = endpoint.func.__get__(class_instance)
# Only pass request type for now, use Any for response # Only pass request type for now, use Any for response
# TODO: Handle a triton_endpoint not having types # TODO: Handle a dynemo_endpoint not having types
# TODO: Handle multiple endpoints in a single component # TODO: Handle multiple endpoints in a single component
triton_wrapped_method = triton_endpoint(endpoint.request_type, Any)( dynemo_wrapped_method = dynemo_endpoint(endpoint.request_type, Any)(
bound_method bound_method
) )
result = await td_endpoint.serve_endpoint(triton_wrapped_method) result = await td_endpoint.serve_endpoint(dynemo_wrapped_method)
# WARNING: unreachable code :( because serve blocks # WARNING: unreachable code :( because serve blocks
logger.info(f"[{run_id}] Result: {result}") logger.info(f"[{run_id}] Result: {result}")
logger.info(f"[{run_id}] Registered endpoint '{name}'") logger.info(f"[{run_id}] Registered endpoint '{name}'")
......
...@@ -50,7 +50,7 @@ class NovaEndpoint: ...@@ -50,7 +50,7 @@ class NovaEndpoint:
if isinstance(args[1], (str, dict)): if isinstance(args[1], (str, dict)):
args[1] = self.request_type.parse_obj(args[1]) # type: ignore args[1] = self.request_type.parse_obj(args[1]) # type: ignore
# Convert Pydantic model to dict before passing to triton # Convert Pydantic model to dict before passing to dynemo
if len(args) > 1 and isinstance(args[1], BaseModel): if len(args) > 1 and isinstance(args[1], BaseModel):
args = list(args) # type: ignore args = list(args) # type: ignore
args[1] = args[1].model_dump() # type: ignore args[1] = args[1].model_dump() # type: ignore
......
...@@ -72,9 +72,9 @@ class NovaClient: ...@@ -72,9 +72,9 @@ class NovaClient:
else: else:
# Create nova worker if no runtime # Create nova worker if no runtime
from triton_distributed_rs import DistributedRuntime, triton_worker from dynemo.runtime import DistributedRuntime, dynemo_worker
@triton_worker() @dynemo_worker()
async def stream_worker(runtime: DistributedRuntime): async def stream_worker(runtime: DistributedRuntime):
try: try:
# Store runtime for future use # Store runtime for future use
......
...@@ -90,14 +90,14 @@ Note: NATS and ETCD servers should be running and accessible from the container ...@@ -90,14 +90,14 @@ Note: NATS and ETCD servers should be running and accessible from the container
Run the server logging (with debug level logging): Run the server logging (with debug level logging):
```bash ```bash
TRD_LOG=DEBUG http & DYN_LOG=DEBUG http &
``` ```
By default the server will run on port 8080. By default the server will run on port 8080.
Add model to the server: Add model to the server:
```bash ```bash
llmctl http add chat TinyLlama/TinyLlama-1.1B-Chat-v1.0 triton-init.tensorrt-llm.chat/completions llmctl http add chat TinyLlama/TinyLlama-1.1B-Chat-v1.0 dynemo.tensorrt-llm.chat/completions
llmctl http add completion TinyLlama/TinyLlama-1.1B-Chat-v1.0 triton-init.tensorrt-llm.completions llmctl http add completion TinyLlama/TinyLlama-1.1B-Chat-v1.0 dynemo.tensorrt-llm.completions
``` ```
#### 2. Workers #### 2. Workers
...@@ -214,14 +214,14 @@ Run the container interactively with the following command: ...@@ -214,14 +214,14 @@ Run the container interactively with the following command:
Run the server logging (with debug level logging): Run the server logging (with debug level logging):
```bash ```bash
TRD_LOG=DEBUG http & DYN_LOG=DEBUG http &
``` ```
By default the server will run on port 8080. By default the server will run on port 8080.
Add model to the server: Add model to the server:
```bash ```bash
llmctl http add chat TinyLlama/TinyLlama-1.1B-Chat-v1.0 triton-init.router.chat/completions llmctl http add chat TinyLlama/TinyLlama-1.1B-Chat-v1.0 dynemo.router.chat/completions
llmctl http add completion TinyLlama/TinyLlama-1.1B-Chat-v1.0 triton-init.router.completions llmctl http add completion TinyLlama/TinyLlama-1.1B-Chat-v1.0 dynemo.router.completions
``` ```
#### 2. Workers #### 2. Workers
......
...@@ -19,12 +19,12 @@ import asyncio ...@@ -19,12 +19,12 @@ import asyncio
import uvloop import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker from dynemo.runtime import DistributedRuntime, dynemo_worker
from .protocol import Request from .protocol import Request
@triton_worker() @dynemo_worker()
async def worker( async def worker(
runtime: DistributedRuntime, runtime: DistributedRuntime,
component: str, component: str,
...@@ -38,7 +38,7 @@ async def worker( ...@@ -38,7 +38,7 @@ async def worker(
""" """
# create client # create client
client = ( client = (
await runtime.namespace("triton-init") await runtime.namespace("dynemo")
.component(component) .component(component)
.endpoint("generate") .endpoint("generate")
.client() .client()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment