Commit 1af7433b authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

refactor: rename triton_distributed to dynemo (#22)


Co-authored-by: Graham King <grahamk@nvidia.com>
parent ee4ef06b
......@@ -19,7 +19,7 @@
**/*.plan
**/.cache/*
**/*onnx*
# Engine must be allowed because code contains triton_distributed_engine.py
# Engine must be allowed because code contains dynemo_engine.py
**/*tensorrtllm_engines*
**/*tensorrtllm_models*
**/*tensorrtllm_checkpoints*
......
......@@ -22,25 +22,6 @@ on:
jobs:
# icp_validation:
# runs-on: ubuntu-latest
# container:
# image: ghcr.io/triton-inference-server/triton3/python_ci:0.1.9
# env:
# BUILD_NUMBER: ${{ github.job }}
# CUDA_VISIBLE_DEVICES: -1
# PATH: /opt/tritonserver/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/ucx/bin:/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/mpi/bin:/usr/local/sbin
# volumes:
# - ${{ github.workspace }}:/workspace
# permissions:
# contents: read
# packages: read
# steps:
# - uses: actions/checkout@v4
# - run: ./icp/protos/gen_python.sh
# - run: pytest --verbose icp
# timeout-minutes: 3
pre-commit:
runs-on: ubuntu-latest
permissions:
......@@ -52,41 +33,3 @@ jobs:
timeout-minutes: 3
# providers_validation:
# runs-on: ubuntu-latest
# container:
# image: ghcr.io/triton-inference-server/triton3/python_ci:0.1.9
# env:
# BUILD_NUMBER: ${{ github.job }}
# CUDA_VISIBLE_DEVICES: -1
# PATH: /opt/tritonserver/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/ucx/bin:/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/mpi/bin:/usr/local/sbin
# PROTO_OUT: /python/icp/protos
# volumes:
# - ${{ github.workspace }}:/workspace
# permissions:
# contents: read
# packages: read
# steps:
# - uses: actions/checkout@v4
# - run: pytest --verbose providers
# worker_validation:
# runs-on: ubuntu-latest
# container:
# image: ghcr.io/triton-inference-server/triton3/python_ci:0.1.9
# env:
# BUILD_NUMBER: ${{ github.job }}
# CUDA_VISIBLE_DEVICES: -1
# PATH: /opt/tritonserver/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/ucx/bin:/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/mpi/bin:/usr/local/sbin
# PROTO_OUT: /python/icp/protos
# volumes:
# - ${{ github.workspace }}:/workspace
# permissions:
# contents: read
# packages: read
# steps:
# - uses: actions/checkout@v4
# - run: ./icp/protos/gen_python.sh
# - run: pytest -p no:warnings --verbose worker/python/tests
# timeout-minutes: 2
......@@ -17,7 +17,7 @@ limitations under the License.
# Open Source License Attribution
Triton Distributed uses Open Source components. You can find the details of these open-source projects along with license information below.
Dynemo uses Open Source components. You can find the details of these open-source projects along with license information below.
We are grateful to the developers for their contributions to open source and acknowledge these below.
## nats-py - [Apache License 2.0](https://github.com/nats-io/nats.py/blob/main/LICENSE)
......
......@@ -71,7 +71,7 @@ The run script offers a few common workflows:
1. Running a command in a container and exiting.
```
./container/run.sh -- python3 -c "import triton_distributed.runtime; help(triton_distributed.runtime)"
./container/run.sh -- python3 -c "import dynemo.runtime; help(dynemo.runtime)"
```
2. Starting an interactive shell.
......
......@@ -737,6 +737,8 @@ version = "0.1.0"
dependencies = [
"axum 0.6.20",
"clap",
"dynemo-llm",
"dynemo-runtime",
"opentelemetry",
"opentelemetry-prometheus",
"prometheus",
......@@ -747,8 +749,6 @@ dependencies = [
"thiserror 1.0.69",
"tokio",
"tracing",
"triton-distributed-llm",
"triton-distributed-runtime",
]
[[package]]
......@@ -1024,6 +1024,99 @@ dependencies = [
"syn 2.0.98",
]
[[package]]
name = "dynemo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"dynemo-runtime",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "dynemo-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "ed25519"
version = "2.2.3"
......@@ -4232,99 +4325,6 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "triton-distributed-llm"
version = "0.2.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"axum 0.8.1",
"bindgen",
"blake3",
"bs62",
"bytes",
"chrono",
"cmake",
"derive_builder",
"either",
"erased-serde",
"futures",
"galil-seiferas",
"indexmap 2.7.1",
"itertools 0.14.0",
"libc",
"minijinja",
"minijinja-contrib",
"prometheus",
"pyo3",
"regex",
"semver",
"serde",
"serde-pickle",
"serde_json",
"serde_repr",
"strum",
"thiserror 2.0.11",
"tokenizers",
"tokio",
"tokio-stream",
"tokio-util",
"toktrie",
"toktrie_hf_tokenizers",
"tracing",
"triton-distributed-runtime",
"unicode-segmentation",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "triton-distributed-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
"async-nats",
"async-once-cell",
"async-stream",
"async-trait",
"async_zmq",
"blake3",
"bytes",
"chrono",
"derive-getters",
"derive_builder",
"educe",
"either",
"etcd-client",
"figment",
"futures",
"humantime",
"local-ip-address",
"log",
"nid",
"nix",
"nuid",
"once_cell",
"prometheus",
"rand",
"regex",
"serde",
"serde_json",
"socket2",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid",
"validator",
"xxhash-rust",
]
[[package]]
name = "try-lock"
version = "0.2.5"
......
......@@ -21,8 +21,8 @@ license = "Apache-2.0"
[dependencies]
# local
triton-distributed-runtime = { path = "../../../lib/runtime" }
triton-distributed-llm = { path = "../../../lib/llm" }
dynemo-runtime = { path = "../../../lib/runtime" }
dynemo-llm = { path = "../../../lib/llm" }
# workspace - todo
......
......@@ -8,17 +8,17 @@ the services associated with that endpoint, do some postprocessing on them,
and then publish an event with the postprocessed data.
```bash
# For more details, try TRD_LOG=debug
TRD_LOG=info cargo run --bin count -- --namespace triton-init --component backend --endpoint generate
# For more details, try DYN_LOG=debug
DYN_LOG=info cargo run --bin count -- --namespace dynemo --component backend --endpoint generate
# 2025-02-26T18:45:05.467026Z INFO count: Creating unique instance of Count at triton-init/components/count/instance
# 2025-02-26T18:45:05.472146Z INFO count: Scraping service triton_init_backend_720278f8 and filtering on subject triton_init_backend_720278f8.generate
# 2025-02-26T18:45:05.467026Z INFO count: Creating unique instance of Count at dynemo/components/count/instance
# 2025-02-26T18:45:05.472146Z INFO count: Scraping service dynemo_init_backend_720278f8 and filtering on subject dynemo_init_backend_720278f8.generate
# ...
```
With no matching endpoints running, you should see warnings in the logs:
```bash
2025-02-26T18:45:06.474161Z WARN count: No endpoints found matching subject triton_init_backend_720278f8.generate
2025-02-26T18:45:06.474161Z WARN count: No endpoints found matching subject dynemo_init_backend_720278f8.generate
```
To see metrics published to a matching endpoint, you can use the
......@@ -35,7 +35,7 @@ since the endpoint will automatically get discovered.
When stats are found from the target endpoints being listened on, count will
aggregate and publish some metrics as both an event and to a prometheus web server:
```
2025-02-28T04:05:58.077901Z INFO count: Aggregated metrics: ProcessedEndpoints { endpoints: [Endpoint { name: "worker-7587884888253033398", subject: "triton_init_backend_720278f8.generate-694d951a80e06bb6", data: ForwardPassMetrics { request_active_slots: 58, request_total_slots: 100, kv_active_blocks: 77, kv_total_blocks: 100 } }, Endpoint { name: "worker-7587884888253033401", subject: "triton_init_backend_720278f8.generate-694d951a80e06bb9", data: ForwardPassMetrics { request_active_slots: 71, request_total_slots: 100, kv_active_blocks: 29, kv_total_blocks: 100 } }], worker_ids: [7587884888253033398, 7587884888253033401], load_avg: 53.0, load_std: 24.0 }
2025-02-28T04:05:58.077901Z INFO count: Aggregated metrics: ProcessedEndpoints { endpoints: [Endpoint { name: "worker-7587884888253033398", subject: "dynemo_init_backend_720278f8.generate-694d951a80e06bb6", data: ForwardPassMetrics { request_active_slots: 58, request_total_slots: 100, kv_active_blocks: 77, kv_total_blocks: 100 } }, Endpoint { name: "worker-7587884888253033401", subject: "dynemo_init_backend_720278f8.generate-694d951a80e06bb9", data: ForwardPassMetrics { request_active_slots: 71, request_total_slots: 100, kv_active_blocks: 29, kv_total_blocks: 100 } }], worker_ids: [7587884888253033398, 7587884888253033401], load_avg: 53.0, load_std: 24.0 }
```
To see the metrics being published in prometheus format, you can run:
......
......@@ -13,10 +13,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use rand::Rng;
use std::sync::Arc;
use triton_distributed_llm::kv_router::protocols::ForwardPassMetrics;
use triton_distributed_runtime::{
use dynemo_llm::kv_router::protocols::ForwardPassMetrics;
use dynemo_runtime::{
logging,
pipeline::{
async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
......@@ -25,6 +23,8 @@ use triton_distributed_runtime::{
protocols::annotated::Annotated,
stream, DistributedRuntime, Result, Runtime, Worker,
};
use rand::Rng;
use std::sync::Arc;
fn main() -> Result<()> {
logging::init();
......@@ -69,7 +69,7 @@ async fn backend(runtime: DistributedRuntime) -> Result<()> {
// we must first create a service, then we can attach one or more endpoints
runtime
.namespace("triton-init")?
.namespace("dynemo")?
.component("backend")?
.service_builder()
.create()
......
......@@ -20,13 +20,11 @@ use prometheus::register_gauge_vec;
use serde::{Deserialize, Serialize};
use std::net::SocketAddr;
use triton_distributed_llm::kv_router::protocols::ForwardPassMetrics;
use triton_distributed_llm::kv_router::scheduler::Endpoint;
use triton_distributed_llm::kv_router::scoring::ProcessedEndpoints;
use dynemo_llm::kv_router::protocols::ForwardPassMetrics;
use dynemo_llm::kv_router::scheduler::Endpoint;
use dynemo_llm::kv_router::scoring::ProcessedEndpoints;
use triton_distributed_runtime::{
distributed::Component, service::EndpointInfo, utils::Duration, Result,
};
use dynemo_runtime::{distributed::Component, service::EndpointInfo, utils::Duration, Result};
/// Configuration for LLM worker load capacity metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
......
......@@ -24,7 +24,7 @@
//! - KV Cache Blocks: [Active, Total]
use clap::Parser;
use triton_distributed_runtime::{
use dynemo_runtime::{
error, logging,
traits::events::EventPublisher,
utils::{Duration, Instant},
......@@ -50,7 +50,7 @@ struct Args {
endpoint: String,
/// Namespace to operate in
#[arg(long, env = "TRD_NAMESPACE", default_value = "triton-init")]
#[arg(long, env = "DYN_NAMESPACE", default_value = "dynemo")]
namespace: String,
/// Polling interval in seconds (minimum 1 second)
......@@ -155,7 +155,7 @@ mod tests {
#[test]
fn test_namespace_from_env() {
env::set_var("TRD_NAMESPACE", "test-namespace");
env::set_var("DYN_NAMESPACE", "test-namespace");
let args = Args::parse_from(["count", "--component", "comp", "--endpoint", "end"]);
assert_eq!(args.namespace, "test-namespace");
}
......
......@@ -16,7 +16,7 @@
ARG BASE_IMAGE="nvcr.io/nvidia/tritonserver"
ARG BASE_IMAGE_TAG="25.01-py3"
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS triton-distributed
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dynemo
# TODO: non root user by default
......@@ -34,7 +34,7 @@ RUN rustup toolchain install 1.85.0-x86_64-unknown-linux-gnu
# Install OpenAI-compatible frontend and its dependencies from triton server
# repository. These are used to have a consistent interface, schema, and FastAPI
# app between Triton Core and Triton Distributed implementations.
# app between Triton Core and Dynemo implementations.
ARG OPENAI_SERVER_TAG="r25.01"
RUN mkdir -p /opt/tritonserver/python && \
cd /opt/tritonserver/python && \
......@@ -78,7 +78,7 @@ ARG TENSORRTLLM_SKIP_CLONE=
ENV FRAMEWORK=${FRAMEWORK}
RUN --mount=type=bind,source=./container/deps/requirements.tensorrtllm.txt,target=/tmp/requirements.txt \
--mount=type=bind,source=./container/deps/clone_tensorrtllm.sh,target=/tmp/clone_tensorrtllm.sh \
if [[ "$FRAMEWORK" == "TENSORRTLLM" ]] ; then pip install --timeout=2000 -r /tmp/requirements.txt; if [ ${TENSORRTLLM_SKIP_CLONE} -ne 1 ] ; then /tmp/clone_tensorrtllm.sh --tensorrtllm-backend-repo-tag ${TENSORRTLLM_BACKEND_REPO_TAG} --tensorrtllm-backend-rebuild ${TENSORRTLLM_BACKEND_REBUILD} --triton-llm-path /opt/triton/llm_binding ; fi ; fi
if [[ "$FRAMEWORK" == "TENSORRTLLM" ]] ; then pip install --timeout=2000 -r /tmp/requirements.txt; if [ ${TENSORRTLLM_SKIP_CLONE} -ne 1 ] ; then /tmp/clone_tensorrtllm.sh --tensorrtllm-backend-repo-tag ${TENSORRTLLM_BACKEND_REPO_TAG} --tensorrtllm-backend-rebuild ${TENSORRTLLM_BACKEND_REBUILD} --triton-llm-path /opt/dynemo/llm_binding ; fi ; fi
RUN --mount=type=bind,source=./container/deps/requirements.standard.txt,target=/tmp/requirements.txt \
......@@ -106,7 +106,7 @@ ENV VLLM_GENERATE_WORKERS=${VLLM_FRAMEWORK:+1}
ENV VLLM_BASELINE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_CONTEXT_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_GENERATE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
ENV VLLM_KV_CAPI_PATH="/opt/dynemo/llm_binding/lib/libdynemo_llm_capi.so"
ENV PYTHONUNBUFFERED=1
# Install NATS - pointing toward NATS github instead of binaries.nats.dev due to server instability
......@@ -159,27 +159,27 @@ COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c/ && \
cargo build --release --locked && cargo doc --no-deps
# Install uv, create virtualenv for general use, and build triton_distributed wheel
# Install uv, create virtualenv for general use, and build dynemo wheel
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \
uv venv /opt/triton/venv --python 3.12 && \
source /opt/triton/venv/bin/activate && \
RUN mkdir /opt/dynemo && \
uv venv /opt/dynemo/venv --python 3.12 && \
source /opt/dynemo/venv/bin/activate && \
uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/triton_distributed*cp312*.whl
uv pip install /workspace/dist/dynemo*cp312*.whl
# Package the bindings
RUN mkdir -p /opt/triton/bindings/wheels && \
mkdir /opt/triton/bindings/lib && \
cp dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \
cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/triton/bindings/.
RUN mkdir -p /opt/dynemo/bindings/wheels && \
mkdir /opt/dynemo/bindings/lib && \
cp dist/dynemo*cp312*.whl /opt/dynemo/bindings/wheels/. && \
cp lib/bindings/c/target/release/libdynemo_llm_capi.so /opt/dynemo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/dynemo/bindings/.
# Install triton_distributed_runtime and triton_distributed_llm wheels globally in container for tests that
# Install dynemo.runtime and dynemo.llm wheels globally in container for tests that
# currently run without virtual environment activated.
# TODO: In future, we may use a virtualenv for everything and remove this.
RUN pip install /opt/triton/bindings/wheels/triton_distributed*cp312*.whl
RUN pip install /opt/dynemo/bindings/wheels/dynemo*cp312*.whl
# Copy everything in after install steps to avoid re-running build/install
# Copy everything in after install steps to avoid re-running build/install
# commands on unrelated changes in other dirs.
COPY . /workspace
......
......@@ -24,17 +24,17 @@ ENV PATH=/usr/local/bin/etcd/:$PATH
# Install uv and create virtualenv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \
uv venv /opt/triton/venv --python 3.12
RUN mkdir /opt/dynemo && \
uv venv /opt/dynemo/venv --python 3.12
# Activate virtual environment
ENV VIRTUAL_ENV=/opt/triton/venv
ENV VIRTUAL_ENV=/opt/dynemo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}-triton-kv-disagg-patch.patch"
ARG VLLM_PATCH="vllm_${VLLM_REF}-dynemo-kv-disagg-patch.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
......@@ -100,25 +100,25 @@ COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c && \
cargo build --release --locked && cargo doc --no-deps
# Build triton_distributed wheel
RUN source /opt/triton/venv/bin/activate && \
# Build dynemo wheel
RUN source /opt/dynemo/venv/bin/activate && \
uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/triton_distributed*cp312*.whl
uv pip install /workspace/dist/dynemo*cp312*.whl
# Package the bindings
RUN mkdir -p /opt/triton/bindings/wheels && \
mkdir /opt/triton/bindings/lib && \
cp dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \
cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/triton/bindings/.
RUN mkdir -p /opt/dynemo/bindings/wheels && \
mkdir /opt/dynemo/bindings/lib && \
cp dist/dynemo*cp312*.whl /opt/dynemo/bindings/wheels/. && \
cp lib/bindings/c/target/release/libdynemo_llm_capi.so /opt/dynemo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/dynemo/bindings/.
# Tell vllm to use the Triton LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so"
# Tell vllm to use the Dynemo LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/dynemo/bindings/lib/libdynemo_llm_capi.so"
# FIXME: Copy more specific folders in for dev/debug after directory restructure
COPY . /workspace
# FIXME: May want a modification with triton-distributed banner on entry
# FIXME: May want a modification with dynemo-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
......@@ -136,10 +136,10 @@ RUN apt update -y && \
echo "set -g mouse on" >> /root/.tmux.conf
# Set environment variables
ENV VIRTUAL_ENV=/opt/triton/venv
ENV VIRTUAL_ENV=/opt/dynemo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so"
ENV VLLM_KV_CAPI_PATH="/opt/dynemo/bindings/lib/libdynemo_llm_capi.so"
# Copy binaries
COPY --from=dev /usr/local/bin/http /usr/local/bin/http
......@@ -166,7 +166,7 @@ COPY examples/python_rs/llm/vllm /workspace/examples/python_rs/llm/vllm
WORKDIR /workspace
# FIXME: May want a modification with triton-distributed banner on entry
# FIXME: May want a modification with dynemo-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
......@@ -150,17 +150,17 @@ ENV PATH=/usr/local/bin/etcd/:$PATH
# Install uv and create virtualenv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \
uv venv /opt/triton/venv --python 3.12
RUN mkdir /opt/dynemo && \
uv venv /opt/dynemo/venv --python 3.12
# Activate virtual environment
ENV VIRTUAL_ENV=/opt/triton/venv
ENV VIRTUAL_ENV=/opt/dynemo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}-triton-kv-disagg-patch.patch"
ARG VLLM_PATCH="vllm_${VLLM_REF}-dynemo-kv-disagg-patch.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
......@@ -225,25 +225,25 @@ COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c && \
cargo build --release --locked && cargo doc --no-deps
# Build triton_distributed wheel
RUN source /opt/triton/venv/bin/activate && \
# Build dynemo wheel
RUN source /opt/dynemo/venv/bin/activate && \
uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/triton_distributed*cp312*.whl
uv pip install /workspace/dist/dynemo*cp312*.whl
# Package the bindings
RUN mkdir -p /opt/triton/bindings/wheels && \
mkdir /opt/triton/bindings/lib && \
cp dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \
cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/triton/bindings/.
RUN mkdir -p /opt/dynemo/bindings/wheels && \
mkdir /opt/dynemo/bindings/lib && \
cp dist/dynemo*cp312*.whl /opt/dynemo/bindings/wheels/. && \
cp lib/bindings/c/target/release/libdynemo_llm_capi.so /opt/dynemo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/dynemo/bindings/.
# Tell vllm to use the Triton LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so"
# Tell vllm to use the Dynemo LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/dynemo/bindings/lib/libdynemo_llm_capi.so"
# FIXME: Copy more specific folders in for dev/debug after directory restructure
COPY . /workspace
# FIXME: May want a modification with triton-distributed banner on entry
# FIXME: May want a modification with dynemo-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
......@@ -261,10 +261,10 @@ RUN apt update -y && \
echo "set -g mouse on" >> /root/.tmux.conf
# Set environment variables
ENV VIRTUAL_ENV=/opt/triton/venv
ENV VIRTUAL_ENV=/opt/dynemo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
ENV VLLM_KV_CAPI_PATH="/opt/triton/bindings/lib/libtriton_distributed_llm_capi.so"
ENV VLLM_KV_CAPI_PATH="/opt/dynemo/bindings/lib/libdynemo_llm_capi.so"
# Copy binaries
COPY --from=dev /usr/local/bin/http /usr/local/bin/http
......@@ -291,7 +291,7 @@ COPY examples/python_rs/llm/vllm_nixl /workspace/examples/python_rs/llm/vllm_nix
WORKDIR /workspace
# FIXME: May want a modification with triton-distributed banner on entry
# FIXME: May want a modification with dynemo-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
......@@ -16,7 +16,7 @@
TENSORRTLLM_BACKEND_REPO_TAG=
TENSORRTLLM_BACKEND_REBUILD=
TRITON_LLM_PATH=
DYNEMO_LLM_PATH=
GIT_TOKEN=
GIT_REPO=
......@@ -43,9 +43,9 @@ get_options() {
missing_requirement $1
fi
;;
--triton-llm-path)
--dynemo-llm-path)
if [ "$2" ]; then
TRITON_LLM_PATH=$2
DYNEMO_LLM_PATH=$2
shift
else
missing_requirement $1
......@@ -147,9 +147,9 @@ if [ ! -z ${TENSORRTLLM_BACKEND_REBUILD} ]; then
# Build the backend
(cd inflight_batcher_llm/src \
&& cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DUSE_CXX11_ABI=1 -DTRITON_LLM_PATH=$TRITON_LLM_PATH .. \
&& cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DUSE_CXX11_ABI=1 -DDYNEMO_LLM_PATH=$DYNEMO_LLM_PATH .. \
&& make install \
&& cp libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm/ \
&& cp libdynemo_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm/ \
&& cp trtllmExecutorWorker /opt/tritonserver/backends/tensorrtllm/ \
)
fi
......
......@@ -26,7 +26,8 @@ import typing as t
from typing import Any
import click
from triton_distributed_rs import DistributedRuntime, triton_endpoint, triton_worker
from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker
logger = logging.getLogger("compoundai.serve.nova")
......@@ -102,7 +103,7 @@ def main(
server_context.worker_index = worker_id
class_instance = service.inner()
@triton_worker()
@dynemo_worker()
async def worker(runtime: DistributedRuntime):
if service_name and service_name != service.name:
server_context.service_type = "service"
......@@ -157,12 +158,12 @@ def main(
# Bind an instance of inner to the endpoint
bound_method = endpoint.func.__get__(class_instance)
# Only pass request type for now, use Any for response
# TODO: Handle a triton_endpoint not having types
# TODO: Handle a dynemo_endpoint not having types
# TODO: Handle multiple endpoints in a single component
triton_wrapped_method = triton_endpoint(endpoint.request_type, Any)(
dynemo_wrapped_method = dynemo_endpoint(endpoint.request_type, Any)(
bound_method
)
result = await td_endpoint.serve_endpoint(triton_wrapped_method)
result = await td_endpoint.serve_endpoint(dynemo_wrapped_method)
# WARNING: unreachable code :( because serve blocks
logger.info(f"[{run_id}] Result: {result}")
logger.info(f"[{run_id}] Registered endpoint '{name}'")
......
......@@ -50,7 +50,7 @@ class NovaEndpoint:
if isinstance(args[1], (str, dict)):
args[1] = self.request_type.parse_obj(args[1]) # type: ignore
# Convert Pydantic model to dict before passing to triton
# Convert Pydantic model to dict before passing to dynemo
if len(args) > 1 and isinstance(args[1], BaseModel):
args = list(args) # type: ignore
args[1] = args[1].model_dump() # type: ignore
......
......@@ -72,9 +72,9 @@ class NovaClient:
else:
# Create nova worker if no runtime
from triton_distributed_rs import DistributedRuntime, triton_worker
from dynemo.runtime import DistributedRuntime, dynemo_worker
@triton_worker()
@dynemo_worker()
async def stream_worker(runtime: DistributedRuntime):
try:
# Store runtime for future use
......
......@@ -90,14 +90,14 @@ Note: NATS and ETCD servers should be running and accessible from the container
Run the server logging (with debug level logging):
```bash
TRD_LOG=DEBUG http &
DYN_LOG=DEBUG http &
```
By default the server will run on port 8080.
Add model to the server:
```bash
llmctl http add chat TinyLlama/TinyLlama-1.1B-Chat-v1.0 triton-init.tensorrt-llm.chat/completions
llmctl http add completion TinyLlama/TinyLlama-1.1B-Chat-v1.0 triton-init.tensorrt-llm.completions
llmctl http add chat TinyLlama/TinyLlama-1.1B-Chat-v1.0 dynemo.tensorrt-llm.chat/completions
llmctl http add completion TinyLlama/TinyLlama-1.1B-Chat-v1.0 dynemo.tensorrt-llm.completions
```
#### 2. Workers
......@@ -214,14 +214,14 @@ Run the container interactively with the following command:
Run the server logging (with debug level logging):
```bash
TRD_LOG=DEBUG http &
DYN_LOG=DEBUG http &
```
By default the server will run on port 8080.
Add model to the server:
```bash
llmctl http add chat TinyLlama/TinyLlama-1.1B-Chat-v1.0 triton-init.router.chat/completions
llmctl http add completion TinyLlama/TinyLlama-1.1B-Chat-v1.0 triton-init.router.completions
llmctl http add chat TinyLlama/TinyLlama-1.1B-Chat-v1.0 dynemo.router.chat/completions
llmctl http add completion TinyLlama/TinyLlama-1.1B-Chat-v1.0 dynemo.router.completions
```
#### 2. Workers
......
......@@ -19,12 +19,12 @@ import asyncio
import uvloop
from triton_distributed.runtime import DistributedRuntime, triton_worker
from dynemo.runtime import DistributedRuntime, dynemo_worker
from .protocol import Request
@triton_worker()
@dynemo_worker()
async def worker(
runtime: DistributedRuntime,
component: str,
......@@ -38,7 +38,7 @@ async def worker(
"""
# create client
client = (
await runtime.namespace("triton-init")
await runtime.namespace("dynemo")
.component(component)
.endpoint("generate")
.client()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment