Unverified Commit 8d636ebd authored by Suman Tatiraju's avatar Suman Tatiraju Committed by GitHub
Browse files
parent 6d46288c
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# KVBM Further Reading
- [vLLM](https://docs.vllm.ai/en/latest/design/automatic_prefix_caching.html)
- [SGLang](https://github.com/sgl-project/sglang/pull/2693)
- [EMOGI](https://arxiv.org/abs/2006.06890)
\ No newline at end of file
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
import json
import os
from datetime import date
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import httplib2
from packaging.version import Version
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- conf.py setup -----------------------------------------------------------
# conf.py needs to be run in the top level 'docs'
# directory but the calling build script needs to
# be called from the current working directory. We
# change to the 'docs' dir here and then revert back
# at the end of the file.
# current_dir = os.getcwd()
# os.chdir("docs")
# -- Project information -----------------------------------------------------
# Core project metadata shown in the rendered documentation.
project = "Dynamo"
author = "NVIDIA"
# The copyright range always ends at the year in which the docs are built.
copyright = f"2025-{date.today().year}, NVIDIA Corporation"

# Version of Dynamo these docs are built for; the short form currently
# mirrors the long form.
version_long = "0.1.0"
version_short = version_long
version_short_split = version_short.split(".")

# Same version with the minor component decremented by one
# (e.g. "0.1.0" -> "0.0.0").
one_before = ".".join(
    [
        version_short_split[0],
        str(int(version_short_split[1]) - 1),
        version_short_split[2],
    ]
)
# -- General configuration ---------------------------------------------------
# Sphinx extension modules to load, given as strings: extensions shipped with
# Sphinx ('sphinx.ext.*') as well as third-party or custom ones. Entries that
# are commented out are currently disabled.
extensions = [
    "ablog",
    "myst_parser",
    "sphinx_copybutton",
    "sphinx_design",
    "sphinx-prompt",
    # "sphinxcontrib.bibtex",
    "sphinx_tabs.tabs",
    "sphinx_sitemap",
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.mathjax",
    "sphinx.ext.napoleon",
    "sphinx.ext.ifconfig",
    "sphinx.ext.extlinks",
    "sphinxcontrib.mermaid",
]
# Warning categories that should not be reported during the build.
suppress_warnings = ["myst.domains", "ref.ref", "myst.header"]
# File extensions that are treated as documentation sources.
source_suffix = [".rst", ".md"]
# Defaults applied to autodoc directives: document members, including
# undocumented and private ones.
autodoc_default_options = {
    "members": True,
    "undoc-members": True,
    "private-members": True,
}
# Let autosummary generate its stub pages automatically.
autosummary_generate = True
# Modules that autosummary mocks instead of importing.
# NOTE(review): these are tritonclient modules - confirm they are still
# needed for the Dynamo docs.
autosummary_mock_imports = [
    "tritonclient.grpc.model_config_pb2",
    "tritonclient.grpc.service_pb2",
    "tritonclient.grpc.service_pb2_grpc",
]
# Napoleon: include special members (``__name__`` style) that have docstrings.
napoleon_include_special_with_doc = True
# Automatically number figures, tables and code blocks.
numfig = True
# Final location of the published docs, used for SEO / sitemap generation.
html_baseurl = "https://docs.nvidia.com/dynamo/latest/"
# Optional MyST syntax extensions to enable. Commented-out entries are
# currently disabled.
myst_enable_extensions = [
    "dollarmath",
    "amsmath",
    "deflist",
    # "html_admonition",
    "html_image",
    "colon_fence",
    # "smartquotes",
    "replacements",
    # "linkify",
    "substitution",
]
# Generate anchors automatically for headings down to level 5.
myst_heading_anchors = 5
# Render fenced code blocks tagged ``mermaid`` through the mermaid directive.
myst_fence_as_directive = ["mermaid"]
# Add any paths that contain templates here, relative to this directory.
# templates_path = ["_templates"] # disable it for nvidia-sphinx-theme to show footer
# -- Options for HTML output -------------------------------------------------
# Theme used for HTML and HTML Help pages.
html_theme = "nvidia_sphinx_theme"

# Directories holding custom static files (such as style sheets), relative to
# this directory. They are copied after the builtin static files, so a file
# named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
# html_js_files = ["custom.js"]
# html_css_files = ["custom.css"] # Not needed with new theme

# Theme-specific options that customize the look and feel of the theme.
html_theme_options = dict(
    collapse_navigation=False,
    github_url="https://github.com/ai-dynamo/dynamo",
    # "switcher": {
    # use for local testing
    # "json_url": "http://localhost:8000/_static/switcher.json",
    # "json_url": "https://docs.nvidia.com/dynamo/latest/_static/switcher.json",
    # "version_match": one_before if "dev" in version_long else version_short,
    # },
    navbar_start=["navbar-logo", "version-switcher"],
    primary_sidebar_end=[],
)
# Re-assert the navigation setting; it repeats the value given above, so the
# resulting options are unchanged.
html_theme_options["collapse_navigation"] = False
# NGC organization and (optional) team referenced by the documentation.
deploy_ngc_org = "nvidia"
deploy_ngc_team = "dynamo"

# Values made available to MyST substitutions in the Markdown sources.
myst_substitutions = {"VersionNum": version_short}
if deploy_ngc_team:
    # A team is configured: expose "<org>/<team>".
    myst_substitutions["deploy_ngc_org_team"] = f"{deploy_ngc_org}/{deploy_ngc_team}"
else:
    # No team: fall back to the bare organization name.
    myst_substitutions["deploy_ngc_org_team"] = deploy_ngc_org
def ultimateReplace(app, docname, source):
    """Apply the configured literal text replacements to a document's source.

    Each key of ``app.config.ultimate_replacements`` that occurs in
    ``source[0]`` is replaced by its mapped value, and the result is stored
    back into ``source[0]`` so the caller sees the change.

    Args:
        app: Object exposing ``config.ultimate_replacements``, a mapping of
            search text to replacement text.
        docname: Name of the document being processed (unused here).
        source: List whose first element is the document text; it is
            modified in place.

    NOTE(review): the signature matches a Sphinx ``source-read`` handler;
    presumably it is connected in a ``setup()`` not visible here - confirm.
    """
    text = source[0]
    for placeholder, value in app.config.ultimate_replacements.items():
        text = text.replace(placeholder, value)
    source[0] = text
# Literal placeholders that are replaced in the raw document text. This is a
# necessary hack so that variables can also be filled in inside code blocks.
ultimate_replacements = {
    "{VersionNum}": version_short,
    "{SamplesVersionNum}": version_short,
}
if deploy_ngc_team:
    # A team is configured: expose "<org>/<team>".
    ultimate_replacements["{NgcOrgTeam}"] = f"{deploy_ngc_org}/{deploy_ngc_team}"
else:
    # No team: fall back to the bare organization name.
    ultimate_replacements["{NgcOrgTeam}"] = deploy_ngc_org

# bibtex_bibfiles = ["references.bib"]
# To test that style looks good with common bibtex config
# bibtex_reference_style = "author_year"
# bibtex_default_style = "plain"

### We currently use Myst: https://myst-nb.readthedocs.io/en/latest/use/execute.html
# NOTE(review): no myst-nb extension appears in the `extensions` list of this
# file - confirm this setting still has an effect.
nb_execution_mode = "off"  # Global execution disable
# execution_excludepatterns = ['tutorials/tts-python-basics.ipynb'] # Individual notebook disable
###############################
# SETUP SWITCHER
###############################
# Location of the version-switcher data file inside the first static directory.
switcher_path = os.path.join(html_static_path[0], "switcher.json")

# Archived releases as (version string, archive directory name) pairs.
# NOTE(review): the original comment labels these "Triton 2 releases"; confirm
# that the "2.<minor>.0" numbering is really intended for Dynamo.
# For dev builds the upper bound is lowered by one.
correction = 0
if "dev" in version_long:
    correction = -1
upper_bound = version_short.split(".")[1]
versions = [
    (f"2.{minor}.0", f"dynamo{minor}0")
    for minor in range(2, int(version_short.split(".")[1]) + correction)
]
# Patch releases
# Add here.
# Newest release first.
versions.sort(key=lambda entry: Version(entry[0]), reverse=True)
# Build switcher data: one entry per archived release, in the order given by
# `versions`.
json_data = [
    {
        "name": release,
        "version": release,
        "url": f"https://docs.nvidia.com/dynamo/archives/{archive_dir}/user-guide/docs",
    }
    for release, archive_dir in versions
]

# The entry for the docs published under /latest always comes first. For a dev
# build it is labelled with the previous version (`one_before`), otherwise with
# the version being built. Both cases now use the same "(current release)"
# label; the dev case previously read "(current_release)".
current_release = one_before if "dev" in version_long else version_short
json_data.insert(
    0,
    {
        "name": f"{current_release} (current release)",
        "version": current_release,
        "url": "https://docs.nvidia.com/dynamo/latest/index.html",
    },
)

# Trim to last N releases.
json_data = json_data[0:12]

# Always finish with a link to the full archive listing.
json_data.append(
    {
        "name": "older releases",
        "version": "archives",
        "url": "https://docs.nvidia.com/dynamo/archives/",
    }
)
# Validate the links. This is a best-effort check: problems are reported on
# stdout but never abort the documentation build.
http = httplib2.Http()
for entry in json_data:
    try:
        response = http.request(entry["url"], "HEAD")[0]
    except (httplib2.HttpLib2Error, OSError) as err:
        # No network, DNS failure or a refused connection must not break the
        # build, since even an HTTP error status is only reported below.
        print(entry["url"], "NOK", err)
        continue
    if int(response["status"]) >= 400:
        print(entry["url"], "NOK", response["status"])
        # exit(1)
# Write switcher data to file. ensure_ascii=False can emit non-ASCII
# characters, so the encoding is pinned to UTF-8 instead of relying on the
# platform default.
with open(switcher_path, "w", encoding="utf-8") as switcher_file:
    json.dump(json_data, switcher_file, ensure_ascii=False, indent=4)
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Deployment Examples
This directory contains a hello world example which implements a simplified disaggregated serving architecture used for deploying Large Language Models (LLMs). It removes the LLM related inference code and focuses on how Dynamo handles routing, task queue, and metadata communication between prefill and decode workers.
## Components
- frontend: A simple HTTP server that handles incoming requests
- processor: A pre/post-processing server that invokes the router server
- router: Handles API requests and routes them to appropriate workers based on specified strategy
- worker: A dummy decode worker
- prefill-worker: A dummy prefill worker
## Deployment Architectures
This figure shows an overview of the major components to deploy:
```
+----------------+
| prefill worker |-------+
| | |
+----------------+ | pull
v
+------+ +-----------+ +------------------+ push +---------------+
| HTTP |----->| processor |----->| decode/monolith |------------>| prefill queue |
| |<-----| |<-----| worker | | |
+------+ +-----------+ +------------------+ +---------------+
| ^
query best | | return
worker | | worker_id
| | +------------------+
| +---------| router |
+------------->| |
+------------------+
```
## The Aggregated Deployment
This example uses 2 nodes to demo aggregated serving.
- Node 1
- Runs NATS and etcd services
- Deploys Frontend, Processor and Router
- Deploys DummyWorker as the monolith worker
- Node 2
- Deploys DummyWorker as the monolith worker
### Prerequisites
On Node 1, start required services (etcd and NATS) using [Docker Compose](https://github.com/ai-dynamo/dynamo/blob/main/deploy/docker-compose.yml)
```bash
docker compose -f deploy/docker-compose.yml up -d
```
### Run the Deployment
1. Set environment variables for NATS and etcd services
```bash
export NATS_SERVER="nats://Node_1_IP_ADDRESS:4222"
export ETCD_ENDPOINTS="http://Node_1_IP_ADDRESS:2379"
```
2. Launch Frontend, Processor and Router services:
```
cd dynamo/examples/hello_world/disagg_skeleton
dynamo serve components.graph:Frontend
```
3. Open a new terminal on Node 1 and deploy Worker service
```
export NATS_SERVER="nats://Node_1_IP_ADDRESS:4222"
export ETCD_ENDPOINTS="http://Node_1_IP_ADDRESS:2379"
cd dynamo/examples/hello_world/disagg_skeleton
dynamo serve components.worker:DummyWorker
```
4. Go to Node 2 and start Worker service as in step 3.
Now you should see both workers are ready in Node 1's terminal.
5. Query the Frontend with the following two prompts. The router assigns a different worker to each prompt, which you can observe in the responses.
- `Response: {"worker_output":"Tell me a joke_GeneratedBy_NODE1HOSTNAME","request_id":"id_number"}`
- `Response: {"worker_output":"Which team won 2020 World Series_GeneratedBy_NODE2HOSTNAME","request_id":"id_number"}`
```
curl -X 'POST' \
'http://localhost:8000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
"prompt": "Tell me a joke",
"request_id":"id_number"
}'
curl -X 'POST' \
'http://localhost:8000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
"prompt": "Which team won 2020 World Series",
"request_id":"id_number"
}'
```
6. Then modify the prompt; prompts with similar prefixes are routed to the same worker due to the routing algorithm used in this demo. For example, the following query is routed to the worker that processed the `Tell me a joke` prompt.
```
curl -X 'POST' \
'http://localhost:8000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
"prompt": "Tell me a fact",
"request_id":"id_number"
}'
```
- `Response: {"worker_output":"Tell me a fact_GeneratedBy_NODE1HOSTNAME","request_id":"id_number"}`
## The Disaggregated Deployment
This example uses 3 nodes to demo the disagg serving.
- Node 1
- Runs NATS and etcd services
- Deploys Frontend and Processor
- Deploys DummyWorker as the decode worker
- Node 2
- Deploys DummyWorker as the decode worker
- Node 3
- Deploys Prefill as the prefill worker
### Run the Deployment
1. Repeat steps 1 to 4 of the aggregated deployment to deploy the Frontend, Processor, Router and 2 Workers as decode workers
2. Go to Node 3 and start the prefill worker.
```
export NATS_SERVER="nats://Node_1_IP_ADDRESS:4222"
export ETCD_ENDPOINTS="http://Node_1_IP_ADDRESS:2379"
cd dynamo/examples/hello_world/disagg_skeleton
dynamo serve components.prefill_worker:PrefillWorker
```
3. Query the Frontend. This time the decode workers push requests to the prefill queue, and the prefill worker pulls tasks from the queue to simulate the prefill task. The actual prefill is skipped in this demo.
```
curl -X 'POST' \
'http://localhost:8000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
"prompt": "This is prefill disagg serving example",
"request_id":"12345"
}'
```
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Hello World Example
## Overview
This example demonstrates the basic concepts of Dynamo by creating a simple multi-service pipeline. It shows how to:
1. Create and connect multiple Dynamo services
2. Pass data between services using Dynamo's runtime
3. Set up a simple HTTP API endpoint
4. Deploy and interact with a Dynamo service graph
Pipeline Architecture:
```
Users/Clients (HTTP)
┌─────────────┐
│ Frontend │ HTTP API endpoint (/generate)
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Middle │
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Backend │
└─────────────┘
```
## Component Descriptions
### Frontend Service
- Serves as the entry point for external HTTP requests
- Exposes a `/generate` HTTP API endpoint that clients can call
- Processes incoming text and passes it to the Middle service
### Middle Service
- Acts as an intermediary service in the pipeline
- Receives requests from the Frontend
- Appends "-mid" to the text and forwards it to the Backend
### Backend Service
- Functions as the final service in the pipeline
- Processes requests from the Middle service
- Appends "-back" to the text and yields tokens
## Running the Example Locally
1. Launch all three services using a single command:
```bash
cd /workspace/examples/hello_world
dynamo serve hello_world:Frontend
```
The `dynamo serve` command deploys the entire service graph, automatically handling the dependencies between Frontend, Middle, and Backend services.
2. Send request to frontend using curl:
```bash
curl -X 'POST' \
'http://localhost:8000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
"text": "test"
}'
```
## Deploying to and Running the Example in Kubernetes
This example can be deployed to a Kubernetes cluster using [Dynamo Cloud](../../docs/guides/dynamo_deploy/dynamo_cloud.md) and the Dynamo CLI.
### Prerequisites
You must have first followed the instructions in [deploy/cloud/helm/README.md](https://github.com/ai-dynamo/dynamo/blob/main/deploy/cloud/helm/README.md) to create your Dynamo cloud deployment.
### Deployment Steps
For detailed deployment instructions, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md). The following are the specific commands for the hello world example:
```bash
# Set your project root directory
export PROJECT_ROOT=$(pwd)
# Configure environment variables (see operator_deployment.md for details)
export KUBE_NS=hello-world
export DYNAMO_CLOUD=http://localhost:8080 # If using port-forward
# OR
# export DYNAMO_CLOUD=https://dynamo-cloud.nvidia.com # If using Ingress/VirtualService
# Build the Dynamo base image (see operator_deployment.md for details)
export DYNAMO_IMAGE=<your-registry>/<your-image-name>:<your-tag>
# Build the service
cd $PROJECT_ROOT/examples/hello_world
DYNAMO_TAG=$(dynamo build hello_world:Frontend | grep "Successfully built" | awk '{ print $3 }' | sed 's/\.$//')
# Deploy to Kubernetes
export DEPLOYMENT_NAME=ci-hw
dynamo deployment create $DYNAMO_TAG -n $DEPLOYMENT_NAME
```
### Testing the Deployment
Once the deployment is complete, you can test it using:
```bash
# Find your frontend pod
export FRONTEND_POD=$(kubectl get pods -n ${KUBE_NS} | grep "${DEPLOYMENT_NAME}-frontend" | sort -k1 | tail -n1 | awk '{print $1}')
# Forward the pod's port to localhost
kubectl port-forward pod/$FRONTEND_POD 8000:8000 -n ${KUBE_NS}
# Test the API endpoint
curl -X 'POST' 'http://localhost:8000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{"text": "test"}'
```
For more details on managing deployments, testing, and troubleshooting, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md).
## Expected Output
When you send the request with "test" as input, the response will show how the text flows through each service:
```
Frontend: Middle: Backend: test-mid-back
```
This demonstrates how:
1. The Frontend receives "test"
2. The Middle service adds "-mid" to create "test-mid"
3. The Backend service adds "-back" to create "test-mid-back"
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# LLM Deployment Examples
This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations.
## Components
- workers: Prefill and decode workers that handle the actual LLM inference
- router: Handles API requests and routes them to appropriate workers based on specified strategy
- frontend: OpenAI-compatible HTTP server that handles incoming requests
## Deployment Architectures
### Aggregated
Single-instance deployment where both prefill and decode are done by the same worker.
### Disaggregated
Distributed deployment where prefill and decode are done by separate workers that can scale independently.
```mermaid
sequenceDiagram
participant D as VllmWorker
participant Q as PrefillQueue
participant P as PrefillWorker
Note over D: Request is routed to decode
D->>D: Decide if prefill should be done locally or remotely
D->>D: Allocate KV blocks
D->>Q: Put RemotePrefillRequest on the queue
P->>Q: Pull request from the queue
P-->>D: Read cached KVs from Decode
D->>D: Decode other requests
P->>P: Run prefill
P-->>D: Write prefilled KVs into allocated blocks
P->>D: Send completion notification
Note over D: Notification received when prefill is done
D->>D: Schedule decoding
```
## Getting Started
1. Choose a deployment architecture based on your requirements
2. Configure the components as needed
3. Deploy using the provided scripts
### Prerequisites
Start required services (etcd and NATS) using [Docker Compose](../../deploy/metrics/docker-compose.yml)
```bash
docker compose -f deploy/metrics/docker-compose.yml up -d
```
### Build docker
```bash
# On an x86 machine
./container/build.sh --framework vllm
# On an ARM machine (ex: GB200)
./container/build.sh --framework vllm --platform linux/arm64
```
> [!NOTE]
> Building a vLLM docker image for ARM machines currently involves building vLLM from source,
> which has known issues with being slow and requiring a lot of system RAM:
> https://github.com/vllm-project/vllm/issues/8878
>
> You can tune the number of parallel build jobs for building VLLM from source
> on ARM based on your available cores and system RAM with `VLLM_MAX_JOBS`.
>
> For example, on an ARM machine with low system resources:
> `./container/build.sh --framework vllm --platform linux/arm64 --build-arg VLLM_MAX_JOBS=2`
>
> For example, on a GB200, which has a very high CPU core count and plenty of memory:
> `./container/build.sh --framework vllm --platform linux/arm64 --build-arg VLLM_MAX_JOBS=64`
>
> When vLLM has pre-built ARM wheels published, this process can be improved.
### Run container
```
./container/run.sh -it --framework vllm
```
## Run Deployment
This figure shows an overview of the major components to deploy:
```
+----------------+
+------| prefill worker |-------+
notify | | | |
finished | +----------------+ | pull
v v
+------+ +-----------+ +------------------+ push +---------------+
| HTTP |----->| processor |----->| decode/monolith |------------>| prefill queue |
| |<-----| |<-----| worker | | |
+------+ +-----------+ +------------------+ +---------------+
| ^ |
query best | | return | publish kv events
worker | | worker_id v
| | +------------------+
| +---------| kv-router |
+------------->| |
+------------------+
```
> [!NOTE]
> The planner component is enabled by default for all deployment architectures but is set to no-op mode. This means the planner observes metrics but doesn't take scaling actions. To enable active scaling, you can add `--Planner.no-operation=false` to your `dynamo serve` command. For more details, see the [Planner documentation](../../components/planner/README.md).
### Example architectures
_Note_: For a non-dockerized deployment, first export `DYNAMO_HOME` to point to the dynamo repository root, e.g. `export DYNAMO_HOME=$(pwd)`
#### Aggregated serving
```bash
cd $DYNAMO_HOME/examples/llm
dynamo serve graphs.agg:Frontend -f ./configs/agg.yaml
```
#### Aggregated serving with KV Routing
```bash
cd $DYNAMO_HOME/examples/llm
dynamo serve graphs.agg_router:Frontend -f ./configs/agg_router.yaml
```
#### Disaggregated serving
```bash
cd $DYNAMO_HOME/examples/llm
dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
```
#### Disaggregated serving with KV Routing
```bash
cd $DYNAMO_HOME/examples/llm
dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
```
### Client
In another terminal:
```bash
# this test request has around 200 tokens isl
curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"messages": [
{
"role": "user",
"content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}
],
"stream":false,
"max_tokens": 30
}'
```
### Multi-node deployment
See [multinode-examples.md](multinode-examples.md) for more details.
### Close deployment
See [close deployment](../../docs/guides/dynamo_serve.md#close-deployment) section to learn about how to close the deployment.
## Deploy to Kubernetes
These examples can be deployed to a Kubernetes cluster using [Dynamo Cloud](../../docs/guides/dynamo_deploy/dynamo_cloud.md) and the Dynamo CLI.
### Prerequisites
You must have first followed the instructions in [deploy/cloud/helm/README.md](../../deploy/cloud/helm/README.md) to install Dynamo Cloud on your Kubernetes cluster.
**Note**: The `KUBE_NS` variable in the following steps must match the Kubernetes namespace where you installed Dynamo Cloud. You must also expose the `dynamo-store` service externally. This will be the endpoint the CLI uses to interface with Dynamo Cloud.
### Deployment Steps
For detailed deployment instructions, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md). The following are the specific commands for the LLM examples:
```bash
# Set your project root directory
export PROJECT_ROOT=$(pwd)
# Configure environment variables (see operator_deployment.md for details)
export KUBE_NS=dynamo-cloud
export DYNAMO_CLOUD=http://localhost:8080 # If using port-forward
# OR
# export DYNAMO_CLOUD=https://dynamo-cloud.nvidia.com # If using Ingress/VirtualService
# Build the Dynamo base image (see operator_deployment.md for details)
export DYNAMO_IMAGE=<your-registry>/<your-image-name>:<your-tag>
# Build the service
cd $PROJECT_ROOT/examples/llm
DYNAMO_TAG=$(dynamo build graphs.agg:Frontend | grep "Successfully built" | awk '{ print $NF }' | sed 's/\.$//')
# Deploy to Kubernetes
export DEPLOYMENT_NAME=llm-agg
dynamo deployment create $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg.yaml
```
**Note**: Optionally add `--Planner.no-operation=false` at the end of the deployment command to enable the planner component to take scaling actions on your deployment.
### Testing the Deployment
Once the deployment is complete, you can test it using:
```bash
# Find your frontend pod
export FRONTEND_POD=$(kubectl get pods -n ${KUBE_NS} | grep "${DEPLOYMENT_NAME}-frontend" | sort -k1 | tail -n1 | awk '{print $1}')
# Forward the pod's port to localhost
kubectl port-forward pod/$FRONTEND_POD 8000:8000 -n ${KUBE_NS}
# Test the API endpoint
curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"messages": [
{
"role": "user",
"content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}
],
"stream":false,
"max_tokens": 30
}'
```
For more details on managing deployments, testing, and troubleshooting, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md).
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Multinode Examples
## Single node sized models
You can deploy dynamo on multiple nodes via NATS/ETCD based discovery and communication. Here's an example of deploying disaggregated serving on 3 nodes using `nvidia/Llama-3.1-405B-Instruct-FP8`. Each node must be properly configured with Infiniband and/or RoCE for communication between decode and prefill workers.
##### Disaggregated Deployment with KV Routing
- Node 1: Frontend, Processor, Router, Decode Worker
- Node 2: Prefill Worker
- Node 3: Prefill Worker
Note that this can be easily extended to more nodes. You can also run the Frontend, Processor, and Router on a separate CPU only node if you'd like as long as all nodes have access to the NATS/ETCD endpoints!
**Step 1**: Start NATS/ETCD on your head node. Ensure you have the correct firewall rules to allow communication between the nodes, as the NATS/ETCD endpoints must be accessible by all other nodes.
```bash
# node 1
docker compose -f deploy/metrics/docker-compose.yml up -d
```
**Step 2**: Create the inference graph for this node. Here we use the `agg_router.py` (even though we are doing disaggregated serving) graph because we want the `Frontend`, `Processor`, `Router`, and `VllmWorker` to spin up (we spin up the other decode worker and prefill worker separately on different nodes later).
```python
# graphs/agg_router.py
Frontend.link(Processor).link(Router).link(VllmWorker)
```
**Step 3**: Create a configuration file for this node. We've provided a sample one for you in `configs/multinode-405b.yaml` for the 405B model. Note that we still include the `PrefillWorker` component in the configuration file even though we are not using it on node 1. This is because we can reuse the same configuration file on all nodes and just spin up individual workers on the other ones.
**Step 4**: Start the frontend, processor, router, and VllmWorker on node 1.
```bash
# node 1
cd $DYNAMO_HOME/examples/llm
dynamo serve graphs.agg_router:Frontend -f ./configs/multinode-405b.yaml
```
**Step 5**: Start the first prefill worker on node 2.
Since we only want to start the `PrefillWorker` on node 2, you can simply run just the PrefillWorker component directly with the configuration file from before.
```bash
# node 2
export NATS_SERVER='<your-nats-server-address>' # note this should start with nats://...
export ETCD_ENDPOINTS='<your-etcd-endpoints-address>'
cd $DYNAMO_HOME/examples/llm
dynamo serve components.prefill_worker:PrefillWorker -f ./configs/multinode-405b.yaml
```
**Step 6**: Start the second prefill worker on node 3.
```bash
# node 3
export NATS_SERVER='<your-nats-server-address>' # note this should start with nats://...
export ETCD_ENDPOINTS='<your-etcd-endpoints-address>'
cd $DYNAMO_HOME/examples/llm
dynamo serve components.prefill_worker:PrefillWorker -f ./configs/multinode-405b.yaml
```
**Step 7**: [Optional] Start more decode workers on other nodes
This example can be extended to more nodes as well. For example, if you want to spin up another decode worker, you can use
```bash
# node X
export NATS_SERVER='<your-nats-server-address>' # note this should start with nats://...
export ETCD_ENDPOINTS='<your-etcd-endpoints-address>'
cd $DYNAMO_HOME/examples/llm
dynamo serve components.worker:VllmWorker -f ./configs/multinode-405b.yaml --service-name VllmWorker
```
Note the use of `--service-name`. This only spins up the worker that you are requesting and ignores any `depends` statements.
###### Client
In another terminal:
```bash
# this test request has around 200 tokens isl
curl <node1-ip>:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Accept: text/event-stream" \
-d '{
"model": "nvidia/Llama-3.1-405B-Instruct-FP8",
"messages": [
{
"role": "user",
"content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}
],
"stream": true,
"max_tokens": 300
}'
```
#### Multi-node sized models
Multinode model support is coming soon. You can track progress [here](https://github.com/ai-dynamo/dynamo/issues/513)!
##### Aggregated Deployment
The steps for aggregated deployment of multi-node sized models are similar to
those for single-node sized models, except that you need to first configure the nodes
to be interconnected according to the framework's multi-node deployment guide.
In the example below, vLLM is used as the framework to serve the `DeepSeek-R1` model
using tensor parallel 16 on two H100x8 nodes.
**Step 1**: On each of the nodes, set up a Ray cluster so that vLLM can access the resources
collectively:
```bash
# head node
ray start --head --port=6379
# example output and keep note of the IP address of the head node
# Local node IP: <head-node-address>
# set vLLM env arg
export VLLM_HOST_IP=<head-node-address>
# other node
ray start --address=<head-node-address>:6379
export VLLM_HOST_IP=<current-node-address>
# verify the accessibility by checking aggregated GPU count shown in ray status
ray status
# Expected/Sample output for 2 nodes:
# ```bash
# ======== Autoscaler status: 2025-04-16 15:35:42.751688 ========
# Node status
# ---------------------------------------------------------------
# Active:
# 1 node_<hash_1>
# 1 node_<hash_2>
# Pending:
# (no pending nodes)
# Recent failures:
# (no failures)
# Resources
# ---------------------------------------------------------------
# Usage:
# XXX CPU
# XXX GPU
# XXX memory
# XXX object_store_memory
# Demands:
# (no resource demands)
```
**Step 2**: On the head node, follow [LLM deployment examples](https://github.com/ai-dynamo/dynamo/blob/main/examples/llm/README.md) to
setup dynamo deployment for aggregated serving, using the configuration file,
`configs/multinode_agg_r1.yaml`, for DeepSeek-R1:
```bash
cd $DYNAMO_HOME/examples/llm
dynamo serve graphs.agg:Frontend -f ./configs/multinode_agg_r1.yaml
```
###### Client
In another terminal, you can send the same curl request as described above but
with `"model": "deepseek-ai/DeepSeek-R1"`
```bash
# this test request has around 200 tokens isl
curl <node1-ip>:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Accept: text/event-stream" \
-d '{
"model": "deepseek-ai/DeepSeek-R1",
"messages": [
{
"role": "user",
"content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}
],
"stream": true,
"max_tokens": 300
}'
```
##### Disaggregated Deployment
In this example, we deploy two replicas of the model (one prefill worker
and one decode worker). We use 4 H100x8 nodes and group every two of them
into one Ray cluster in the same way as described in aggregated deployment.
However, for etcd and nats server, we only run them in
one node and let's consider that node to be the head node of the whole deployment.
Note that if you are starting the etcd server directly instead of using `docker compose`,
you should add additional arguments so that it is discoverable from other nodes.
```bash
etcd --advertise-client-urls http://<head-node-ip>:2379 --listen-client-urls http://<head-node-ip>:2379,http://127.0.0.1:2379
```
**Step 1**: On every two nodes, set up a Ray cluster as described in
[aggregated deployment](#aggregated-deployment). After that, you should have
two independent Ray clusters, each with access to 16 GPUs.
**Step 2**: Start the deployment by running different flavors of `dynamo serve`
on one of the nodes of each Ray cluster, using the configuration file,
`configs/mutinode_disagg_r1.yaml`.
For decode, use the command below. This node is the entry point of
the whole deployment; in other words, requests should be sent to the IP
address of this node.
```bash
# if not head node
export NATS_SERVER='nats://<nats-server-ip>:4222'
export ETCD_ENDPOINTS='<etcd-endpoints-ip>:2379'
cd $DYNAMO_HOME/examples/llm
dynamo serve graphs.agg:Frontend -f ./configs/mutinode_disagg_r1.yaml
```
For prefill:
```bash
# if not head node
export NATS_SERVER='nats://<nats-server-ip>:4222'
export ETCD_ENDPOINTS='<etcd-endpoints-ip>:2379'
cd $DYNAMO_HOME/examples/llm
dynamo serve components.prefill_worker:PrefillWorker -f ./configs/mutinode_disagg_r1.yaml
```
###### Client
In another terminal, you can send the same curl request as described in
[aggregated deployment](#aggregated-deployment), addressed to the IP of
the decode node.
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# LLM Deployment Examples using TensorRT-LLM
This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations using TensorRT-LLM.
## Deployment Architectures
See [deployment architectures](llm_deployment.md#deployment-architectures) to learn about the general idea of the architecture.
Note that this TensorRT-LLM version does not support all the options yet.
```{note}
TensorRT-LLM disaggregation does not support conditional disaggregation yet. You can only configure the deployment to always use aggregate or disaggregated serving.
```
## Getting Started
1. Choose a deployment architecture based on your requirements
2. Configure the components as needed
3. Deploy using the provided scripts
### Prerequisites
Start required services (etcd and NATS) using [Docker Compose](../../deploy/docker-compose.yml)
```bash
docker compose -f deploy/docker-compose.yml up -d
```
### Build docker
#### Step 1: Build TensorRT-LLM base container image
Because of the known issue of C++11 ABI compatibility within the NGC pytorch container, we rebuild TensorRT-LLM from source.
See [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html) for more information.
Use the helper script to build a TensorRT-LLM container base image. The script uses a specific commit id from TensorRT-LLM main branch.
```bash
# TensorRT-LLM uses git-lfs, which needs to be installed in advance.
apt-get update && apt-get -y install git git-lfs
# The script uses python packages like docker-squash to squash image
# layers within trtllm base image
DEBIAN_FRONTEND=noninteractive TZ=America/Los_Angeles apt-get -y install python3 python3-pip python3-venv
./container/build_trtllm_base_image.sh
```
See [here](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html#option-1-build-tensorrt-llm-in-one-step) for more details on building from source.
If you already have a TensorRT-LLM container image, you can skip this step.
#### Step 2: Build the Dynamo container
```
# On an x86 machine:
./container/build.sh --framework tensorrtllm
# On an ARM machine:
./container/build.sh --framework tensorrtllm --platform linux/arm64
```
This build script internally points to the base container image built with step 1. If you skipped previous step because you already have the container image available, you can run the build script with that image as a base.
```bash
# Build dynamo image with other TRTLLM base image.
./container/build.sh --framework TENSORRTLLM --base-image <trtllm-base-image> --base-image-tag <trtllm-base-image-tag>
```
### Run container
```
./container/run.sh --framework tensorrtllm -it
```
## Run Deployment
This figure shows an overview of the major components to deploy:
```
+------+ +-----------+ +------------------+ +---------------+
| HTTP |----->| processor |----->| Worker |------------>| Prefill |
| |<-----| |<-----| |<------------| Worker |
+------+ +-----------+ +------------------+ +---------------+
| ^ |
query best | | return | publish kv events
worker | | worker_id v
| | +------------------+
| +---------| kv-router |
+------------->| |
+------------------+
```
```{note}
The above architecture illustrates all the components. The final components that get spawned depend upon the chosen graph.
```
### Example architectures
#### Aggregated serving
```bash
cd /workspace/examples/tensorrt_llm
dynamo serve graphs.agg:Frontend -f ./configs/agg.yaml
```
#### Aggregated serving with KV Routing
```bash
cd /workspace/examples/tensorrt_llm
dynamo serve graphs.agg_router:Frontend -f ./configs/agg_router.yaml
```
#### Disaggregated serving
```bash
cd /workspace/examples/tensorrt_llm
dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
```
We are defining TRTLLM_USE_UCX_KVCACHE so that TRTLLM uses UCX for transferring the KV
cache between the context and generation workers.
#### Disaggregated serving with KV Routing
```bash
cd /workspace/examples/tensorrt_llm
dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
```
We are defining TRTLLM_USE_UCX_KVCACHE so that TRTLLM uses UCX for transferring the KV
cache between the context and generation workers.
### Client
See the [client](llm_deployment.md#client) section to learn how to send a request to the deployment.
### Close deployment
See [close deployment](../../docs/guides/dynamo_serve.md#close-deployment) section to learn about how to close the deployment.
Remaining tasks:
- [x] Add support for the disaggregated serving.
- [ ] Add integration test coverage.
- [ ] Add instructions for benchmarking.
- [ ] Add multi-node support.
- [ ] Merge the code base with llm example to reduce the code duplication.
- [ ] Use processor from dynamo-llm framework.
- [ ] Enable NIXL integration with TensorRT-LLM once available. Currently, TensorRT-LLM uses UCX to transfer KV cache.
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
README.md
\ No newline at end of file
#!/usr/bin/env python3
# type: ignore # Ignore all mypy errors in this file
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import re
import subprocess
from contextlib import contextmanager
from functools import partial
# Absolute path of the directory holding this script (the docs root) and of
# its parent directory (the repository root).
dynamo_docs_abspath = os.path.dirname(os.path.abspath(__file__))
dynamo_abspath = os.path.dirname(dynamo_docs_abspath)
# Prefix used when a relative link has to be turned into a GitHub URL.
repo_url = "https://github.com/ai-dynamo/dynamo/blob/main/"

# Regex patterns
http_patn = r"^https?://"
http_reg = re.compile(http_patn)
tag_patn = "/(?:blob|tree)/main"
# Organization-level prefix. The repository name must NOT be part of this
# prefix: replace_url_with_relpath() expects group(1) of the URL regex below
# to be the repository name (it compares it against "dynamo"). Anchoring on
# ".../ai-dynamo/dynamo" made group(1) capture "blob"/"tree" instead.
dynamo_org_patn = rf"{http_patn}github\.com/ai-dynamo"
# Kept for backward compatibility with anything that references this name.
dynamo_repo_patn = rf"{dynamo_org_patn}/dynamo"
# group(1): repository name; group(2): path inside the repository, after an
# optional "/blob/main" or "/tree/main". A trailing "#section" is not consumed.
dynamo_github_url_reg = re.compile(
    rf"{dynamo_org_patn}/([^/#]+)(?:{tag_patn})?/*([^#]*)\s*(?=#|$)"
)

# Hyperlink in a .md file, excluding embedded images.
# group(1): "[text](" prefix, group(2): link target, group(3): closing ")".
hyperlink_reg = re.compile(r"((?<!\!)\[[^\]]+\]\s*\(\s*)([^)]+?)(\s*\))")

# Paths listed one per line in exclusions.txt, next to this script.
with open(os.path.join(dynamo_docs_abspath, "exclusions.txt"), "r") as f:
    exclusions = f.read()

# Drop blank lines and surrounding whitespace: an empty entry would later be
# resolved by os.path.abspath("") to the current working directory and would
# exclude every file below it.
exclude_patterns = [
    line.strip() for line in exclusions.splitlines() if line.strip()
]
def setup_logger():
    """
    Return this module's logger, configured to emit INFO records to a stream.

    The function is safe to call repeatedly: the stream handler is attached
    only when the logger has no handler yet, so repeated calls do not cause
    each message to be printed several times.

    - returns: the configured ``logging.Logger`` for this module
    """
    # logging.getLogger() returns the same object for the same name on every call.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    if not logger.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(
            logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
        )
        logger.addHandler(stream_handler)

    return logger
def log_message(message):
    """
    Emit a message at INFO level through the logger from setup_logger()
    - message: Message to log
    """
    setup_logger().info(message)
def run_command(command):
    """
    Run a command through the shell using subprocess and log failures
    - command: Command string to execute. It is passed to the shell
      (shell=True), so it must come from a trusted source.
    - raises: subprocess.CalledProcessError if the command exits with a
      non-zero status (check=True)
    """
    log_message(f"Running command: {command}")
    try:
        subprocess.run(
            command,
            shell=True,
            check=True,
            text=True,
            capture_output=False,
        )
    except subprocess.CalledProcessError as e:
        # Record the failure, then re-raise the same exception with its
        # original traceback so callers still see CalledProcessError.
        log_message(f"Command failed with exit code {e.returncode}: {command}")
        raise
def is_excluded(file_path):
    """
    Tell whether a path is covered by one of the entries in exclude_patterns
    - file_path: Path to check
    - returns: True if the absolute form of file_path equals, or lies below,
      the absolute form of any exclusion entry; False otherwise

    Both the path and the entries are made absolute with os.path.abspath, so
    relative values are resolved against the current working directory.
    """
    # Invariant across all patterns, so resolve it once.
    file_abspath = os.path.abspath(file_path)
    for exclude_pattern in exclude_patterns:
        pattern = exclude_pattern.strip()
        if not pattern:
            # os.path.abspath("") is the current working directory; a blank
            # entry would therefore exclude every file below it.
            continue
        excluded_abspath = os.path.abspath(pattern)
        if os.path.commonpath([file_abspath, excluded_abspath]) == excluded_abspath:
            return True
    return False
def replace_url_with_relpath(url, src_doc_path):
    """
    Replace a GitHub URL with a path relative to the source document when it
    targets a local doc file.

    The URL is matched against dynamo_github_url_reg. Group 1 of the match is
    taken as the repository name and group 2 as the path inside it. For the
    "dynamo" repository the path is resolved against the repository root;
    for any other name it is resolved against "<docs dir>/<name>".

    The URL is replaced only when the resolved target lies inside the docs
    directory, is not excluded, and is either
        1. a ".md" file, or
        2. a directory that contains README.md, with the URL ending in
           "#<section>".
    In every other case the URL is returned unchanged.

    - url: Hyperlink target that starts with http:// or https://
    - src_doc_path: Path of the document containing the hyperlink
    - returns: the relative path (followed by any "#<section>") or the
      original URL
    """
    m = dynamo_github_url_reg.match(url)
    # Do not replace the URL if it does not match the GitHub URL pattern.
    if not m:
        return url

    target_repo_name = m.group(1)
    # groups("") substitutes "" for a group that did not participate.
    target_relpath_from_target_repo = os.path.normpath(m.groups("")[1])
    # Whatever follows the matched part is the "#<section>" suffix, if any.
    section = url[len(m.group(0)) :]
    valid_hashtag = section not in ["", "#"] and section.startswith("#")

    if target_repo_name == "dynamo":
        target_path = os.path.join(dynamo_abspath, target_relpath_from_target_repo)
    else:
        target_path = os.path.join(
            dynamo_docs_abspath, target_repo_name, target_relpath_from_target_repo
        )

    # Return URL if it points to a path outside the docs directory.
    if os.path.commonpath([dynamo_docs_abspath, target_path]) != dynamo_docs_abspath:
        return url

    if (
        os.path.isfile(target_path)
        and os.path.splitext(target_path)[1] == ".md"
        and not is_excluded(target_path)
    ):
        pass
    elif (
        os.path.isdir(target_path)
        and os.path.isfile(os.path.join(target_path, "README.md"))
        and valid_hashtag
        and not is_excluded(os.path.join(target_path, "README.md"))
    ):
        target_path = os.path.join(target_path, "README.md")
    else:
        return url

    # The "target_path" must be a file at this line.
    relpath = os.path.relpath(target_path, start=os.path.dirname(src_doc_path))
    # A callable replacement keeps relpath literal (no backslash/group
    # expansion), and count is passed by keyword because passing it
    # positionally is deprecated since Python 3.13.
    return dynamo_github_url_reg.sub(lambda _match: relpath, url, count=1)
def replace_relpath_with_url(relpath, src_doc_path):
    """
    Replace a relative link with a GitHub URL unless it targets a local doc.

    The relative path is kept when, after resolution against the directory
    of src_doc_path, it is a ".md" file inside the docs directory that is
    not excluded. A directory link that carries a "#<section>" and contains
    README.md is first redirected to that README.md. A link starting with
    "#" refers to src_doc_path itself.

    Every other link (non-".md" file, plain directory, path outside the docs
    directory, excluded or missing file) becomes
    repo_url + <path relative to the repository root> + "#<section>".

    - relpath: Hyperlink target that is not an http(s) URL
    - src_doc_path: Path of the document containing the hyperlink
    - returns: the (possibly README-adjusted) relative path, or a GitHub URL
    - raises: AssertionError if the resolved target is outside the repository
    """
    # Split "path#section" at the first "#"; section keeps its leading "#".
    path_part, hash_sep, anchor = relpath.partition("#")
    section = hash_sep + anchor
    valid_hashtag = section not in ["", "#"]

    if relpath.startswith("#"):
        # Pure in-page anchor: the target is the source document itself.
        link_target = os.path.basename(src_doc_path)
    else:
        link_target = path_part
    target_path = os.path.normpath(
        os.path.join(os.path.dirname(src_doc_path), link_target)
    )

    # Assert target path is under the current repo directory.
    assert os.path.commonpath([dynamo_abspath, target_path]) == dynamo_abspath

    # Computed before any README.md redirection below.
    target_path_from_src_repo = os.path.relpath(target_path, start=dynamo_abspath)

    # A directory link with a section refers to the directory's README.md.
    readme_path = os.path.join(target_path, "README.md")
    if os.path.isdir(target_path) and valid_hashtag and os.path.isfile(readme_path):
        relpath = os.path.join(path_part, "README.md") + section
        target_path = readme_path

    is_local_doc = (
        os.path.isfile(target_path)
        and os.path.splitext(target_path)[1] == ".md"
        and os.path.commonpath([dynamo_docs_abspath, target_path])
        == dynamo_docs_abspath
        and not is_excluded(target_path)
    )
    if is_local_doc:
        return relpath
    return repo_url + target_path_from_src_repo + section
def replace_hyperlink(m, src_doc_path):
    """
    Rewrite the target of one matched Markdown hyperlink.

    - m: Match object whose group(1) is the text before the link target,
      group(2) the target itself and group(3) the text after it
    - src_doc_path: Path of the document containing the hyperlink
    - returns: group(1) + converted target + group(3)

    TODO: Support of HTML tags for future docs.
    Markdown allows <link>, e.g. <a href=[^>]+>. Whether we want to
    find and replace the link depends on if they link to internal .md files
    or allows relative paths. No such case has been seen in the docs so far,
    so this should be safe for now.
    """
    prefix, target, suffix = m.group(1), m.group(2), m.group(3)
    if http_reg.match(target):
        # Target is a URL.
        converted = replace_url_with_relpath(target, src_doc_path)
    else:
        # Target is a relative path.
        converted = replace_relpath_with_url(target, src_doc_path)
    return prefix + converted + suffix
def preprocess_docs(exclude_paths=None):
    """
    Rewrite the hyperlinks of every ".md" file below the docs directory.

    Each document that is not excluded (see is_excluded) is read, every
    Markdown hyperlink in it is passed through replace_hyperlink, and the
    file is written back in place when its content changed.

    - exclude_paths: Optional iterable of directories to skip entirely,
      given relative to the docs directory (absolute paths also work)
    """
    # Resolve the directories to prune against the docs root so they can be
    # compared with the absolute paths produced by os.walk.
    pruned_dirs = {
        os.path.normpath(os.path.join(dynamo_docs_abspath, path))
        for path in (exclude_paths or ())
    }

    # Find all ".md" files inside the docs directory.
    docs_list = []
    for dirpath, dirnames, filenames in os.walk(dynamo_docs_abspath):
        # Editing dirnames in place stops os.walk from descending into the
        # pruned directories.
        dirnames[:] = [
            name
            for name in dirnames
            if os.path.join(dirpath, name) not in pruned_dirs
        ]
        docs_list.extend(
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith(".md")
        )

    # Read, preprocess and write back to each document file.
    for doc_abspath in docs_list:
        if is_excluded(doc_abspath):
            continue

        # Explicit encoding: the documents contain non-ASCII characters and
        # must not depend on the locale's default encoding.
        with open(doc_abspath, "r", encoding="utf-8") as f:
            content = f.read()

        updated = hyperlink_reg.sub(
            partial(replace_hyperlink, src_doc_path=doc_abspath),
            content,
        )

        if updated != content:
            with open(doc_abspath, "w", encoding="utf-8") as f:
                f.write(updated)
@contextmanager
def change_directory(path):
    """
    Temporarily switch the current working directory to ``path``.

    The directory that was current on entry is restored on exit, whether
    the body of the ``with`` statement finishes normally or raises.
    """
    previous_cwd = os.getcwd()
    try:
        os.chdir(path)
        yield
    finally:
        # Always go back to where we started.
        os.chdir(previous_cwd)
def main():
    """
    Build the HTML documentation from inside the docs directory.

    Steps, in order: remove previous build output, rewrite the hyperlinks
    of the Markdown documents, then run the HTML build.
    """
    with change_directory(dynamo_docs_abspath):
        # Start from a clean build tree.
        run_command("make clean")
        # Rewrite links in the .md files in place before building.
        preprocess_docs()
        run_command("make html")


# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Getting Started
## Development Environment
For a consistent development environment, use the provided devcontainer configuration. This requires:
- [Docker](https://www.docker.com/products/docker-desktop)
- [VS Code](https://code.visualstudio.com/) with the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
To use the devcontainer:
1. Open the project in VS Code.
2. Click the button in the bottom-left corner.
3. Select **Reopen in Container**.
This builds and starts a container with all the necessary dependencies for Dynamo development.
## Installation
```{note}
- The following examples require system level packages.
- We recommend Ubuntu 24.04 with a x86_64 CPU. See the [Support Matrix](support_matrix.md).
```
```
apt-get update
DEBIAN_FRONTEND=noninteractive apt-get install -yq python3-dev python3-pip python3-venv libucx0
python3 -m venv venv
source venv/bin/activate
pip install ai-dynamo[all]
```
```{note}
To ensure compatibility, use the examples in the release branch or tag that matches the version you installed.
```
## Building the Dynamo Base Image
Although not needed for local development, deploying your Dynamo pipelines to Kubernetes requires you to build and push a Dynamo base image to your container registry. You can use any container registry of your choice, such as:
- Docker Hub (docker.io)
- NVIDIA NGC Container Registry (nvcr.io)
- Any private registry
To build it:
```bash
./container/build.sh
docker tag dynamo:latest-vllm <your-registry>/dynamo-base:latest-vllm
docker login <your-registry>
docker push <your-registry>/dynamo-base:latest-vllm
```
This documentation describes these frameworks:
- `--framework vllm` build: see [here](examples/llm_deployment.md).
- `--framework tensorrtllm` build: see [here](examples/trtllm.md).
After building, use this image by setting the `DYNAMO_IMAGE` environment variable to point to your built image:
```bash
export DYNAMO_IMAGE=<your-registry>/dynamo-base:latest-vllm
```
## Running and Interacting with an LLM Locally
To run a model and interact with it locally, call `dynamo run` with a Hugging Face model. `dynamo run` supports several backends including: `mistralrs`, `sglang`, `vllm`, and `tensorrtllm`.
### Example Command
```
dynamo run out=vllm deepseek-ai/DeepSeek-R1-Distill-Llama-8B
```
```
? User › Hello, how are you?
✔ User · Hello, how are you?
Okay, so I'm trying to figure out how to respond to the user's greeting.
They said, "Hello, how are you?" and then followed it with "Hello! I'm just a program, but thanks for asking."
Hmm, I need to come up with a suitable reply. ...
```
## LLM Serving
Dynamo provides a simple way to spin up a local set of inference components including:
- **OpenAI Compatible Frontend**–High performance OpenAI compatible http api server written in Rust.
- **Basic and Kv Aware Router**–Route and load balance traffic to a set of workers.
- **Workers**–Set of pre-configured LLM serving engines.
To run a minimal configuration you can use a pre-configured example.
### Start Dynamo Distributed Runtime Services
To start the Dynamo Distributed Runtime services the first time:
```bash
docker compose -f deploy/docker-compose.yml up -d
```
### Start Dynamo LLM Serving Components
Next, serve a minimal configuration with an http server, basic
round-robin router, and a single worker.
```bash
cd examples/llm
dynamo serve graphs.agg:Frontend -f configs/agg.yaml
```
### Send a Request
```bash
curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"messages": [
{
"role": "user",
"content": "Hello, how are you?"
}
],
"stream":false,
"max_tokens": 300
}' | jq
```
## Local Development
If you use VS Code or Cursor, use the `.devcontainer` folder built on [Microsoft's Dev Containers extension](https://code.visualstudio.com/docs/devcontainers/containers). For instructions, see the [ReadMe](https://github.com/ai-dynamo/dynamo/blob/main/.devcontainer/README.md).
Otherwise, to develop locally, we recommend working inside of the container:
```bash
./container/build.sh
./container/run.sh -it --mount-workspace
cargo build --release
mkdir -p /workspace/deploy/dynamo/sdk/src/dynamo/sdk/cli/bin
cp /workspace/target/release/http /workspace/deploy/dynamo/sdk/src/dynamo/sdk/cli/bin
cp /workspace/target/release/llmctl /workspace/deploy/dynamo/sdk/src/dynamo/sdk/cli/bin
cp /workspace/target/release/dynamo-run /workspace/deploy/dynamo/sdk/src/dynamo/sdk/cli/bin
uv pip install -e .
export PYTHONPATH=$PYTHONPATH:/workspace/deploy/dynamo/sdk/src:/workspace/components/planner/src
```
### Conda Environment
Alternatively, you can use a Conda environment:
```bash
conda activate <ENV_NAME>
pip install nixl # Or install https://github.com/ai-dynamo/nixl from source
cargo build --release
# To install ai-dynamo-runtime from source
cd lib/bindings/python
pip install .
cd ../../../
pip install .[all]
# To test
docker compose -f deploy/docker-compose.yml up -d
cd examples/llm
dynamo serve graphs.agg:Frontend -f configs/agg.yaml
```
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Writing Python Workers in Dynamo
This guide explains how to create your own Python worker in Dynamo and deploy
......@@ -5,11 +23,11 @@ it via `dynamo serve` or `dynamo deploy`, covering basic concepts as well as
advanced features like enabling KV routing and disaggregated serving.
For detailed information about `dynamo serve` infrastructure, see the
[Dynamo SDK Docs](../deploy/sdk/docs/sdk/README.md).
[Dynamo SDK Docs](../API/sdk.md).
For a guide that walks through how to launch a vLLM-based worker with
implementation of Disaggregated Serving and KV-Aware Routing included,
see the [Dynamo Serve Guide](../docs/guides/dynamo_serve.md).
see the [Dynamo Serve Guide](../../docs/guides/dynamo_serve.md).
## Basic Concepts
......@@ -19,7 +37,7 @@ a Python class based definition that requires a few key decorators to get going:
- `@endpoint`: marks methods that can be called by other workers or clients
For more detailed information on these concepts, see the
[Dynamo SDK Docs](../deploy/sdk/docs/sdk/README.md).
[Dynamo SDK Docs](../API/sdk.md).
### Worker Skeleton
......@@ -51,7 +69,7 @@ based on the definitions above, it would be: `your_namespace/YourWorker/your_end
- `endpoint="your_endpoint"`: Defined by the `@endpoint` decorator, or by default the name of the function being decorated.
For more details about service configuration, resource management, and dynamo endpoints,
see the [Dynamo SDK Docs](../deploy/sdk/docs/README.md).
see the [Dynamo SDK Docs](../API/sdk.md).
### Request/Response Types
......@@ -125,7 +143,7 @@ class YourWorker:
To see a minimal worker example like the above used in a larger pipeline of
components, see the `dynamo serve`
[Hello World example](../examples/hello_world).
[Hello World example](../../examples/hello_world).
### Client Example
......@@ -176,13 +194,13 @@ that does something like the following:
- Forward responses back to client
This advanced scenario of a separate
[OpenAI Processor worker](../examples/llm/components/processor.py)
[OpenAI Processor worker](https://github.com/ai-dynamo/dynamo/blob/main/examples/llm/components/processor.py)
is demonstrated in this
[vLLM example](../examples/llm/).
[vLLM example](https://github.com/ai-dynamo/dynamo/tree/main/examples/llm).
For a more minimal example of deploying a pipeline of components with a custom
API that your client can communicate with, see the
[Hello World example](../examples/hello_world).
[Hello World example](../../examples/hello_world).
## Advanced Features
......@@ -192,7 +210,7 @@ KV-aware routing is a powerful feature of Dynamo that optimizes for routing
requests to specific workers while minimizing a specific KV-cache based cost function.
In its simplest form, all a worker needs to do to enable KV-aware routing is to
publish KV metrics through the `KvMetricsPublisher`, which will be consumed
publish KV metrics through the `KvMetricsPublisher`, which is consumed
by a Dynamo KV Router through the `KvMetricsAggregator`:
```python
......@@ -300,7 +318,7 @@ to a KV Router through the `KvMetricsAggregator`.
These metrics can then be inputs to a cost function to determine which
of the available workers the request should be routed to.
For a [python-based KV Router](../examples/llm/components/kv_router.py)
For a [python-based KV Router](../../examples/llm/components/kv_router.py)
implementation, the router is like any other worker, and it can expose
an endpoint that can do arbitrary things based on your use case.
......@@ -414,11 +432,11 @@ class Router:
Similarly, for running a Rust-based Router as a standalone binary
rather than as a Python Worker, see the
[WorkerSelector Trait](../lib/llm/src/kv_router.rs) trait, and the
[Router Component](../components/router/src/main.rs).
[WorkerSelector Trait](../../lib/llm/src/kv_router.rs) and the
[Router Component](../../components/router/src/main.rs).
For more details on receiving and routing based on the worker's published KV
metrics, see the [KV Cache Routing Guide](../docs/kv_cache_routing.md).
metrics, see the [KV Cache Routing Guide](../../docs/architecture/kv_cache_routing.md).
### Disaggregated Serving
......@@ -520,7 +538,7 @@ The NIXL connector provides:
- Efficient block-based KV cache transfers
- Asynchronous transfer notifications
For a complete implementation example using NIXL for disaggregated serving, see the [vLLM example](../examples/llm/README.md).
For a complete implementation example using NIXL for disaggregated serving, see the [vLLM example](../examples/llm_deployment.md).
#### Disaggregation in Dynamo
......@@ -589,13 +607,13 @@ Depending on the load distribution of requests and number of Prefill/Decode
worker instances, instead of directly forwarding requests to the Prefill
worker endpoint, it may be advantageous to send Prefill requests into a queue
that the Prefill workers can pull from on-demand instead. You can see an example
of that [here](../examples/hello_world/disagg_skeleton/components/prefill_worker.py).
of that [here](https://github.com/ai-dynamo/dynamo/blob/main/examples/hello_world/disagg_skeleton/components/prefill_worker.py).
For an introductory example on doing disaggregation with Dynamo using simple models, see
[this example](../examples/hello_world/disagg_skeleton).
[this example](../examples/disagg_skeleton.md).
For more information on Disaggregated Serving, see the
[general guide](../docs/disagg_serving.md) and [performance tuning guide](../docs/guides/disagg_perf_tuning.md).
[general guide](../architecture/disagg_serving.md) and [performance tuning guide](disagg_perf_tuning.md).
## Best Practices
......@@ -620,6 +638,6 @@ For more information on Disaggregated Serving, see the
## Additional Resources
- Check the [examples](../examples/) directory for more detailed implementations
- Refer to the [Dynamo SDK Docs](../deploy/sdk/docs/sdk/README.md) for API details.
- For Disaggregated Serving, see the [general guide](../docs/disagg_serving.md) and [performance tuning guide](../docs/guides/disagg_perf_tuning.md).
- Check the [examples](https://github.com/ai-dynamo/dynamo/tree/main/examples) directory for more detailed implementations
- Refer to the [Dynamo SDK Docs](../API/sdk.md) for API details.
- For Disaggregated Serving, see the [general guide](../architecture/disagg_serving.md) and [performance tuning guide](disagg_perf_tuning.md).
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# About the Dynamo Command Line Interface
The Dynamo CLI is a powerful tool for serving, containerizing, and deploying Dynamo applications. It leverages core pieces of the BentoML deployment stack and provides a range of commands to manage your Dynamo services.
The Dynamo CLI lets you:
- [`run`](#run) - quickly chat with a model
- [`serve`](#serve) - run a set of services locally (via `depends()` or `.link()`)
- [`build`](#build) - create an archive of your services (called a `bento`)
- [`deploy`](#deploy) - create a pipeline on Dynamo Cloud
## Commands
### `run`
The `run` command allows you to quickly chat with a model. Under the hood, it runs the `dynamo-run` Rust binary. For details, see [Running Dynamo](dynamo_run.md).
**Example**
```bash
dynamo run deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
```
### `serve`
The `serve` command lets you run a defined inference graph locally. You must point toward your file and intended class using file:Class syntax. For details, see [Serving Inference Graphs](dynamo_serve.md).
**Usage**
```bash
dynamo serve [SERVICE]
```
**Arguments**
- `SERVICE` - The service to start. You use file:Class syntax to specify the service.
**Flags**
- `--file`/`-f` - Path to optional YAML configuration file. An example of the YAML file can be found in the configuration section of the [SDK docs](../API/sdk.md)
- `--dry-run` - Print out the dependency graph and values without starting any services.
- `--service-name` - Only serve the specified service name. The rest of the discoverable components in the graph are not started.
- `--working-dir` - Specify the directory to find the Service instance
- Any additional flags that follow Class.key=value are passed to the service constructor for the target service and parsed. See the configuration section of the [SDK docs](../API/sdk.md) for more details.
**Example**
```bash
cd examples
# Spin up Frontend, Middle, and Backend components
dynamo serve hello_world:Frontend
# Spin up only the Middle component in the graph that is discoverable from the Frontend service
dynamo serve --service-name Middle hello_world:Frontend
```
### `build`
The `build` command allows you to package up your inference graph and its dependencies and create an archive of it. This is commonly paired with the `--containerize` flag to create a single Docker container that runs your inference graph. As with `serve`, you point toward the first service in your dependency graph. For details about `dynamo build`, see [Serving Inference Graphs](dynamo_serve.md).
**Usage**
```bash
dynamo build [SERVICE]
```
**Arguments**
- `SERVICE` - The service to build. You use file:Class syntax to specify the service.
**Flags**
- `--working-dir` - Specify the directory to find the Service instance
- `--containerize` - Whether to containerize the Bento after building
**Example**
```bash
cd examples/hello_world
dynamo build hello_world:Frontend
```
### `deploy`
The `deploy` command creates a pipeline on Dynamo Cloud using parameters at the prompt or using a YAML configuration file. For details, see [Deploying Inference Graphs to Kubernetes](dynamo_deploy/README.md).
**Usage**
```bash
dynamo deploy [PIPELINE]
```
**Arguments**
- `PIPELINE` - The pipeline to deploy. Defaults to *None*; required.
**Flags**
- `--name` or `-n` - Deployment name. Defaults to *None*; required.
- `--config-file` or `-f` - Configuration file path. Defaults to *None*; required.
- `--wait`/`--no-wait` - Whether or not to wait for the deployment to be ready. Defaults to wait.
- `--timeout` - The number of seconds that can elapse before the deployment times out. Defaults to 3600.
- `--endpoint` or `-e` - The Dynamo Cloud endpoint where the pipeline should be deployed. Defaults to *None*; required.
- `--help` or `-h` - Display in-line help for `dynamo deploy`.
**Example**
For a detailed example, see [Operator Deployment](dynamo_deploy/operator_deployment.md).
......@@ -15,7 +15,7 @@ See the License for the specific language governing permissions and
limitations under the License.
-->
# Dynamo Disaggregation: Performance Tuning
# Disaggregation and Performance Tuning
Disaggregation gains performance by separating the prefill and decode into different engines to reduce interference between the two. However, performant disaggregation requires careful tuning of the inference parameters. Specifically, there are three sets of parameters that need to be tuned:
......@@ -23,7 +23,7 @@ Disaggregation gains performance by separating the prefill and decode into diffe
1. Disaggregated router knobs.
1. Number of prefill and decode engines.
This guide will walk you through the process of tuning these parameters.
This guide describes the process of tuning these parameters.
## Engine Knobs
......@@ -48,13 +48,13 @@ TP Size | KV Cache Size (GB) | KV Cache per GPU (GB) | Per GPU Improvement over
The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed isl/osl/concurrency tests using [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) and comparing the results with the SLA. GenAI-Perf is pre-installed in the Dynamo container.
Besides the parallelization mapping, other common knobs to tune are maximum batch size, maximum number of tokens, and block size. For prefill engines, usually a small batch size and large max_num_token is preferred. For decode engines, usually a large batch size and medium max_num_token is preferred. More details on tuning the max_num_token and max_batch_size will be covered in the next section.
Besides the parallelization mapping, other common knobs to tune are maximum batch size, maximum number of tokens, and block size. For prefill engines, usually a small batch size and large max_num_token is preferred. For decode engines, usually a large batch size and medium max_num_token is preferred. For details on tuning the max_num_token and max_batch_size, see the next section.
For block size, if the block size is too small, it leads to small memory chunks in the P->D KV cache transfer and poor performance. A block size that is too small also leads to memory fragmentation in the attention calculation, but the impact is usually insignificant. If the block size is too large, it leads to a low prefix cache hit ratio. For most dense models, we find that a block size of 128 is a good choice.
## Disaggregated Router
Disaggregated router decides whether to prefill a request in the remote prefill engine or locally in the decode engine using chunked prefill. For most frameworks, when chunked prefill is enabled and one forward iteration gets a mixture of prefilling and decoding request, three kernels will be launched:
The disaggregated router decides whether to prefill a request in the remote prefill engine or locally in the decode engine using chunked prefill. For most frameworks, when chunked prefill is enabled and one forward iteration gets a mixture of prefilling and decoding requests, three kernels are launched:
1. The attention kernel for context tokens (context_fmha kernel in trtllm).
2. The attention kernel for decode tokens (xqa kernel in trtllm).
3. Dense kernel for the combined active tokens in prefills and decodes.
......@@ -63,7 +63,7 @@ Disaggregated router decides whether to prefill a request in the remote prefill
In the prefill engine, the best strategy is to operate at the smallest batch size that saturates the GPUs so that the average TTFT is minimized. For example, for llama3.3 70b NVFP4 quantization on B200 TP1 in vllm, the below figure shows the prefill time with different isl (prefix caching is turned off):
![Prefill Time](../images/prefill_time.png)
![Combined bar and line chart showing "Prefill Time". Bar chart represents TTFT (Time To First Token) in milliseconds against ISL (Input Sequence Length). The line chart shows TTFT/ISL (milliseconds per token) against ISL.](../images/prefill_time.png)
For isl less than 1000, the prefill efficiency is low because the GPU is not fully saturated. For isl larger than 4000, the prefill time per token increases because the attention takes longer to compute with a longer history.
......
......@@ -15,170 +15,26 @@ See the License for the specific language governing permissions and
limitations under the License.
-->
# 🔨 [Experimental] Using `dynamo build` to containerize inference graphs
# Building Dynamo (`dynamo build`)
This guide explains how to use the `dynamo build` command to containerize Dynamo inference graphs (pipelines) for deployment.
## Table of Contents
`dynamo build` is a command-line tool that helps containerize inference graphs created with Dynamo SDK. Run `dynamo build --containerize` to build a stand-alone Docker container that encapsulates your entire inference graph. This image can then be shared and run standalone.
- [What is dynamo build?](#what-is-dynamo-build)
- [Building a containerized inference graph](#building-a-containerized-inference-graph)
- [Guided Example for containerizing Hello World pipeline](#guided-example-for-containerizing-hello-world-pipeline)
- [Guided Example for containerizing LLM pipeline](#guided-example-for-containerizing-llm-pipeline)
## What is dynamo build?
`dynamo build` is a command-line tool that helps containerize inference graphs created with Dynamo SDK. Simply run `dynamo build --containerize` to build a stand-alone Docker container that encapsulates your entire inference graph. This image can then be shared and run standalone.
**Note:** This is currently an experimental feature and has only been tested on the examples available in the `examples/` directory. You may have to make some modifications, in particular if your inference graph introduces custom dependencies.
```{note}
This is an experimental feature that has only been tested on the examples in the `examples/` directory. You may need to make some modifications, particularly if your inference graph introduces custom dependencies.
```
## Building a containerized inference graph
The basic workflow for using `dynamo build` involves:
The basic workflow for using `dynamo build` includes:
1. Defining your inference graph and testing locally with `dynamo serve`
2. Specifying a base image for your inference graph. More on this below.
3. Running `dynamo build` to build a containerized inference graph
1. Defining your inference graph and testing locally with `dynamo serve`
2. Specifying a base image for your inference graph
3. Running `dynamo build` to build a containerized inference graph
### Basic usage
### Basic Usage
```bash
dynamo build <graph_definition> --containerize
```
## Guided Example for containerizing Hello World pipeline
This section will walk through a complete example of building a containerized inference graph. In this example, we simply containerize the Hello World pipeline available at `examples/hello_world`
### 1. Define your graph and check that it works with `dynamo serve`
```bash
cd examples/hello_world
dynamo serve hello_world:Frontend
```
### 2. Build a base image
We intend to support 2 base images which will be available as buildable targets in the top-level Earthfile. You may then use one of these images as the base image to build your inference graph.
1. Leaner image without CUDA and vLLM making it suitable for CPU only deployments. This is what we will use for the Hello World example. Available as `dynamo-base-docker` in the top-level Earthfile.
2. Base image with CUDA and vLLM making it suitable for GPU deployments. **Note:** While this is not yet available in the top-level Earthfile, you may use `dynamo:latest-vllm` image created from running `./container/build.sh` as a valid base image for this purpose.
```bash
export CI_REGISTRY_IMAGE=my-registry
export CI_COMMIT_SHA=hello-world
earthly +dynamo-base-docker --CI_REGISTRY_IMAGE=$CI_REGISTRY_IMAGE --CI_COMMIT_SHA=$CI_COMMIT_SHA
# Image should succesfully be built and tagged as my-registry/dynamo-base-docker:hello-world
```
### 3. Containerize your graph with `dynamo build`
```bash
export DYNAMO_IMAGE=my-registry/dynamo-base-docker:hello-world
dynamo build hello_world:Frontend --containerize
# Output will contain tag for the newly created image
# e.g frontend-hello-world:latest
```
### 4. Run your container
As a prerequisite, ensure you have NATS and etcd running by running the docker compose in the deploy directory. You can find it [here](../../deploy/metrics/docker-compose.yml).
```bash
docker compose up -d
```
Starting your container with host networking and required environment variables:
```bash
# Host networking is required for NATS and etcd to be accessible from the container
docker run --network host \
--entrypoint bash \
--ipc host \
frontend:<generated_tag> \
-c "cd src && uv run dynamo serve hello_world:Frontend"
```
Test your containerized Dynamo services:
```bash
curl -X 'POST' \
'http://localhost:8000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
"text": "test"
}'
```
## Guided Example for containerizing LLM pipeline
This section will walk through an example of building a containerized LLM inference graph using the example available at `examples/llm`.
### 1. Define your graph and check that it works with `dynamo serve`
```bash
cd examples/llm
dynamo serve graphs.agg:Frontend -f ./configs/agg.yaml
```
### 2. Build a base image
For LLM inference, we'll use the GPU-enabled base image with CUDA and vLLM support. You can use the `dynamo:latest-vllm` image created from running `./container/build.sh` as the base image.
```bash
# Build the base image with CUDA and vLLM support
./container/build.sh
# This will create dynamo:latest-vllm image
```
### 3. Containerize your graph with `dynamo build`
```bash
export DYNAMO_IMAGE=dynamo:latest-vllm
dynamo build graphs.agg:Frontend --containerize
# Output will contain tag for the newly created image
# e.g frontend-llm-agg:latest
```
### 4. Run your container
As a prerequisite, ensure you have NATS and etcd running by running the docker compose in the deploy directory. You can find it [here](../../deploy/metrics/docker-compose.yml).
```bash
docker compose up -d
```
Starting your container with host networking and required environment variables:
```bash
# Host networking is required for NATS and etcd to be accessible from the container
docker run --network host \
--entrypoint sh \
--gpus all \
--shm-size 10G \
--ipc host \
frontend:<generated_tag> \
-c "cd src && uv run dynamo serve graphs.agg:Frontend -f ./configs/agg.yaml"
```
### 5. Test your containerized LLM service
Once the container is running, you can test it by making a request to the service:
```bash
curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
],
"stream": false,
"max_tokens": 30
}'
```
```
\ No newline at end of file
......@@ -15,15 +15,18 @@ See the License for the specific language governing permissions and
limitations under the License.
-->
# Deploying Dynamo Inference Graphs to Kubernetes
# Deploying Inference Graphs to Kubernetes (`dynamo deploy`)
This guide provides an overview of the different deployment options available for Dynamo inference graphs in Kubernetes environments.
This guide explains the deployment options available for Dynamo inference graphs in Kubernetes environments.
## Deployment Options
Dynamo provides two distinct deployment paths, each serving different use cases:
Dynamo provides two distinct deployment options that each serve different use cases:
1. Dynamo Cloud Kubernetes Platform is preferred in cases that support it
2. Manual Deployment with Helm Charts is suited to users who need more control over their deployments
### 1. 🚀 Dynamo Cloud Kubernetes Platform [PREFERRED]
### Dynamo Cloud Kubernetes Platform [PREFERRED]
The Dynamo Cloud Platform (`deploy/cloud/`) provides a managed deployment experience:
......@@ -35,9 +38,9 @@ For detailed instructions on using the Dynamo Cloud Platform, see:
- [Dynamo Cloud Platform Guide](dynamo_cloud.md): walks through installing and configuring the Dynamo cloud components on your Kubernetes cluster.
- [Operator Deployment Guide](operator_deployment.md)
### 2. Manual Deployment with Helm Charts
### Manual Deployment with Helm Charts
The manual deployment path (`deploy/helm/`) is available for users who need more control over their deployments:
Users who need more control over their deployments can use the manual deployment path (`deploy/helm/`):
- Used for manually deploying inference graphs to Kubernetes
- Contains Helm charts and configurations for deploying individual inference pipelines
......@@ -47,18 +50,18 @@ The manual deployment path (`deploy/helm/`) is available for users who need more
- [Using the Deployment Script](manual_helm_deployment.md#using-the-deployment-script): all-in-one script for manual deployment
- [Helm Deployment Guide](manual_helm_deployment.md#helm-deployment-guide): detailed instructions for manual deployment
## Getting Started
## Getting Started
1. **For Dynamo Cloud Platform**:
- Follow the [Dynamo Cloud Platform Guide](dynamo_cloud.md)
- Deploy a Hello World pipeline using the [Operator Deployment Guide](operator_deployment.md)
- Deploy a Dynamo LLM pipeline to Kubernetes [Deploy LLM Guide](../../../examples/llm/README.md#deploy-to-kubernetes)
- Deploy a Dynamo LLM pipeline to Kubernetes [Deploy LLM Guide](../../examples/llm_deployment.md#deploy-to-kubernetes)
2. **For Manual Deployment**:
- Follow the [Manual Helm Deployment Guide](manual_helm_deployment.md)
## Example Deployment
## Example Deployments
See the [Hello World example](../../../examples/hello_world/README.md#deploying-to-and-running-the-example-in-kubernetes) for a complete walkthrough of deploying a simple inference graph.
See the [Hello World example](../../examples/hello_world.md#deploying-to-and-running-the-example-in-kubernetes) for a complete walkthrough of deploying a simple inference graph.
See the [LLM example](../../../examples/llm/README.md#deploy-to-kubernetes) for a complete walkthrough of deploying a production-ready LLM inference pipeline to Kubernetes.
\ No newline at end of file
See the [LLM example](../../examples/llm_deployment.md#deploy-to-kubernetes) for a complete walkthrough of deploying a production-ready LLM inference pipeline to Kubernetes.
\ No newline at end of file
......@@ -15,11 +15,11 @@ See the License for the specific language governing permissions and
limitations under the License.
-->
# 🚀 Dynamo Cloud Kubernetes Platform (Dynamo Deploy)
# Dynamo Cloud Kubernetes Platform (Dynamo Deploy)
The Dynamo Cloud platform is a comprehensive solution for deploying and managing Dynamo inference graphs (also referred to as pipelines) in Kubernetes environments. It provides a streamlined experience for deploying, scaling, and monitoring your inference services. You can interface with Dynamo Cloud using the `deploy` subcommand available in the Dynamo CLI (e.g `dynamo deploy`)
The Dynamo Cloud platform is a comprehensive solution for deploying and managing Dynamo inference graphs (also referred to as pipelines) in Kubernetes environments. It provides a streamlined experience for deploying, scaling, and monitoring your inference services. You can interface with Dynamo Cloud using the `deploy` subcommand available in the Dynamo CLI (for example, `dynamo deploy`).
## 📋 Overview
## Overview
The Dynamo cloud platform consists of several key components:
......@@ -29,9 +29,9 @@ The Dynamo cloud platform consists of several key components:
These components work together to provide a seamless deployment experience, handling everything from containerization to scaling and monitoring.
![Dynamo Deploy](../../images/dynamo-deploy.png)
![Dynamo Deploy system deployment diagram.](../../images/dynamo-deploy.png)
## 🎯 Prerequisites
## Prerequisites
Before getting started with the Dynamo cloud platform, ensure you have:
......@@ -42,14 +42,15 @@ Before getting started with the Dynamo cloud platform, ensure you have:
- `kubectl` configured to access your cluster
- Helm installed (version 3.0 or later)
> [!TIP]
> Don't have a Kubernetes cluster? Check out our [Minikube setup guide](./minikube.md) to set up a local environment! 🏠
```{tip}
Don't have a Kubernetes cluster? Check out our [Minikube setup guide](./minikube.md) to set up a local environment!
```
## 🏗️ Building Docker Images for Dynamo Cloud Components
## Building Docker Images for Dynamo Cloud Components
The Dynamo cloud platform components need to be built and pushed to a container registry before deployment. You can build these components individually or all at once.
### ⚙️ Setting Up Environment Variables
### Setting Up Environment Variables
First, set the required environment variables for building and pushing images:
......@@ -64,13 +65,13 @@ Where:
- `<CONTAINER_REGISTRY>`: Your container registry (e.g., `nvcr.io`, `docker.io/<your-username>`, etc.)
- `<TAG>`: The version tag for your images (e.g., `latest`, `0.0.1`, `v1.0.0`)
> [!IMPORTANT]
> Make sure you're logged in to your container registry before pushing images:
> ```bash
> docker login <CONTAINER_REGISTRY>
> ```
````{important}
Make sure you're logged in to your container registry before pushing images:
```bash
docker login <CONTAINER_REGISTRY>
```
````
### 🛠️ Building Components
### Building Components
You can build and push all platform components at once:
......@@ -78,15 +79,15 @@ You can build and push all platform components at once:
earthly --push +all-docker --DOCKER_SERVER=$DOCKER_SERVER --IMAGE_TAG=$IMAGE_TAG
```
## 🚀 Deploying the Dynamo Cloud Platform
## Deploying the Dynamo Cloud Platform
Once you've built and pushed the components, you can deploy the platform to your Kubernetes cluster.
### 📋 Prerequisites
### Prerequisites
Before deploying Dynamo Cloud, ensure your Kubernetes cluster meets the following requirements:
#### 1. 💾 PVC Support with Default Storage Class
#### PVC Support with Default Storage Class
Dynamo Cloud requires Persistent Volume Claim (PVC) support with a default storage class. Verify your cluster configuration:
```bash
......@@ -99,7 +100,7 @@ kubectl get storageclass
# standard (default) kubernetes.io/gce-pd Delete Immediate true 1d
```
### 📥 Installation
### Installation
1. Set the required environment variables:
```bash
......@@ -110,12 +111,13 @@ export IMAGE_TAG=<TAG> # Use the same tag you used when building the images
export NAMESPACE=dynamo-cloud # change this to whatever you want!
```
> [!NOTE]
> DOCKER_USERNAME and DOCKER_PASSWORD are optional and only needed if you want to pull docker images from a private registry.
> A docker image pull secret will be created automatically if these variables are set. Its name will be `docker-imagepullsecret` unless overridden by the `DOCKER_SECRET_NAME` environment variable.
``` {note}
DOCKER_USERNAME and DOCKER_PASSWORD are optional and only needed if you want to pull docker images from a private registry.
A docker image pull secret is created automatically if these variables are set. Its name is `docker-imagepullsecret` unless overridden by the `DOCKER_SECRET_NAME` environment variable.
```
The Dynamo Cloud Platform auto-generates docker images for pipelines and pushes them to a container registry.
By default, the platform will use the same container registry as the platform components (specified by `DOCKER_SERVER`).
By default, the platform uses the same container registry as the platform components (specified by `DOCKER_SERVER`).
However, you can specify a different container registry for pipelines by additionally setting the following environment variables:
```bash
......@@ -136,7 +138,7 @@ export ISTIO_ENABLED="true"
export ISTIO_GATEWAY="istio-system/istio-ingressgateway" # or whatever istio gateway you have configured
```
Running the installation script with `--interactive` will guide you through the process of exposing your Dynamo Cloud Platform externally if you don't want to set these environment variables manually.
Running the installation script with `--interactive` guides you through the process of exposing your Dynamo Cloud Platform externally if you don't want to set these environment variables manually.
2. [One-time Action] Create a new kubernetes namespace and set it as your default.
......@@ -158,10 +160,11 @@ if you wish to be guided through the deployment process, you can run the deploy
./deploy.sh --interactive
```
4. 🌐 **Expose Dynamo Cloud Externally**
4. **Expose Dynamo Cloud Externally**
> [!NOTE]
> The script will automatically display information about the endpoint you can use to access Dynamo Cloud. In our docs, we refer to this externally available endpoint as `DYNAMO_CLOUD`.
``` {note}
The script automatically displays information about the endpoint you can use to access Dynamo Cloud. In our docs, we refer to this externally available endpoint as `DYNAMO_CLOUD`.
```
The simplest way to expose the `dynamo-store` service within the namespace externally is to use a port-forward:
......@@ -170,12 +173,12 @@ kubectl port-forward svc/dynamo-store <local-port>:80 -n $NAMESPACE
export DYNAMO_CLOUD=http://localhost:<local-port>
```
## 🎯 Next Steps
## Next Steps
After deploying the Dynamo cloud platform, you can:
1. Deploy your first inference graph using the [Dynamo CLI](operator_deployment.md)
2. Deploy Dynamo LLM pipelines to Kubernetes using the [Dynamo CLI](../../../examples/llm/README.md)!
2. Deploy Dynamo LLM pipelines to Kubernetes using the [Dynamo CLI](../../examples/llm_deployment.md)!
3. Manage your deployments using the Dynamo CLI
For more detailed information about deploying inference graphs, see the [Dynamo Deploy Guide](README.md).
......@@ -15,9 +15,10 @@ See the License for the specific language governing permissions and
limitations under the License.
-->
<a id="k8-helm-deploy"></a>
# Deploying Dynamo Inference Graphs to Kubernetes using Helm
This guide will walk you through the process of deploying an inference graph created using the Dynamo SDK onto a Kubernetes cluster.
This guide describes the deployment process of an inference graph created using the Dynamo SDK onto a Kubernetes cluster.
While this guide covers deployment of Dynamo inference graphs using Helm, the preferred method to deploy an inference graph is to [deploy with the Dynamo cloud platform](operator_deployment.md). The [Dynamo cloud platform](dynamo_cloud.md) simplifies the deployment and management of Dynamo inference graphs. It includes a set of components (Operator, Kubernetes Custom Resources, etc.) that work together to streamline the deployment and management process.
......@@ -28,8 +29,6 @@ Once an inference graph is defined using the Dynamo SDK, it can be deployed onto
3. Enabling autoscaling, monitoring, and observability for the inference graph
4. Easy administration of deployments via UI
The Dynamo Kubernetes Operator will be released soon.
## Helm Deployment Guide
### Setting up MicroK8s
......@@ -105,7 +104,7 @@ helm install --namespace ${NAMESPACE} ${RELEASE_NAME}-etcd \
--values etcd-values.yaml
```
After completing these steps, your cluster will have the necessary messaging and storage infrastructure for running Dynamo inference graphs.
After completing these steps, your cluster has the necessary messaging and storage infrastructure for running Dynamo inference graphs.
### Building and Deploying the Pipeline
......@@ -113,8 +112,9 @@ Follow these steps to containerize and deploy your inference pipeline:
1. Build and containerize the pipeline:
> [!NOTE]
> For instructions on building and pushing the Dynamo base image, see the [Building the Dynamo Base Image](../../README.md#building-the-dynamo-base-image) section in the main README.
``` {note}
For instructions on building and pushing the Dynamo base image, see [Building the Dynamo Base Image](../../get_started.md#building-the-dynamo-base-image).
```
```bash
# Navigate to example directory
......
......@@ -15,20 +15,21 @@ See the License for the specific language governing permissions and
limitations under the License.
-->
# 🏠 Minikube Setup Guide
# Minikube Setup Guide
Don't have a Kubernetes cluster? No problem! You can set up a local development environment using Minikube. This guide will walk you through setting up everything you need to run Dynamo Cloud locally.
Don't have a Kubernetes cluster? No problem! You can set up a local development environment using Minikube. This guide walks through the setup of everything you need to run Dynamo Cloud locally.
## Setting Up Minikube
### 1. Install Minikube
First things first! You'll need to install Minikube. Follow the official [Minikube installation guide](https://minikube.sigs.k8s.io/docs/start/) for your operating system.
First things first! Start by installing Minikube. Follow the official [Minikube installation guide](https://minikube.sigs.k8s.io/docs/start/) for your operating system.
### 2. Configure GPU Support (Optional)
Planning to use GPU-accelerated workloads? You'll need to configure GPU support in Minikube. Follow the [Minikube GPU guide](https://minikube.sigs.k8s.io/docs/tutorials/nvidia/) to set up NVIDIA GPU support before proceeding.
> [!TIP]
> Make sure to configure GPU support before starting Minikube if you plan to use GPU workloads!
```{tip}
Make sure to configure GPU support before starting Minikube if you plan to use GPU workloads!
```
### 3. Start Minikube
Time to launch your local cluster!
......@@ -61,8 +62,3 @@ kubectl get storageclass
Once your local environment is set up, you can proceed with the [Dynamo Cloud deployment guide](./dynamo_cloud.md) to deploy the platform to your local cluster.
## Coming Soon
- MicroK8s setup guide
- Kind setup guide
- More local development tips and tricks
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Deploying Dynamo Inference Graphs to Kubernetes using the Dynamo Cloud Platform
This guide walks you through deploying an inference graph created with the Dynamo SDK onto a Kubernetes cluster using the Dynamo cloud platform and the Dynamo deploy CLI. The Dynamo cloud platform provides a streamlined experience for deploying and managing your inference services.
......@@ -6,16 +23,15 @@ This guide walks you through deploying an inference graph created with the Dynam
Before proceeding with deployment, ensure you have:
- [Dynamo Python package](../README.md#installation) installed
- [Dynamo Python package](../../get_started.md#installation) installed
- A Kubernetes cluster with the [Dynamo cloud platform](dynamo_cloud.md) installed
- Ubuntu 24.04 as the base image for your services
- Required dependencies:
- Helm package manager
- Rust packages and toolchain
You must have first followed the instructions in [deploy/cloud/helm/README.md](../../../deploy/cloud/helm/README.md) to install Dynamo Cloud on your Kubernetes cluster.
You must have first followed the instructions in [deploy/dynamo/helm/README.md](https://github.com/ai-dynamo/dynamo/blob/main/deploy/dynamo/cloud/helm/README.md) to install Dynamo Cloud on your Kubernetes cluster.
**Note**: The `KUBE_NS` variable in the following steps must match the Kubernetes namespace where you installed Dynamo Cloud. You must also expose the `dynamo-store` service externally. This is the endpoint the CLI uses to interface with Dynamo Cloud.
## Understanding the Deployment Process
The deployment process involves two main steps:
......@@ -74,14 +90,15 @@ export KUBE_NS=hello-world
export DYNAMO_CLOUD=https://dynamo-cloud.nvidia.com # Replace with your actual endpoint
```
> [!NOTE]
> The `DYNAMO_CLOUD` environment variable is required for all Dynamo deployment commands. Make sure it's set before running any deployment operations.
``` {note}
The `DYNAMO_CLOUD` environment variable is required for all Dynamo deployment commands. Make sure it's set before running any deployment operations.
```
### 2. Build the Dynamo Base Image
Before building your service, you need to ensure the base image is properly set up:
1. For detailed instructions on building and pushing the Dynamo base image, see the [Building the Dynamo Base Image](../../../README.md#building-the-dynamo-base-image) section in the main README.
1. For detailed instructions on building and pushing the Dynamo base image, see [Building the Dynamo Base Image](../../get_started.md#building-the-dynamo-base-image).
2. Export the image from the previous step to your environment.
```bash
......@@ -135,8 +152,10 @@ To remove a deployment and all its associated resources:
```bash
dynamo deployment delete $DEPLOYMENT_NAME
```
> [!WARNING]
> This command will permanently delete the deployment and all associated resources. Make sure you have any necessary backups before proceeding.
```{warning}
This command permanently deletes the deployment and all associated resources. Make sure you have any necessary backups before proceeding.
```
### 4. Test the Deployment
......
# Dynamo Run
# Running Dynamo (`dynamo run`)
* [Quickstart with pip and vllm](#quickstart-with-pip-and-vllm)
* [Automatically download a model from Hugging Face](#use-model-from-hugging-face)
......@@ -19,9 +19,12 @@
* [Defaults](#defaults)
* [Extra engine arguments](#extra-engine-arguments)
`dynamo-run` is a CLI tool for exploring the Dynamo components, and an example of how to use them from Rust. It is also available as `dynamo run` if using the Python wheel.
It supports the following engines: mistralrs, llamacpp, sglang, vllm and tensorrt-llm. `mistralrs` is the default.
This guide explains the `dynamo run` command.
`dynamo-run` is a CLI tool for exploring the Dynamo components. It's also an example of how to use components from Rust. If you use the Python wheel, it's available as `dynamo run`.
It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm. `mistralrs` is the default.
Usage:
```
......@@ -30,57 +33,66 @@ dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mi
Example: `dynamo run Qwen/Qwen3-0.6B`
Set environment variable `DYN_LOG` to adjust logging level, e.g. `export DYN_LOG=debug`. It has the same syntax as `RUST_LOG`, ask AI for details.
Set the environment variable `DYN_LOG` to adjust the logging level; for example, `export DYN_LOG=debug`. It has the same syntax as `RUST_LOG`.
## Quickstart with pip and vllm
If you used `pip` to install `dynamo` you should have the `dynamo-run` binary pre-installed with the `vllm` engine. You must be in a virtual env with vllm installed to use this. To compile from source, see "Full documentation" below.
If you used `pip` to install `dynamo`, you have the `dynamo-run` binary pre-installed with the `vllm` engine. You must be in a virtual environment with vllm installed to use this engine. To compile from source, see [Full usage details](#full-usage-details) below.
The vllm and sglang engines require [etcd](https://etcd.io/) and [nats](https://nats.io/) with jetstream (`nats-server -js`). Mistralrs and llamacpp do not.
### Use model from Hugging Face
This will automatically download Qwen3 4B from Hugging Face (16 GiB download) and start it in interactive text mode:
To automatically download Qwen3 4B from Hugging Face (16 GiB download) and start it in interactive text mode, run:
```
dynamo run out=vllm Qwen/Qwen3-4B
```
General format for HF download:
The general format for HF download follows this pattern:
```
dynamo run out=<engine> <HUGGING_FACE_ORGANIZATION/MODEL_NAME>
```
For gated models (e.g. meta-llama/Llama-3.2-3B-Instruct) you have to have an `HF_TOKEN` environment variable set.
For gated models (such as meta-llama/Llama-3.2-3B-Instruct), you must set an `HF_TOKEN` environment variable.
The parameter can be the ID of a HuggingFace repository (it will be downloaded), a GGUF file, or a folder containing safetensors, config.json, etc (a locally checked out HuggingFace repository).
The parameter can be the ID of a HuggingFace repository (which will be downloaded), a GPT-Generated Unified Format (GGUF) file, or a folder containing safetensors, config.json, or similar (perhaps a locally checked out HuggingFace repository).
### Run a model from local file
#### Step 1: Download model from Hugging Face
One of these models should be high quality and fast on almost any machine: https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF
E.g. https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/blob/main/Llama-3.2-3B-Instruct-Q4_K_M.gguf
To run a model from local file:
- Download the model from Hugging Face
- Run the model from local file
See the following sections for details.
Download model file:
#### Download model from Hugging Face
One of the models available from Hugging Face should be high quality and fast on almost any machine: https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF
For example, try https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/blob/main/Llama-3.2-3B-Instruct-Q4_K_M.gguf
To download the model file:
```
curl -L -o Llama-3.2-3B-Instruct-Q4_K_M.gguf "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_K_M.gguf?download=true"
```
#### Run model from local file
**Text interface**
To run the model:
*Text interface*
```
dynamo run Llama-3.2-3B-Instruct-Q4_K_M.gguf # or path to a Hugging Face repo checkout instead of the GGUF
dynamo run Llama-3.2-3B-Instruct-Q4_K_M.gguf # or path to a Hugging Face repo checkout instead of the GGUF file
```
**HTTP interface**
*HTTP interface*
```
dynamo run in=http out=mistralrs Llama-3.2-3B-Instruct-Q4_K_M.gguf
```
You can also list models or send a request:
**List the models**
*List the models*
```
curl localhost:8080/v1/models
```
**Send a request**
*Send a request*
```
curl -d '{"model": "Llama-3.2-3B-Instruct-Q4_K_M", "max_completion_tokens": 2049, "messages":[{"role":"user", "content": "What is the capital of South Africa?" }]}' -H 'Content-Type: application/json' http://localhost:8080/v1/chat/completions
```
......@@ -91,24 +103,20 @@ You can run the ingress side (HTTP server and pre-processing) on one machine, fo
You will need [etcd](https://etcd.io/) and [nats](https://nats.io) with jetstream installed and accessible from both nodes.
**Node 1:**
OpenAI compliant HTTP server, optional pre-processing, worker discovery.
**Node 1:** OpenAI compliant HTTP server, optional pre-processing, worker discovery:
```
dynamo-run in=http out=dyn
```
**Node 2:**
Vllm engine. Receives and returns requests over the network.
**Node 2:** Vllm engine. Receives and returns requests over the network:
```
dynamo-run in=dyn://llama3B.backend.generate out=vllm ~/llms/Llama-3.2-3B-Instruct
```
This will use etcd to auto-discover the model and NATS to talk to it. You can
run multiple instances on the same endpoint and it will pick one based on the
This uses etcd to auto-discover the model and NATS to talk to it. You can
run multiple instances on the same endpoint; it picks one based on the
`--router-mode` (round-robin by default if left unspecified).
Run `dynamo-run --help` for more options.
......@@ -152,16 +160,37 @@ For output it is always only `out=dyn`. This tells Dynamo to auto-discover the i
**Setup**
Only patched vllm currently supports KV-aware routing. Key setup steps:
1. `etcd` and `nats` (see earlier) must be running and accessible from all nodes.
1. Create a virtualenv: `uv venv kvtest`, source it's `activate`.
1. EITHER install Dynamo's vllm branch: `uv pip install ai-dynamo-vllm`,
1. OR install upstream vllm 0.8.4 (`uv pip install vllm==0.8.4`) and patch it: `cd kvtest/lib/python3.12/site-packages`, `patch -p1 < $REPO_ROOT/container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch`.
1. Build the C bindings. `cd $REPO_ROOT/lib/bindings/c`. `cargo build`.
1. Put the library you just built on library path: `export LD_LIBRARY_PATH=$REPO_ROOT/target/debug/`.
If you patched locally (instead of installing `ai-dynamo-vllm`) you will need to edit vllm's `platforms/__init__.py` to undo a patch change:
Currently, only patched vllm supports KV-aware routing.
To set up KV-aware routing on patched vllm:
1. Ensure that `etcd` and `nats` (see [Quickstart with pip and vllm](#quickstart-with-pip-and-vllm)) are running and accessible from all nodes.
1. Create a virtualenv: `uv venv kvtest` and source its `activate`.
1. Use `pip` to **either**:
1. Install Dynamo's vllm branch:
```
uv pip install ai-dynamo-vllm
```
**or**
1. Install upstream vllm 0.8.4:
```
uv pip install vllm==0.8.4
```
And then patch it:
```
cd kvtest/lib/python3.12/site-packages
patch -p1 < $REPO_ROOT/container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch
```
1. Build the C bindings:
```
cd $REPO_ROOT/lib/bindings/c
cargo build
```
1. Put the library you just built on library path:
```
export LD_LIBRARY_PATH=$REPO_ROOT/target/debug/
```
If you patched locally (instead of installing `ai-dynamo-vllm`), edit vllm's `platforms/__init__.py` to undo a patch change:
```
#vllm_version = version("ai_dynamo_vllm")
vllm_version = version("vllm")
......@@ -169,7 +198,7 @@ If you patched locally (instead of installing `ai-dynamo-vllm`) you will need to
**Start the workers**
The workers are started normally.
The workers are started normally:
```
dynamo-run in=dyn://dynamo.endpoint.generate out=vllm /data/llms/Qwen/Qwen3-4B
......@@ -181,17 +210,19 @@ dynamo-run in=dyn://dynamo.endpoint.generate out=vllm /data/llms/Qwen/Qwen3-4B
dynamo-run in=http out=dyn --router-mode kv
```
The only difference from the distributed system above is `--router-mode kv`. The patched vllm will announce when a KV block is created or removed. The Dynamo router run will find the worker with the best match for those KV blocks and direct the traffic to that node.
The only difference from the distributed system above is `--router-mode kv`. The patched vllm announces when a KV block is created or removed. The Dynamo router run finds the worker with the best match for those KV blocks and directs the traffic to that node.
For performance testing compare a typical workload with `--router-mode random|round-robin` to see if it will benefit from KV-aware routing.
For performance testing, compare a typical workload with `--router-mode random|round-robin` to see if it can benefit from KV-aware routing.
## Full usage details
`dynamo-run` is what `dynamo run` executes. It is also an example of what you can build in Rust with the `dynamo-llm` and `dynamo-runtime` crates. The following guide demonstrates how you can build from source with all the features.
`dynamo run` executes `dynamo-run`. `dynamo-run` is also an example of what can be built in Rust with the `dynamo-llm` and `dynamo-runtime` crates. The following guide shows how to build from source with all the features.
### Setup
### Getting Started
#### Step 1: Install libraries
#### Setup
##### Step 1: Install libraries
**Ubuntu:**
```
sudo apt install -y build-essential libhwloc-dev libudev-dev pkg-config libssl-dev libclang-dev protobuf-compiler python3-dev cmake
......@@ -208,18 +239,18 @@ sudo apt install -y build-essential libhwloc-dev libudev-dev pkg-config libssl-d
```
brew install cmake protobuf
# Check that Metal is accessible
# Check that Metal is accessible
xcrun -sdk macosx metal
```
If Metal is accessible, you should see an error like `metal: error: no input files`, which confirms it is installed correctly.
#### Step 2: Install Rust
##### Step 2: Install Rust
```
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
source $HOME/.cargo/env
```
#### Step 3: Build
##### Step 3: Build
- Linux with GPU and CUDA (tested on Ubuntu):
```
......@@ -243,14 +274,20 @@ Optionally you can run `cargo build` from any location with arguments:
--manifest-path /path/to/project/Cargo.toml` # if cargo build is run outside of `launch/` directory
```
The binary will be called `dynamo-run` in `target/debug`
The binary is called `dynamo-run` in `target/debug`
```
cd target/debug
```
Build with `--release` for a smaller binary and better performance, but longer build times. The binary will be in `target/release`.
### mistralrs
#### Defaults
The input defaults to `in=text`. The output defaults to `out=mistralrs` engine, unless it is disabled with `--no-default-features` in which case vllm is used.
### Running Inference with Pre-built Engines
#### mistralrs
[mistral.rs](https://github.com/EricLBuehler/mistral.rs) is a pure Rust engine that is fast to run, fast to load, supports GGUF as well as safetensors, and runs well on CPU as well as GPU. For those reasons it is the default engine.
......@@ -266,7 +303,7 @@ dynamo-run in=text out=mistralrs Qwen/Qwen3-4B
If you have multiple GPUs, mistral.rs does automatic tensor parallelism. You do not need to pass any extra flags to dynamo-run to enable it.
### llamacpp
#### llamacpp
Currently [llama.cpp](https://github.com/ggml-org/llama.cpp) is not included by default. Build it like this:
......@@ -279,14 +316,14 @@ dynamo-run out=llamacpp ~/llms/gemma-3-1b-it-q4_0.gguf
dynamo-run out=llamacpp ~/llms/Qwen3-0.6B-Q8_0.gguf # From https://huggingface.co/ggml-org
```
Note that in some cases we are unable to extract the tokenizer from the GGUF, and so a Hugging Face checkout of a matching model must also be passed. Dynamo will use the weights from the GGUF and the pre-processor (`tokenizer.json`, etc) from the `--model-config`:
Note that in some cases we are unable to extract the tokenizer from the GGUF, and so a Hugging Face checkout of a matching model must also be passed. Dynamo uses the weights from the GGUF and the pre-processor (`tokenizer.json`, etc) from the `--model-config`:
```
dynamo-run out=llamacpp ~/llms/Llama-4-Scout-17B-16E-Instruct-UD-IQ1_S.gguf --model-config ~/llms/Llama-4-Scout-17B-16E-Instruct
```
If you have multiple GPUs, llama.cpp does automatic tensor parallelism. You do not need to pass any extra flags to dynamo-run to enable it.
### sglang
#### sglang
The [SGLang](https://docs.sglang.ai/index.html) engine requires [etcd](https://etcd.io/) and [nats](https://nats.io/) with jetstream (`nats-server -js`) to be running.
......@@ -302,14 +339,14 @@ uv pip install "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124
2. Run
Any example above using `out=sglang` will work, but our sglang backend is also multi-gpu.
Any example above using `out=sglang` works, and our sglang backend also supports multiple GPUs.
```
cd target/debug
./dynamo-run in=http out=sglang --model-path ~/llms/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8
```
To pass extra arguments to the sglang engine see *Extra engine arguments* below.
To pass extra arguments to the sglang engine see [Extra engine arguments](#extra-engine-arguments).
**Multi-GPU**
......@@ -324,7 +361,7 @@ To specify which GPU to start from pass `--base-gpu-id <num>`, for example on a
dynamo-run out=sglang <model> --tensor-parallel-size 4 --base-gpu-id 4
```
**Multi-node:**
**Multinode:**
Dynamo only manages the leader node (node rank 0). The follower nodes are started in the [normal sglang way](https://docs.sglang.ai/references/deepseek.html#running-examples-on-multi-node).
......@@ -342,7 +379,7 @@ python3 -m sglang.launch_server --model-path /data/models/DeepSeek-R1-Distill-Ll
- Parameters `--tensor-parallel-size` and `--tp` must match and be the total number of GPUs across the cluster.
- `--node-rank` must be unique consecutive integers starting at 1. The leader, managed by Dynamo, is 0.
### vllm
#### vllm
Using the [vllm](https://github.com/vllm-project/vllm) Python library. Slow startup, fast inference. Supports both safetensors from HF and GGUF files, but is very slow for GGUF - prefer llamacpp.
......@@ -358,7 +395,7 @@ uv pip install pip
uv pip install vllm==0.8.4 setuptools
```
**Note: If you're on Ubuntu 22.04 or earlier, you will need to add `--python=python3.10` to your `uv venv` command**
**Note: If you're on Ubuntu 22.04 or earlier, you must add `--python=python3.10` to your `uv venv` command**
2. Build:
```
......@@ -375,7 +412,7 @@ Inside that virtualenv:
```
To pass extra arguments to the vllm engine see [Extra engine arguments](#extra_engine_arguments) below.
To pass extra arguments to the vllm engine see [Extra engine arguments](#extra-engine-arguments) below.
**Multi-GPU**
......@@ -383,7 +420,7 @@ Pass `--tensor-parallel-size <NUM-GPUS>` to `dynamo-run`.
To specify which GPUs to use set environment variable `CUDA_VISIBLE_DEVICES`.
**Multi-node:**
**Multinode:**
vllm uses [ray](https://docs.vllm.ai/en/latest/serving/distributed_serving.html#running-vllm-on-multiple-nodes) for pipeline parallel inference. Dynamo does not change or manage that.
......@@ -400,16 +437,16 @@ Shutdown: `ray stop`
#### TensorRT-LLM engine
To run a TRT-LLM model with dynamo-run we have included a python based [async engine] (/examples/tensorrt_llm/engines/agg_engine.py).
To configure the TensorRT-LLM async engine please see [llm_api_config.yaml](/examples/tensorrt_llm/configs/llm_api_config.yaml). The file defines the options that need to be passed to the LLM engine. Follow the steps below to serve trtllm on dynamo run.
To run a TRT-LLM model with dynamo-run we have included a Python-based [async engine](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/engines/agg_engine.py).
To configure the TensorRT-LLM async engine please see [llm_api_config.yaml](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/configs/llm_api_config.yaml). The file defines the options that need to be passed to the LLM engine. Follow the steps below to serve trtllm on dynamo run.
##### Step 1: Build the environment
See instructions [here](/examples/tensorrt_llm/README.md#build-docker) to build the dynamo container with TensorRT-LLM.
See instructions [here](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/README.md#build-docker) to build the dynamo container with TensorRT-LLM.
##### Step 2: Run the environment
See instructions [here](/examples/tensorrt_llm/README.md#run-container) to run the built environment.
See instructions [here](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/README.md#run-container) to run the built environment.
##### Step 3: Execute `dynamo run` command
......@@ -418,13 +455,13 @@ Execute the following to load the TensorRT-LLM model specified in the configurat
dynamo run out=pystr:/workspace/examples/tensorrt_llm/engines/trtllm_engine.py -- --engine_args /workspace/examples/tensorrt_llm/configs/llm_api_config.yaml
```
### Echo Engines
#### Echo Engines
Dynamo includes two echo engines for testing and debugging purposes:
#### echo_core
##### echo_core
The `echo_core` engine accepts pre-processed requests and echoes the tokens back as the response. This is useful for testing pre-processing functionality as the response will include the full prompt template.
The `echo_core` engine accepts pre-processed requests and echoes the tokens back as the response. This is useful for testing pre-processing functionality as the response includes the full prompt template.
```
dynamo-run in=http out=echo_core --model-path <hf-repo-checkout>
......@@ -437,7 +474,7 @@ curl -N -d '{"nvext": {"ignore_eos": true}, "stream": true, "model": "Qwen2.5-3B
The default `in=text` sets that for you.
#### echo_full
##### echo_full
The `echo_full` engine accepts un-processed requests and echoes the prompt back as the response.
......@@ -445,7 +482,7 @@ The `echo_full` engine accepts un-processed requests and echoes the prompt back
dynamo-run in=http out=echo_full --model-name my_model
```
#### Configuration
##### Configuration
Both echo engines use a configurable delay between tokens to simulate generation speed. You can adjust this using the `DYN_TOKEN_ECHO_DELAY_MS` environment variable:
......@@ -456,7 +493,7 @@ DYN_TOKEN_ECHO_DELAY_MS=1 dynamo-run in=http out=echo_full
The default delay is 10ms, which produces approximately 100 tokens per second.
### Batch mode
#### Batch mode
`dynamo-run` can take a jsonl file full of prompts and evaluate them all:
......@@ -477,7 +514,20 @@ The output looks like this:
{"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855}
```
### Write your own engine in Python
### Extra engine arguments
The vllm and sglang backends support passing any argument the engine accepts.
Put the arguments in a JSON file:
```
{
"dtype": "half",
"trust_remote_code": true
}
```
Pass it like this:
```
dynamo-run out=sglang ~/llms/Llama-3.2-3B-Instruct --extra-engine-args sglang_extra.json
```
### Writing your own engine in Python
Note: This section replaces "bring-your-own-engine".
......@@ -531,7 +581,7 @@ if __name__ == "__main__":
The `model_path` can be:
- A HuggingFace repo ID, optionally prefixed with `hf://`. It will be downloaded and cached locally.
- A HuggingFace repo ID, optionally prefixed with `hf://`. It is downloaded and cached locally.
- The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`.
- The path to a GGUF file, if your engine supports that.
......@@ -552,24 +602,3 @@ More fully-featured Backend engines (used by `dynamo-run`):
- [vllm](https://github.com/ai-dynamo/dynamo/blob/main/launch/dynamo-run/src/subprocess/vllm_inc.py)
- [sglang](https://github.com/ai-dynamo/dynamo/blob/main/launch/dynamo-run/src/subprocess/sglang_inc.py)
### Defaults
The input defaults to `in=text`. The output will default to `out=mistralrs` engine, unless it is disabled with `--no-default-features` in which case vllm is used.
### Extra engine arguments
The vllm and sglang backends support passing any argument the engine accepts.
Put the arguments in a JSON file:
```
{
"dtype": "half",
"trust_remote_code": true
}
```
Pass it like this:
```
dynamo-run out=sglang ~/llms/Llama-3.2-3B-Instruct --extra-engine-args sglang_extra.json
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment