NVIDIA Dynamo is a high-throughput low-latency inference framework designed for serving generative AI and reasoning models in multi-node distributed environments. Dynamo is designed to be inference engine agnostic (supports TRT-LLM, vLLM, SGLang or others) and captures LLM-specific capabilities such as:
...
...
@@ -112,7 +112,7 @@ example.
First start the Dynamo Distributed Runtime services:
```bash
docker compose -f deploy/docker-compose.yml up -d
docker compose -f deploy/metrics/docker-compose.yml up -d
```
#### Start Dynamo LLM Serving Components
...
...
@@ -151,13 +151,13 @@ Otherwise, to develop locally, we recommend working inside of the container
{{-fail(printf"Value of property `.modelRepository.volumeMounts[*].path` '%s' is illegal because '%s' is not a sub-directory of '%s'." . (clean $mount_path) $model_repository_path)}}
{{- end}}
{{- end}}
-mountPath:{{$mount_path}}
name:{{$volume_name}}
{{- end}}
{{- end}}
{{- end}}
{{- end}}
-mountPath:/dev/shm
name:shared-memory
{{- with $.Values.image}}
{{- with .pullSecrets}}
{{- if len .}}
imagePullSecrets:
{{toYaml . | indent 6}}
{{- end}}
{{- end}}
{{- end}}
restartPolicy:Always
terminationGracePeriodSeconds:30
{{- if or (gt $resources_gpu 0) (gt $kube_tolerations_count 0)}}
tolerations:
{{- if gt $resources_gpu 0}}
-effect:NoSchedule
key:nvidia.com/gpu
operator:Exists
{{- end}}
{{- with $.Values.kubernetes}}
{{- with .tolerations}}
{{toYaml . | indent 6}}
{{- end}}
{{- end}}
{{- end}}
volumes:
{{- with $.Values.model}}
{{- with .repository}}
{{- with .volumeMounts}}
{{- range .}}
-name:{{required "Property `.modelRepository.volumeMounts.name` is required." .name}}
persistentVolumeClaim:
claimName:{{required "Property `.modelRepository.volumeMounts.persistentVolumeClaim` is required." .persistentVolumeClaim}}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# `component` contains configuration options related to the Distributed Neural Models Component.
component:
# `component.name` is the name of the Distributed Neural Models (dynamo) Component in the distributed deployment.
name:# (required)
# `component.namespace` is the Distributed Neural Models namespace in which the Distributed Neural Models Component will be deployed.
namespace:# (default: "default")
# `image` contains configuration options related to the Distributed Neural Models Component container image.
image:# (required)
# `image.pullSecrets` is an optional list of pull secrets to be used when downloading the Distributed Neural Models Component container image.
pullSecrets:[]# (optional)
# - name: pull-secret-name
# `image.name` is the name of the container image containing the version of Distributed Neural Models Component container image to be used.
# name: # (required)
# `distributed` contains configuration options related to organization of Distributed Neural Models workflows.
distributed:# (required)
# `distributed.requestPlane` contains configuration options related to connecting the Distributed Neural Models Component to its Distributed Neural Models Request Plane.
requestPlane:
# `distributed.requestPlane.etcdUrl` is the URL of the etcd server used by the Distributed Neural Models Request Plane.
etcdUrl:# (required)
# `distributed.requestPlane.natsUrl` is the URL of the NATS server used by the Distributed Neural Models Request Plane.
natsUrl:# (required)
# `distributed.requestPlane.timeout` is the maximum time in seconds the Distributed Neural Models Component will wait for a response from the Distributed Neural Models Request Plane.
timeout:# (default 60)
# `distributed.workerCount` is the number of worker instances to be deployed as part of the Distributed Neural Models Component.
workerCount:# (default 1)
# `model` contains configuration options related to the model(s) loaded by the Distributed Neural Models Component.
model:
# `model.instance` are optional configuration options related to the number of Distributed Neural Models Component pods are deployed.
instance:
# `instance.count` is the number of worker instances (whole model) to be deployed as part of this helm chart.
count:# (default 1)
# `model.instance.parallelism` contains optional configuration options related to how work for a single model is spread across multiple pods.
# When the product of `pipeline`*`tensor` is greater than 1, multiple pods will be deployed as a single logical worker.
parallelism:
# `model.instance.parallelism.pipeline` specifies the level of pipeline parallelism used by the model hosted by the Distributed Neural Models Component.
# Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a subset of layers that is executed on a separate device.
pipeline:# (default 1)
# `model.instance.parallelism.tensor` specifies the level of tensor parallelism used by the model hosted by the Distributed Neural Models Component.
# Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller, independent blocks of computation that can be executed on different devices.
tensor:# (default 1)
# `model.repository` contains configuration options related to the model repository used by the Distributed Neural Models Component to load model(s).
repository:# (optional)
# `model.repository.path` is a local file-system path within the container to the model repository.
# When `persistentVolumeClaim` is specified, this is the path to which the PVC will be mounted.
path:# (default: /var/run/models)
# `model.repository.volumeMounts` are persistent volumes (PV) to be mounted with the Distributed Neural Models Component container.
volumeMounts:[]# (optional)
# # `model.repository.volumeMounts.name` is the name to associate the volume mount with. Volume mount names must be unique and cannot contain spaces or special characters.
# - name: # (required)
# # `model.repository.volumeMounts.path` is the file-system path relative to model repository's root path to which the volume will be mounted to.
# # When not provided, the volume is mounted to the root of the repository.
# # Overlapping mount paths can cause errors during container deployment.
# path: # (optional)
# # `model.repository.volumeMounts.persistentVolumeclaim` is the name of the persistent volume claim (PVC) used to mount a folder containing the model(s) Triton will load.
# persistentVolumeClaim: # (required)
# `ports` contains configuration options for the management of the Distributed Neural Models Component exposed.
ports:# (optional)
# `ports.health` is the container port exposed to enable Distributed Neural Models Component Kubernetes health reporting.
health:# (default 8000)
# `ports.metrics` is the container port exposed to enable Distributed Neural Models Component metrics reporting.
metrics:# (default 9347)
# `ports.request` is the container port exposed to enable Distributed Neural Models Component request-plane operations.
request:# (default 9345)
# `resources` contains configuration options related to the resources assigned to Distributed Neural Models Component and loaded model(s).
resources:# (optional)
# `resources.cpu` is the number of logical CPU cores required by the Distributed Neural Models Component and loaded model(s).
cpu:# (default: 4)
# `resources.ephemeral` is the ephemeral storage (aka local disk usage) allowance.
# Ephemeral storage MUST include any shared memory allocated to Distributed Neural Models Component.
# Value must be provided in Kubernetes' unit notation.
ephemeral:# (default: 1Gi)
# `resources.gpu` contains configuration options related GPU resources to be assigned to the Distributed Neural Models Component and loaded model(s).
gpu:# (optional)
# `resources.gpu.count` specifies the number of GPUs required by the Distributed Neural Models Component and loaded model(s).
count:# (default: 1)
# `resources.gpu.product` defines list of the supported GPUs to which Distributed Neural Models Component instance(s) can be deployed.
# Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label provided by the NVIDIA GPU Discovery Service.
# Run 'kubectl get nodes' to find node names.
# Run 'kubectl describe node <node_name>' to inspect a node's labels.
product:[]# (optional)
# `resources.memory` specifies the amount of CPU visible (aka host) memory available to the Distributed Neural Models Component and loaded model(s).
# This value must include any shared memory allocated (via `resources.sharedMemory`) to Distributed Neural Models Component.
# Value must be provided in Kubernetes' unit notation.
memory:# (default: 16Gi)
# `resources.sharedMemory` specifies about amount of shared CPU visible (aka host) memory available the Distributed Neural Models Component and loaded model(s).
# Value must be provided in Kubernetes' unit notation.
sharedMemory:# (default: 512Mi)
# `kubernetes` contains configurations option related to the Kubernetes objects created by the chart.
kubernetes:# (optional)
# `kubernetes.annotations` is an optional set of annotations to be applied to create Kubernetes objects.
annotations:[]# (optional)
# `kubernetes.checks` are optional configuration options controlling how the cluster monitors the health of Distributed Neural Models Component deployment(s).
checks:
# `kubernetes.checks.liveness` are configuration options related to how the cluster determines that a Distributed Neural Models Component instance is "alive" and responsive.
liveness:
# `kubernetes.checks.liveness.enabled` when `true`, instructs the cluster will actively determine if the pod is alive; otherwise the cluster will always assume the pod is alive.
enabled:# (default true)
# `kubernetes.checks.liveness.failureThreshold` is the number of failed responses required to determine a pod is not responsive (aka "alive").
failureThreshold:# (default 15)
# `kubernetes.checks.liveness.initialDelaySeconds` is the minimum wait before the cluster beings to attempt to determine the health of the pod.
initialDelaySeconds:# (default 10)
# `kubernetes.checks.liveness.periodSeconds` is the minimum period between attempts to determine the health of the pod.
periodSeconds:# (default 2)
# `kubernetes.checks.liveness.successThreshold` is the number of successful responses required to determine that a pod is healthy.
successThreshold:# (default 1)
# `kubernetes.checks.readiness` contains configuration options related to how the cluster determines that a Distributed Neural Models Component instance is ready.
readiness:
# `kubernetes.checks.readiness.enabled` when `true`, instructs the cluster will actively determine if the pod is ready; otherwise the cluster will always assume the pod is ready.
enabled:# (default true)
# `kubernetes.checks.readiness.failureThreshold` is the number of failed responses required to determine a pod is not responsive (aka "ready").
failureThreshold:# (default 15)
# `kubernetes.checks.readiness.initialDelaySeconds` is the minimum wait before the cluster beings to attempt to determine the readiness of the pod.
initialDelaySeconds:# (default 10)
# `kubernetes.checks.readiness.periodSeconds` is the minimum period between attempts to determine the readiness of the pod.
periodSeconds:# (default 2)
# `kubernetes.checks.readiness.successThreshold` is the number of successful responses required to determine that a pod is ready.
successThreshold:# (default 1)
# `kubernetes.labels` is an optional set of labels to be applied to created Kubernetes objects.
# These labels can be used for association with a preexisting service object.
labels:[]# (optional)
# `kubernetes.partOf` is an optional value to be used with the `app.kubernetes.io/part-of` label on created Kubernetes objects.
partOf:# (default: nova-distributed)
# `kubernetes.terminationGracePeriod` is the duration in seconds the cluster will wait for a Distributed Neural Models Component instance to gracefully terminate.
terminationGracePeriod:# (default 30)
# `kubernetes.tolerations` are tolerations applied to every pod deployed as part of this deployment.
# Template already includes `nvidia.com/gpu=present:NoSchedule` when `resources.gpu` is specified.