# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Values template for deploying a Distributed Neural Models Component.
# Keys left empty are null; the value noted as `(default ...)` next to each key is the
# documented fallback. Keys marked `(required)` have no documented default.

# `component` contains configuration options related to the Distributed Neural Models Component.
component:
  # `component.name` is the name of the Distributed Neural Models (dynamo) Component in the
  # distributed deployment.
  name:  # (required)
  # `component.namespace` is the Distributed Neural Models namespace in which the Distributed
  # Neural Models Component will be deployed.
  namespace:  # (default: "default")

# `image` contains configuration options related to the Distributed Neural Models Component
# container image.
image:  # (required)
  # `image.pullSecrets` is an optional list of pull secrets to be used when downloading the
  # Distributed Neural Models Component container image.
  pullSecrets: []  # (optional)
  # - name: pull-secret-name
  # `image.name` is the name of the container image containing the version of the Distributed
  # Neural Models Component to be used.
  # NOTE(review): `name` is commented out in this template even though it is marked required;
  # confirm whether it should be an active (empty) key.
  # name: # (required)

# `distributed` contains configuration options related to the organization of Distributed
# Neural Models workflows.
distributed:  # (required)
  # `distributed.requestPlane` contains configuration options related to connecting the
  # Distributed Neural Models Component to its Distributed Neural Models Request Plane.
  requestPlane:
    # `distributed.requestPlane.etcdUrl` is the URL of the etcd server used by the Distributed
    # Neural Models Request Plane.
    etcdUrl:  # (required)
    # `distributed.requestPlane.natsUrl` is the URL of the NATS server used by the Distributed
    # Neural Models Request Plane.
    natsUrl:  # (required)
    # `distributed.requestPlane.timeout` is the maximum time in seconds the Distributed Neural
    # Models Component will wait for a response from the Distributed Neural Models Request Plane.
    timeout:  # (default 60)
  # `distributed.workerCount` is the number of worker instances to be deployed as part of the
  # Distributed Neural Models Component.
  workerCount:  # (default 1)

# `model` contains configuration options related to the model(s) loaded by the Distributed
# Neural Models Component.
model:
  # `model.instance` contains optional configuration options related to how many Distributed
  # Neural Models Component pods are deployed.
  instance:
    # `model.instance.count` is the number of worker instances (whole model) to be deployed as
    # part of this helm chart.
    count:  # (default 1)
    # `model.instance.parallelism` contains optional configuration options related to how work
    # for a single model is spread across multiple pods.
    # When the product of `pipeline`*`tensor` is greater than 1, multiple pods will be deployed
    # as a single logical worker.
    parallelism:
      # `model.instance.parallelism.pipeline` specifies the level of pipeline parallelism used by
      # the model hosted by the Distributed Neural Models Component.
      # Pipeline parallelism involves sharding the model (vertically) into chunks, where each
      # chunk comprises a subset of layers that is executed on a separate device.
      pipeline:  # (default 1)
      # `model.instance.parallelism.tensor` specifies the level of tensor parallelism used by the
      # model hosted by the Distributed Neural Models Component.
      # Tensor parallelism involves sharding (horizontally) individual layers of the model into
      # smaller, independent blocks of computation that can be executed on different devices.
      tensor:  # (default 1)
  # `model.repository` contains configuration options related to the model repository used by
  # the Distributed Neural Models Component to load model(s).
  repository:  # (optional)
    # `model.repository.path` is a local file-system path within the container to the model
    # repository.
    # When `persistentVolumeClaim` is specified, this is the path to which the PVC will be mounted.
    path:  # (default: /var/run/models)
    # `model.repository.volumeMounts` are persistent volumes (PV) to be mounted with the
    # Distributed Neural Models Component container.
    volumeMounts: []  # (optional)
    # # `model.repository.volumeMounts.name` is the name to associate the volume mount with.
    # # Volume mount names must be unique and cannot contain spaces or special characters.
    # - name: # (required)
    #   # `model.repository.volumeMounts.path` is the file-system path, relative to the model
    #   # repository's root path, to which the volume will be mounted.
    #   # When not provided, the volume is mounted to the root of the repository.
    #   # Overlapping mount paths can cause errors during container deployment.
    #   path: # (optional)
    #   # `model.repository.volumeMounts.persistentVolumeClaim` is the name of the persistent
    #   # volume claim (PVC) used to mount a folder containing the model(s) the Distributed
    #   # Neural Models Component will load.
    #   persistentVolumeClaim: # (required)

# `ports` contains configuration options for the ports exposed by the Distributed Neural Models
# Component.
ports:  # (optional)
  # `ports.health` is the container port exposed to enable Distributed Neural Models Component
  # Kubernetes health reporting.
  health:  # (default 8000)
  # `ports.metrics` is the container port exposed to enable Distributed Neural Models Component
  # metrics reporting.
  metrics:  # (default 9347)
  # `ports.request` is the container port exposed to enable Distributed Neural Models Component
  # request-plane operations.
  request:  # (default 9345)

# `resources` contains configuration options related to the resources assigned to the
# Distributed Neural Models Component and loaded model(s).
resources:  # (optional)
  # `resources.cpu` is the number of logical CPU cores required by the Distributed Neural Models
  # Component and loaded model(s).
  cpu:  # (default: 4)
  # `resources.ephemeral` is the ephemeral storage (aka local disk usage) allowance.
  # Ephemeral storage MUST include any shared memory allocated to the Distributed Neural Models
  # Component.
  # Value must be provided in Kubernetes' unit notation.
  ephemeral:  # (default: 1Gi)
  # `resources.gpu` contains configuration options related to GPU resources to be assigned to
  # the Distributed Neural Models Component and loaded model(s).
  gpu:  # (optional)
    # `resources.gpu.count` specifies the number of GPUs required by the Distributed Neural
    # Models Component and loaded model(s).
    count:  # (default: 1)
    # `resources.gpu.product` defines the list of supported GPUs to which Distributed Neural
    # Models Component instance(s) can be deployed.
    # Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label provided by
    # the NVIDIA GPU Discovery Service.
    # Run 'kubectl get nodes' to find node names.
    # Run 'kubectl describe node <node-name>' to inspect a node's labels.
    product: []  # (optional)
  # `resources.memory` specifies the amount of CPU visible (aka host) memory available to the
  # Distributed Neural Models Component and loaded model(s).
  # This value must include any shared memory allocated (via `resources.sharedMemory`) to the
  # Distributed Neural Models Component.
  # Value must be provided in Kubernetes' unit notation.
  memory:  # (default: 16Gi)
  # `resources.sharedMemory` specifies the amount of shared CPU visible (aka host) memory
  # available to the Distributed Neural Models Component and loaded model(s).
  # Value must be provided in Kubernetes' unit notation.
  sharedMemory:  # (default: 512Mi)

# `kubernetes` contains configuration options related to the Kubernetes objects created by the
# chart.
kubernetes:  # (optional)
  # `kubernetes.annotations` is an optional set of annotations to be applied to created
  # Kubernetes objects.
  annotations: []  # (optional)
  # `kubernetes.checks` are optional configuration options controlling how the cluster monitors
  # the health of Distributed Neural Models Component deployment(s).
  checks:
    # `kubernetes.checks.liveness` are configuration options related to how the cluster
    # determines that a Distributed Neural Models Component instance is "alive" and responsive.
    liveness:
      # `kubernetes.checks.liveness.enabled` when `true`, instructs the cluster to actively
      # determine if the pod is alive; otherwise the cluster will always assume the pod is alive.
      enabled:  # (default true)
      # `kubernetes.checks.liveness.failureThreshold` is the number of failed responses required
      # to determine a pod is not responsive (i.e. not "alive").
      failureThreshold:  # (default 15)
      # `kubernetes.checks.liveness.initialDelaySeconds` is the minimum wait before the cluster
      # begins to attempt to determine the health of the pod.
      initialDelaySeconds:  # (default 10)
      # `kubernetes.checks.liveness.periodSeconds` is the minimum period between attempts to
      # determine the health of the pod.
      periodSeconds:  # (default 2)
      # `kubernetes.checks.liveness.successThreshold` is the number of successful responses
      # required to determine that a pod is healthy.
      successThreshold:  # (default 1)
    # `kubernetes.checks.readiness` contains configuration options related to how the cluster
    # determines that a Distributed Neural Models Component instance is ready.
    readiness:
      # `kubernetes.checks.readiness.enabled` when `true`, instructs the cluster to actively
      # determine if the pod is ready; otherwise the cluster will always assume the pod is ready.
      enabled:  # (default true)
      # `kubernetes.checks.readiness.failureThreshold` is the number of failed responses required
      # to determine a pod is not "ready".
      failureThreshold:  # (default 15)
      # `kubernetes.checks.readiness.initialDelaySeconds` is the minimum wait before the cluster
      # begins to attempt to determine the readiness of the pod.
      initialDelaySeconds:  # (default 10)
      # `kubernetes.checks.readiness.periodSeconds` is the minimum period between attempts to
      # determine the readiness of the pod.
      periodSeconds:  # (default 2)
      # `kubernetes.checks.readiness.successThreshold` is the number of successful responses
      # required to determine that a pod is ready.
      successThreshold:  # (default 1)
  # `kubernetes.labels` is an optional set of labels to be applied to created Kubernetes objects.
  # These labels can be used for association with a preexisting service object.
  labels: []  # (optional)
  # `kubernetes.partOf` is an optional value to be used with the `app.kubernetes.io/part-of`
  # label on created Kubernetes objects.
  partOf:  # (default: nova-distributed)
  # `kubernetes.terminationGracePeriod` is the duration in seconds the cluster will wait for a
  # Distributed Neural Models Component instance to gracefully terminate.
  terminationGracePeriod:  # (default 30)
  # `kubernetes.tolerations` are tolerations applied to every pod deployed as part of this
  # deployment.
  # Template already includes `nvidia.com/gpu=present:NoSchedule` when `resources.gpu` is
  # specified.
  tolerations: []  # (optional)