# values.yaml

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# `component` contains configuration options related to the Distributed Neural Models Component.
component:
  # `component.name` is the name of the Distributed Neural Models (dynemo) Component in the distributed deployment.
  # No default is provided; a value must be supplied when installing the chart.
  name: # (required)
  # `component.namespace` is the Distributed Neural Models namespace in which the Distributed Neural Models Component will be deployed.
  # NOTE(review): this is described as a Distributed Neural Models namespace; confirm whether it is distinct from the Kubernetes namespace.
  namespace: # (default: "default")

# `image` contains configuration options related to the Distributed Neural Models Component container image.
image: # (required)
  # `image.name` is the name of the container image containing the version of the Distributed Neural Models Component to be used.
  # No default is provided; a value must be supplied when installing the chart.
  name: # (required)
  # `image.pullSecrets` is an optional list of pull secrets to be used when downloading the Distributed Neural Models Component container image.
  # Each entry is a mapping with a `name` key, as shown in the commented example below.
  pullSecrets: [] # (optional)
  # - name: pull-secret-name

# `distributed` contains configuration options related to the organization of Distributed Neural Models workflows.
distributed: # (required)
  # `distributed.requestPlane` contains configuration options related to connecting the Distributed Neural Models Component to its Distributed Neural Models Request Plane.
  requestPlane:
    # `distributed.requestPlane.etcdUrl` is the URL of the etcd server used by the Distributed Neural Models Request Plane.
    etcdUrl: # (required)
    # `distributed.requestPlane.natsUrl` is the URL of the NATS server used by the Distributed Neural Models Request Plane.
    natsUrl: # (required)
    # `distributed.requestPlane.timeout` is the maximum time, in seconds, the Distributed Neural Models Component will wait for a response from the Distributed Neural Models Request Plane.
    timeout: # (default: 60)
  # `distributed.workerCount` is the number of worker instances to be deployed as part of the Distributed Neural Models Component.
  # NOTE(review): `model.instance.count` is also documented as a number of worker instances; confirm how the two values interact.
  workerCount: # (default: 1)

# `model` contains configuration options related to the model(s) loaded by the Distributed Neural Models Component.
model:
  # `model.instance` contains optional configuration options related to the number of Distributed Neural Models Component pods that are deployed.
  instance:
    # `model.instance.count` is the number of worker instances (whole model) to be deployed as part of this helm chart.
    count: # (default: 1)
    # `model.instance.parallelism` contains optional configuration options related to how work for a single model is spread across multiple pods.
    # When the product of `pipeline`*`tensor` is greater than 1, multiple pods will be deployed as a single logical worker.
    parallelism:
      # `model.instance.parallelism.pipeline` specifies the level of pipeline parallelism used by the model hosted by the Distributed Neural Models Component.
      # Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a subset of layers that is executed on a separate device.
      pipeline: # (default: 1)
      # `model.instance.parallelism.tensor` specifies the level of tensor parallelism used by the model hosted by the Distributed Neural Models Component.
      # Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller, independent blocks of computation that can be executed on different devices.
      tensor: # (default: 1)
  # `model.repository` contains configuration options related to the model repository used by the Distributed Neural Models Component to load model(s).
  repository: # (optional)
    # `model.repository.path` is a local file-system path within the container to the model repository.
    # When a `persistentVolumeClaim` is specified under `volumeMounts`, this is the root path beneath which the PVC will be mounted.
    path: # (default: /var/run/models)
    # `model.repository.volumeMounts` are persistent volumes (PV) to be mounted within the Distributed Neural Models Component container.
    volumeMounts: [] # (optional)
    #   # `model.repository.volumeMounts.name` is the name to associate the volume mount with. Volume mount names must be unique and cannot contain spaces or special characters.
    # - name: # (required)
    #   # `model.repository.volumeMounts.path` is the file-system path, relative to the model repository's root path, to which the volume will be mounted.
    #   # When not provided, the volume is mounted to the root of the repository.
    #   # Overlapping mount paths can cause errors during container deployment.
    #   path: # (optional)
    #   # `model.repository.volumeMounts.persistentVolumeClaim` is the name of the persistent volume claim (PVC) used to mount a folder containing the model(s) the Distributed Neural Models Component will load.
    #   persistentVolumeClaim: # (required)

# `ports` contains configuration options for the container ports exposed by the Distributed Neural Models Component.
ports: # (optional)
  # `ports.health` is the container port exposed to enable Distributed Neural Models Component Kubernetes health reporting.
  health: # (default: 8000)
  # `ports.metrics` is the container port exposed to enable Distributed Neural Models Component metrics reporting.
  metrics: # (default: 9347)
  # `ports.request` is the container port exposed to enable Distributed Neural Models Component request-plane operations.
  request: # (default: 9345)

# `resources` contains configuration options related to the resources assigned to the Distributed Neural Models Component and loaded model(s).
resources: # (optional)
  # `resources.cpu` is the number of logical CPU cores required by the Distributed Neural Models Component and loaded model(s).
  cpu: # (default: 4)
  # `resources.ephemeral` is the ephemeral storage (aka local disk usage) allowance.
  # Ephemeral storage MUST include any shared memory allocated to the Distributed Neural Models Component.
  # Value must be provided in Kubernetes' unit notation.
  ephemeral: # (default: 1Gi)
  # `resources.gpu` contains configuration options related to the GPU resources to be assigned to the Distributed Neural Models Component and loaded model(s).
  gpu: # (optional)
    # `resources.gpu.count` specifies the number of GPUs required by the Distributed Neural Models Component and loaded model(s).
    count: # (default: 1)
    # `resources.gpu.product` defines the list of supported GPUs to which Distributed Neural Models Component instance(s) can be deployed.
    # Each value must match the node's `.metadata.labels.nvidia.com/gpu.product` label provided by the NVIDIA GPU Discovery Service.
    # Run 'kubectl get nodes' to find node names.
    # Run 'kubectl describe node <node_name>' to inspect a node's labels.
    product: [] # (optional)
  # `resources.memory` specifies the amount of CPU-visible (aka host) memory available to the Distributed Neural Models Component and loaded model(s).
  # This value must include any shared memory allocated (via `resources.sharedMemory`) to the Distributed Neural Models Component.
  # Value must be provided in Kubernetes' unit notation.
  memory: # (default: 16Gi)
  # `resources.sharedMemory` specifies the amount of shared CPU-visible (aka host) memory available to the Distributed Neural Models Component and loaded model(s).
  # Value must be provided in Kubernetes' unit notation.
  sharedMemory: # (default: 512Mi)

# `kubernetes` contains configuration options related to the Kubernetes objects created by the chart.
kubernetes: # (optional)
  # `kubernetes.annotations` is an optional set of annotations to be applied to created Kubernetes objects.
  # NOTE(review): the default is an empty list, but Kubernetes annotations are key/value maps; confirm whether `{}` was intended.
  annotations: [] # (optional)
  # `kubernetes.checks` are optional configuration options controlling how the cluster monitors the health of Distributed Neural Models Component deployment(s).
  checks:
    # `kubernetes.checks.liveness` contains configuration options related to how the cluster determines that a Distributed Neural Models Component instance is "alive" and responsive.
    liveness:
      # `kubernetes.checks.liveness.enabled` when `true`, instructs the cluster to actively determine if the pod is alive; otherwise the cluster will always assume the pod is alive.
      enabled: # (default: true)
      # `kubernetes.checks.liveness.failureThreshold` is the number of failed responses required to determine that a pod is not responsive (aka not "alive").
      failureThreshold: # (default: 15)
      # `kubernetes.checks.liveness.initialDelaySeconds` is the minimum wait, in seconds, before the cluster begins to attempt to determine the health of the pod.
      initialDelaySeconds: # (default: 10)
      # `kubernetes.checks.liveness.periodSeconds` is the minimum period, in seconds, between attempts to determine the health of the pod.
      periodSeconds: # (default: 2)
      # `kubernetes.checks.liveness.successThreshold` is the number of successful responses required to determine that a pod is healthy.
      successThreshold: # (default: 1)
    # `kubernetes.checks.readiness` contains configuration options related to how the cluster determines that a Distributed Neural Models Component instance is ready.
    readiness:
      # `kubernetes.checks.readiness.enabled` when `true`, instructs the cluster to actively determine if the pod is ready; otherwise the cluster will always assume the pod is ready.
      enabled: # (default: true)
      # `kubernetes.checks.readiness.failureThreshold` is the number of failed responses required to determine that a pod is not "ready".
      failureThreshold: # (default: 15)
      # `kubernetes.checks.readiness.initialDelaySeconds` is the minimum wait, in seconds, before the cluster begins to attempt to determine the readiness of the pod.
      initialDelaySeconds: # (default: 10)
      # `kubernetes.checks.readiness.periodSeconds` is the minimum period, in seconds, between attempts to determine the readiness of the pod.
      periodSeconds: # (default: 2)
      # `kubernetes.checks.readiness.successThreshold` is the number of successful responses required to determine that a pod is ready.
      successThreshold: # (default: 1)
  # `kubernetes.labels` is an optional set of labels to be applied to created Kubernetes objects.
  # These labels can be used for association with a preexisting service object.
  # NOTE(review): the default is an empty list, but Kubernetes labels are key/value maps; confirm whether `{}` was intended.
  labels: [] # (optional)
  # `kubernetes.partOf` is an optional value to be used with the `app.kubernetes.io/part-of` label on created Kubernetes objects.
  partOf: # (default: nova-distributed)
  # `kubernetes.terminationGracePeriod` is the duration, in seconds, the cluster will wait for a Distributed Neural Models Component instance to gracefully terminate.
  terminationGracePeriod: # (default: 30)
  # `kubernetes.tolerations` are tolerations applied to every pod deployed as part of this deployment.
  # The template already includes `nvidia.com/gpu=present:NoSchedule` when `resources.gpu` is specified.
  tolerations: [] # (optional)