# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# `component` contains configuration options related to the Distributed Neural Models Component.
component:
  # `component.name` is the name of the Distributed Neural Models (dynamo) Component in the distributed deployment.
  name: # (required)
  # `component.namespace` is the Distributed Neural Models namespace in which the Distributed Neural Models Component will be deployed.
  namespace: # (default: "default")

# `image` contains configuration options related to the Distributed Neural Models Component container image.
image: # (required)
  # `image.pullSecrets` is an optional list of pull secrets to be used when downloading the Distributed Neural Models Component container image.
  pullSecrets: [] # (optional)
  # - name: pull-secret-name
  # `image.name` is the name of the container image containing the version of Distributed Neural Models Component container image to be used.
  name: # (required)

# `distributed` contains configuration options related to organization of Distributed Neural Models workflows.
distributed: # (required)
  # `distributed.requestPlane` contains configuration options related to connecting the Distributed Neural Models Component to its Distributed Neural Models Request Plane.
  requestPlane:
    # `distributed.requestPlane.etcdUrl` is the URL of the etcd server used by the Distributed Neural Models Request Plane.
    etcdUrl: # (required)
    # `distributed.requestPlane.natsUrl` is the URL of the NATS server used by the Distributed Neural Models Request Plane.
    natsUrl: # (required)
    # `distributed.requestPlane.timeout` is the maximum time in seconds the Distributed Neural Models Component will wait for a response from the Distributed Neural Models Request Plane.
    timeout: # (default: 60)
  # `distributed.workerCount` is the number of worker instances to be deployed as part of the Distributed Neural Models Component.
  workerCount: # (default: 1)

# `model` contains configuration options related to the model(s) loaded by the Distributed Neural Models Component.
model:
  # `model.instance` contains optional configuration options related to the number of Distributed Neural Models Component pods that are deployed.
  instance:
    # `model.instance.count` is the number of worker instances (whole model) to be deployed as part of this helm chart.
    count: # (default: 1)
    # `model.instance.parallelism` contains optional configuration options related to how work for a single model is spread across multiple pods.
    # When the product of `pipeline`*`tensor` is greater than 1, multiple pods will be deployed as a single logical worker.
    parallelism:
      # `model.instance.parallelism.pipeline` specifies the level of pipeline parallelism used by the model hosted by the Distributed Neural Models Component.
      # Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a subset of layers that is executed on a separate device.
      pipeline: # (default: 1)
      # `model.instance.parallelism.tensor` specifies the level of tensor parallelism used by the model hosted by the Distributed Neural Models Component.
      # Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller, independent blocks of computation that can be executed on different devices.
      tensor: # (default: 1)
  # `model.repository` contains configuration options related to the model repository used by the Distributed Neural Models Component to load model(s).
  repository: # (optional)
    # `model.repository.path` is a local file-system path within the container to the model repository.
    # When `persistentVolumeClaim` is specified, this is the path to which the PVC will be mounted.
    path: # (default: /var/run/models)
    # `model.repository.volumeMounts` are persistent volumes (PV) to be mounted with the Distributed Neural Models Component container.
    volumeMounts: [] # (optional)
    #   # `model.repository.volumeMounts.name` is the name to associate the volume mount with. Volume mount names must be unique and cannot contain spaces or special characters.
    # - name: # (required)
    #   # `model.repository.volumeMounts.path` is the file-system path relative to the model repository's root path to which the volume will be mounted.
    #   # When not provided, the volume is mounted to the root of the repository.
    #   # Overlapping mount paths can cause errors during container deployment.
    #   path: # (optional)
    #   # `model.repository.volumeMounts.persistentVolumeClaim` is the name of the persistent volume claim (PVC) used to mount a folder containing the model(s) the Distributed Neural Models Component will load.
    #   persistentVolumeClaim: # (required)

# `ports` contains configuration options related to the container ports exposed by the Distributed Neural Models Component.
ports: # (optional)
  # `ports.health` is the container port exposed to enable Distributed Neural Models Component Kubernetes health reporting.
  health: # (default: 8000)
  # `ports.metrics` is the container port exposed to enable Distributed Neural Models Component metrics reporting.
  metrics: # (default: 9347)
  # `ports.request` is the container port exposed to enable Distributed Neural Models Component request-plane operations.
  request: # (default: 9345)

# `resources` contains configuration options related to the resources assigned to the Distributed Neural Models Component and loaded model(s).
resources: # (optional)
  # `resources.cpu` is the number of logical CPU cores required by the Distributed Neural Models Component and loaded model(s).
  cpu: # (default: 4)
  # `resources.ephemeral` is the ephemeral storage (aka local disk usage) allowance.
  # Ephemeral storage MUST include any shared memory allocated to the Distributed Neural Models Component.
  # Value must be provided in Kubernetes' unit notation.
  ephemeral: # (default: 1Gi)
  # `resources.gpu` contains configuration options related to GPU resources to be assigned to the Distributed Neural Models Component and loaded model(s).
  gpu: # (optional)
    # `resources.gpu.count` specifies the number of GPUs required by the Distributed Neural Models Component and loaded model(s).
    count: # (default: 1)
    # `resources.gpu.product` defines a list of the supported GPUs to which Distributed Neural Models Component instance(s) can be deployed.
    # Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label provided by the NVIDIA GPU Discovery Service.
    # Run 'kubectl get nodes' to find node names.
    # Run 'kubectl describe node <node_name>' to inspect a node's labels.
    product: [] # (optional)
  # `resources.memory` specifies the amount of CPU visible (aka host) memory available to the Distributed Neural Models Component and loaded model(s).
  # This value must include any shared memory allocated (via `resources.sharedMemory`) to the Distributed Neural Models Component.
  # Value must be provided in Kubernetes' unit notation.
  memory: # (default: 16Gi)
  # `resources.sharedMemory` specifies the amount of shared CPU visible (aka host) memory available to the Distributed Neural Models Component and loaded model(s).
  # Value must be provided in Kubernetes' unit notation.
  sharedMemory: # (default: 512Mi)

# `kubernetes` contains configuration options related to the Kubernetes objects created by the chart.
kubernetes: # (optional)
  # `kubernetes.annotations` is an optional set of annotations to be applied to created Kubernetes objects.
  # NOTE(review): annotations are expressed as a list here; Kubernetes metadata annotations are a map — confirm the chart template expects a list.
  annotations: [] # (optional)
  # `kubernetes.checks` are optional configuration options controlling how the cluster monitors the health of Distributed Neural Models Component deployment(s).
  checks:
    # `kubernetes.checks.liveness` are configuration options related to how the cluster determines that a Distributed Neural Models Component instance is "alive" and responsive.
    liveness:
      # `kubernetes.checks.liveness.enabled` when `true`, instructs the cluster to actively determine if the pod is alive; otherwise the cluster will always assume the pod is alive.
      enabled: # (default: true)
      # `kubernetes.checks.liveness.failureThreshold` is the number of failed responses required to determine a pod is not responsive (aka "alive").
      failureThreshold: # (default: 15)
      # `kubernetes.checks.liveness.initialDelaySeconds` is the minimum wait before the cluster begins to attempt to determine the health of the pod.
      initialDelaySeconds: # (default: 10)
      # `kubernetes.checks.liveness.periodSeconds` is the minimum period between attempts to determine the health of the pod.
      periodSeconds: # (default: 2)
      # `kubernetes.checks.liveness.successThreshold` is the number of successful responses required to determine that a pod is healthy.
      successThreshold: # (default: 1)
    # `kubernetes.checks.readiness` contains configuration options related to how the cluster determines that a Distributed Neural Models Component instance is ready.
    readiness:
      # `kubernetes.checks.readiness.enabled` when `true`, instructs the cluster to actively determine if the pod is ready; otherwise the cluster will always assume the pod is ready.
      enabled: # (default: true)
      # `kubernetes.checks.readiness.failureThreshold` is the number of failed responses required to determine a pod is not responsive (aka "ready").
      failureThreshold: # (default: 15)
      # `kubernetes.checks.readiness.initialDelaySeconds` is the minimum wait before the cluster begins to attempt to determine the readiness of the pod.
      initialDelaySeconds: # (default: 10)
      # `kubernetes.checks.readiness.periodSeconds` is the minimum period between attempts to determine the readiness of the pod.
      periodSeconds: # (default: 2)
      # `kubernetes.checks.readiness.successThreshold` is the number of successful responses required to determine that a pod is ready.
      successThreshold: # (default: 1)
  # `kubernetes.labels` is an optional set of labels to be applied to created Kubernetes objects.
  # These labels can be used for association with a preexisting service object.
  # NOTE(review): labels are expressed as a list here; Kubernetes metadata labels are a map — confirm the chart template expects a list.
  labels: [] # (optional)
  # `kubernetes.partOf` is an optional value to be used with the `app.kubernetes.io/part-of` label on created Kubernetes objects.
  partOf: # (default: nova-distributed)
  # `kubernetes.terminationGracePeriod` is the duration in seconds the cluster will wait for a Distributed Neural Models Component instance to gracefully terminate.
  terminationGracePeriod: # (default: 30)
  # `kubernetes.tolerations` are tolerations applied to every pod deployed as part of this deployment.
  # Template already includes `nvidia.com/gpu=present:NoSchedule` when `resources.gpu` is specified.
  tolerations: [] # (optional)