values.yaml 11.1 KB
Newer Older
J Wyman's avatar
J Wyman committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# `component` contains configuration options related to the Distributed Neural Models Component.
# Keys left empty in this file parse as YAML null; the trailing comment on each key states whether it is required or what its documented default is.
component:
  # `component.name` is the name of the Distributed Neural Models (dynemo) Component in the distributed deployment.
  # No default is documented for this key, so a value must be supplied.
  name: # (required)
  # `component.namespace` is the Distributed Neural Models namespace in which the Distributed Neural Models Component will be deployed.
  namespace: # (default: "default")

# `image` contains configuration options related to the Distributed Neural Models Component container image.
image: # (required)
  # `image.pullSecrets` is an optional list of pull secrets to be used when downloading the Distributed Neural Models Component container image.
  pullSecrets: [] # (optional)
  # - name: pull-secret-name
  # `image.name` is the name of the container image containing the version of the Distributed Neural Models Component to be used.
  # Declared as a live (empty) key, like the other required values in this file, so that it is part of the documented values structure.
  name: # (required)

# `distributed` contains configuration options related to the organization of Distributed Neural Models workflows.
distributed: # (required)
  # `distributed.requestPlane` contains configuration options related to connecting the Distributed Neural Models Component to its Distributed Neural Models Request Plane.
  requestPlane:
    # `distributed.requestPlane.etcdUrl` is the URL of the etcd server used by the Distributed Neural Models Request Plane.
    etcdUrl: # (required)
    # `distributed.requestPlane.natsUrl` is the URL of the NATS server used by the Distributed Neural Models Request Plane.
    natsUrl: # (required)
    # `distributed.requestPlane.timeout` is the maximum time, in seconds, the Distributed Neural Models Component will wait for a response from the Distributed Neural Models Request Plane.
    timeout: # (default 60)
  # `distributed.workerCount` is the number of worker instances to be deployed as part of the Distributed Neural Models Component.
  workerCount: # (default 1)

# `model` contains configuration options related to the model(s) loaded by the Distributed Neural Models Component.
model:
  # `model.instance` contains optional configuration options related to the number of Distributed Neural Models Component pods that are deployed.
  instance:
    # `model.instance.count` is the number of worker instances (whole model) to be deployed as part of this helm chart.
    # NOTE(review): `distributed.workerCount` is documented with a very similar meaning; confirm which key the templates actually read.
    count: # (default 1)
    # `model.instance.parallelism` contains optional configuration options related to how work for a single model is spread across multiple pods.
    # When the product of `pipeline`*`tensor` is greater than 1, multiple pods will be deployed as a single logical worker.
    parallelism:
      # `model.instance.parallelism.pipeline` specifies the level of pipeline parallelism used by the model hosted by the Distributed Neural Models Component.
      # Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a subset of layers that is executed on a separate device.
      pipeline: # (default 1)
      # `model.instance.parallelism.tensor` specifies the level of tensor parallelism used by the model hosted by the Distributed Neural Models Component.
      # Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller, independent blocks of computation that can be executed on different devices.
      tensor: # (default 1)
  # `model.repository` contains configuration options related to the model repository used by the Distributed Neural Models Component to load model(s).
  repository: # (optional)
    # `model.repository.path` is a local file-system path within the container to the model repository.
    # When a `volumeMounts` entry specifies `persistentVolumeClaim`, this is the root path under which the PVC will be mounted.
    path: # (default: /var/run/models)
    # `model.repository.volumeMounts` are persistent volumes (PV) to be mounted with the Distributed Neural Models Component container.
    # The commented-out entry below shows the documented fields of a single list item.
    volumeMounts: [] # (optional)
    #   # `model.repository.volumeMounts.name` is the name to associate the volume mount with. Volume mount names must be unique and cannot contain spaces or special characters.
    # - name: # (required)
    #   # `model.repository.volumeMounts.path` is the file-system path, relative to the model repository's root path, to which the volume will be mounted.
    #   # When not provided, the volume is mounted to the root of the repository.
    #   # Overlapping mount paths can cause errors during container deployment.
    #   path: # (optional)
    #   # `model.repository.volumeMounts.persistentVolumeClaim` is the name of the persistent volume claim (PVC) used to mount a folder containing the model(s) the Distributed Neural Models Component will load.
    #   persistentVolumeClaim: # (required)

# `ports` contains configuration options for the container ports exposed by the Distributed Neural Models Component.
ports: # (optional)
  # `ports.health` is the container port exposed to enable Distributed Neural Models Component Kubernetes health reporting.
  health: # (default 8000)
  # `ports.metrics` is the container port exposed to enable Distributed Neural Models Component metrics reporting.
  metrics: # (default 9347)
  # `ports.request` is the container port exposed to enable Distributed Neural Models Component request-plane operations.
  request: # (default 9345)

# `resources` contains configuration options related to the resources assigned to the Distributed Neural Models Component and loaded model(s).
resources: # (optional)
  # `resources.cpu` is the number of logical CPU cores required by the Distributed Neural Models Component and loaded model(s).
  cpu: # (default: 4)
  # `resources.ephemeral` is the ephemeral storage (aka local disk usage) allowance.
  # Ephemeral storage MUST include any shared memory allocated to the Distributed Neural Models Component.
  # Value must be provided in Kubernetes' unit notation.
  ephemeral: # (default: 1Gi)
  # `resources.gpu` contains configuration options related to the GPU resources to be assigned to the Distributed Neural Models Component and loaded model(s).
  gpu: # (optional)
    # `resources.gpu.count` specifies the number of GPUs required by the Distributed Neural Models Component and loaded model(s).
    count: # (default: 1)
    # `resources.gpu.product` defines the list of supported GPUs to which Distributed Neural Models Component instance(s) can be deployed.
    # Each value must match the node's `.metadata.labels.nvidia.com/gpu.product` label provided by NVIDIA GPU Feature Discovery.
    # Run 'kubectl get nodes' to find node names.
    # Run 'kubectl describe node <node_name>' to inspect a node's labels.
    product: [] # (optional)
  # `resources.memory` specifies the amount of CPU visible (aka host) memory available to the Distributed Neural Models Component and loaded model(s).
  # This value must include any shared memory allocated (via `resources.sharedMemory`) to the Distributed Neural Models Component.
  # Value must be provided in Kubernetes' unit notation.
  memory: # (default: 16Gi)
  # `resources.sharedMemory` specifies the amount of shared CPU visible (aka host) memory available to the Distributed Neural Models Component and loaded model(s).
  # Value must be provided in Kubernetes' unit notation.
  sharedMemory: # (default: 512Mi)

# `kubernetes` contains configuration options related to the Kubernetes objects created by the chart.
kubernetes: # (optional)
  # `kubernetes.annotations` is an optional set of annotations to be applied to created Kubernetes objects.
  # Annotations are key/value pairs, so the empty default is a mapping (`{}`) rather than a list.
  annotations: {} # (optional)
  # `kubernetes.checks` are optional configuration options controlling how the cluster monitors the health of Distributed Neural Models Component deployment(s).
  checks:
    # `kubernetes.checks.liveness` contains configuration options related to how the cluster determines that a Distributed Neural Models Component instance is "alive" and responsive.
    liveness:
      # `kubernetes.checks.liveness.enabled` when `true`, instructs the cluster to actively determine if the pod is alive; otherwise the cluster will always assume the pod is alive.
      enabled: # (default true)
      # `kubernetes.checks.liveness.failureThreshold` is the number of failed responses required to determine that a pod is no longer responsive (aka not "alive").
      failureThreshold: # (default 15)
      # `kubernetes.checks.liveness.initialDelaySeconds` is the minimum wait before the cluster begins to attempt to determine the health of the pod.
      initialDelaySeconds: # (default 10)
      # `kubernetes.checks.liveness.periodSeconds` is the minimum period between attempts to determine the health of the pod.
      periodSeconds: # (default 2)
      # `kubernetes.checks.liveness.successThreshold` is the number of successful responses required to determine that a pod is healthy.
      successThreshold: # (default 1)
    # `kubernetes.checks.readiness` contains configuration options related to how the cluster determines that a Distributed Neural Models Component instance is ready.
    readiness:
      # `kubernetes.checks.readiness.enabled` when `true`, instructs the cluster to actively determine if the pod is ready; otherwise the cluster will always assume the pod is ready.
      enabled: # (default true)
      # `kubernetes.checks.readiness.failureThreshold` is the number of failed responses required to determine that a pod is not "ready".
      failureThreshold: # (default 15)
      # `kubernetes.checks.readiness.initialDelaySeconds` is the minimum wait before the cluster begins to attempt to determine the readiness of the pod.
      initialDelaySeconds: # (default 10)
      # `kubernetes.checks.readiness.periodSeconds` is the minimum period between attempts to determine the readiness of the pod.
      periodSeconds: # (default 2)
      # `kubernetes.checks.readiness.successThreshold` is the number of successful responses required to determine that a pod is ready.
      successThreshold: # (default 1)
  # `kubernetes.labels` is an optional set of labels to be applied to created Kubernetes objects.
  # These labels can be used for association with a preexisting service object.
  # Labels are key/value pairs, so the empty default is a mapping (`{}`) rather than a list.
  labels: {} # (optional)
  # `kubernetes.partOf` is an optional value to be used with the `app.kubernetes.io/part-of` label on created Kubernetes objects.
  partOf: # (default: nova-distributed)
  # `kubernetes.terminationGracePeriod` is the duration in seconds the cluster will wait for a Distributed Neural Models Component instance to gracefully terminate.
  terminationGracePeriod: # (default 30)
  # `kubernetes.tolerations` are tolerations applied to every pod deployed as part of this deployment.
  # Template already includes `nvidia.com/gpu=present:NoSchedule` when `resources.gpu` is specified.
  tolerations: [] # (optional)