# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Used to generate top-level secrets (overridden by custom-values.yaml)

# Subcharts configuration

# Dynamo operator configuration
dynamo-operator:
  # -- Whether to enable the Dynamo Kubernetes operator deployment
  enabled: true

  # -- NATS server address for operator communication (leave empty to use the bundled NATS chart). Format: "nats://hostname:port"
  natsAddr: ""

  # -- etcd server address for operator state storage (leave empty to use the bundled etcd chart). Format: "http://hostname:port" or "https://hostname:port"
  etcdAddr: ""

  # -- URL for the Model Express server if not deployed by this helm chart. This is ignored if Model Express server is installed by this helm chart (global.model-express.enabled is true).
  modelExpressURL: ""

  # -- Namespace access controls for the operator
  namespaceRestriction:
    # -- Whether to restrict operator to specific namespaces
    enabled: true
    # -- Target namespace for operator deployment (leave empty for current namespace)
    targetNamespace:

  # Controller manager configuration
  controllerManager:
    # -- Node tolerations for controller manager pods
    tolerations: []

    # Affinity is a mapping (nodeAffinity / podAffinity / podAntiAffinity) in the
    # Kubernetes pod spec, so the empty default is an empty map rather than an empty list.
    # -- Affinity for controller manager pods
    affinity: {}

    manager:
      # Container image configuration for the operator manager
      image:
        # -- Official NVIDIA Dynamo operator image repository
        repository: "nvcr.io/nvidia/ai-dynamo/kubernetes-operator"
        # -- Image tag (leave empty to use chart default)
        tag: ""
        # -- Image pull policy - when to pull the image
        pullPolicy: IfNotPresent

      # Command line arguments for the operator manager
      args:
        # -- Health probe endpoint for Kubernetes health checks
        - --health-probe-bind-address=:8081
        # -- Metrics endpoint for Prometheus scraping (localhost only for security)
        - --metrics-bind-address=127.0.0.1:8080

  # -- Secrets for pulling private container images
  imagePullSecrets: []

  # Core Dynamo platform configuration
  dynamo:
    # -- How long to wait before forcefully terminating Grove instances
    groveTerminationDelay: 15m

    # Internal utility images used by the platform
    internalImages:
      # -- Debugger image for troubleshooting deployments
      debugger: python:3.12-slim

    # -- Whether to enable restricted security contexts for enhanced security
    enableRestrictedSecurityContext: false

    # Docker registry configuration for private repositories
    dockerRegistry:
      # -- Whether to use Kubernetes secrets for registry authentication
      useKubernetesSecret: false
      # -- Docker registry server URL
      server:
      # -- Registry username
      username:
      # -- Registry password (consider using existingSecretName instead)
      password:
      # -- Name of existing Kubernetes secret containing registry credentials
      existingSecretName:
      # -- Whether the registry uses HTTPS
      secure: true

    # Ingress configuration for external access
    ingress:
      # -- Whether to create ingress resources
      enabled: false
      # -- Ingress class name (e.g., "nginx", "traefik")
      className:
      # -- Secret name containing TLS certificates
      tlsSecretName: my-tls-secret

    # Istio service mesh configuration
    istio:
      # -- Whether to enable Istio integration
      enabled: false
      # -- Istio gateway name for routing
      gateway:

    # -- Host suffix for generated ingress hostnames
    ingressHostSuffix: ""

    # -- Whether VirtualServices should support HTTPS routing
    virtualServiceSupportsHTTPS: false

    # Metrics configuration
    metrics:
      # -- Endpoint that services can use to retrieve metrics. If set, dynamo operator will automatically inject the PROMETHEUS_ENDPOINT environment variable into services it manages. Users can override the value of the PROMETHEUS_ENDPOINT environment variable by modifying the corresponding deployment's environment variables
      prometheusEndpoint: ""

    # MPI Run configuration
    mpiRun:
      # -- Name of the secret containing the SSH key for MPI Run
      secretName: "mpi-run-ssh-secret"
      # SSH key generation configuration
      sshKeygen:
        # -- Whether to enable SSH key generation for MPI Run
        enabled: true

# Grove component - distributed inference orchestration
grove:
  # -- Whether to enable Grove for multi-node inference coordination. If enabled, the Grove operator will be deployed cluster-wide.
  enabled: false

# Kai Scheduler component - advanced workload scheduling
kai-scheduler:
  # -- Whether to enable Kai Scheduler for intelligent resource allocation. If enabled, the Kai Scheduler operator will be deployed cluster-wide.
  enabled: false

# etcd configuration - distributed key-value store for operator state
# For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd
etcd:
  # -- Whether to enable etcd deployment, disable if you want to use an external etcd instance
  enabled: true

  image:
    # Background: Bitnami catalog brownout announcement,
    # https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog
    # -- etcd image repository. The legacy Bitnami repository is used until we migrate to the new "secure" repository
    repository: bitnamilegacy/etcd
    tag: 3.5.18-debian-12-r5

  # Persistent storage configuration for etcd data
  persistence:
    # Whether to enable persistent storage (recommended for production)
    enabled: true
    # Use the cluster default storage-class or override with a named class
    storageClass: null
    # Size of persistent volume for etcd data
    size: 1Gi

  # Pre-upgrade job configuration
  preUpgrade:
    # Whether to run pre-upgrade validation jobs
    enabled: false

  # Number of etcd replicas (1 for single-node, 3+ for HA)
  replicaCount: 1

  # Authentication and authorization settings
  # Explicitly remove authentication for simplified internal communication
  auth:
    rbac:
      # Whether to create RBAC authentication (disabled for internal use)
      create: false

  # Health check configuration
  readinessProbe:
    # Whether to enable readiness probes (disabled to reduce startup complexity)
    enabled: false

  livenessProbe:
    # Whether to enable liveness probes (disabled to reduce startup complexity)
    enabled: false

  # Node tolerations for etcd pods (allows scheduling on specific nodes)
  tolerations: []

# NATS configuration - messaging system for operator communication
# For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats
nats:
  # -- Whether to enable NATS deployment, disable if you want to use an external NATS instance
  enabled: true

  # TLS Certificate Authority configuration for secure communication
  # Reference a common CA Certificate or Bundle in all nats config `tls` blocks and nats-box contexts
  # Note: `tls.verify` still must be set in the appropriate nats config `tls` blocks to require mTLS
  tlsCA:
    # Whether to enable TLS CA configuration
    enabled: false

  # Core NATS server configuration
  config:
    # NATS clustering for high availability (multiple NATS servers)
    cluster:
      # Whether to enable NATS clustering (disabled for single-node setups)
      enabled: false

    # JetStream - persistent messaging and streaming capabilities
    jetstream:
      # Whether to enable JetStream (recommended for persistent messaging)
      enabled: true

      # File-based storage for JetStream streams and consumers
      fileStore:
        # Whether to enable file storage (persistent across restarts)
        enabled: true
        # Directory path for JetStream file storage
        dir: /data

        ############################################################
        # Persistent Volume Claim for JetStream file storage
        ############################################################
        pvc:
          # Whether to create a PVC for JetStream storage
          enabled: true
          # Size of the persistent volume for JetStream data
          size: 10Gi
          # Storage class name (leave empty for default)
          storageClassName:

          # Advanced PVC configuration (merge additional fields)
          # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#persistentvolumeclaim-v1-core
          merge: {}
          patch: []
          # PVC name (defaults to "{{ include "nats.fullname" $ }}-js")
          name:

        # Maximum size for JetStream file storage (defaults to PVC size)
        maxSize:

      # Memory-based storage for JetStream (non-persistent)
      memoryStore:
        # Whether to enable memory storage (faster but not persistent)
        enabled: false

      # Advanced JetStream configuration
      # For options see: https://docs.nats.io/running-a-nats-service/configuration#jetstream
      merge: {}
      patch: []

    # Core NATS server settings
    nats:
      # Port for NATS client connections
      port: 4222

      # TLS configuration for encrypted connections
      tls:
        # Whether to enable TLS encryption
        enabled: false
        # Advanced TLS configuration
        # For options see: https://docs.nats.io/running-a-nats-service/configuration/securing_nats/tls
        merge: {}
        patch: []

    # Leaf nodes for creating NATS topologies and remote connections
    leafnodes:
      # Whether to enable leaf node connections
      enabled: false

    # WebSocket support for browser-based NATS clients
    websocket:
      # Whether to enable WebSocket protocol support
      enabled: false

    # MQTT protocol bridge for IoT device connectivity
    mqtt:
      # Whether to enable MQTT protocol support
      enabled: false

    # Gateway connections for multi-cluster NATS deployments
    gateway:
      # Whether to enable gateway connections
      enabled: false

    # HTTP monitoring endpoint for NATS server metrics
    monitor:
      # Whether to enable HTTP monitoring interface
      enabled: true
      # Port for monitoring HTTP endpoint
      port: 8222

      # TLS configuration for monitoring endpoint
      tls:
        # Whether to enable HTTPS for monitoring (requires config.nats.tls enabled)
        # When enabled, monitoring port will use HTTPS with the options from config.nats.tls
        enabled: false

    # Go pprof profiling endpoint for performance debugging
    profiling:
      # Whether to enable profiling endpoint (for debugging only)
      enabled: false
      # Port for profiling endpoint
      port: 65432

    # Account resolver for multi-tenant NATS deployments
    resolver:
      # Whether to enable account resolution (for advanced multi-tenancy)
      enabled: false

    # Server naming configuration
    # Adds a prefix to the server name, which defaults to the pod name
    # Helpful for ensuring server name is unique in a super cluster
    serverNamePrefix: ""

    # Advanced NATS configuration merging and patching
    # For complete options see: https://docs.nats.io/running-a-nats-service/configuration
    # Special rules apply:
    #  1. strings that start with << and end with >> will be unquoted
    #     use this for variables and numbers with units
    #  2. keys ending in $include will be switched to include directives
    #     keys are sorted alphabetically, use prefix before $includes to control includes ordering
    #     paths should be relative to /etc/nats-config/nats.conf
    # Example:
    #   merge:
    #     $include: ./my-config.conf
    #     zzz$include: ./my-config-last.conf
    #     server_name: nats
    #     authorization:
    #       token: << $TOKEN >>
    #     jetstream:
    #       max_memory_store: << 1GB >>
    merge: {}
    patch: []

  ############################################################
  # NATS container configuration in StatefulSet
  ############################################################
  container:
    # NATS server container image configuration
    image:
      # Official NATS server repository
      repository: nats
      # NATS server version (Alpine-based for smaller size)
      tag: 2.10.21-alpine
      # Image pull policy (leave empty for chart default)
      pullPolicy:
      # Custom registry URL (leave empty for Docker Hub)
      registry:

    # Container port configuration
    # Note: Ports must also be enabled in the config section above
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#containerport-v1-core
    ports:
      # Main NATS client connection port
      nats: {}
      # Leaf node connection port
      leafnodes: {}
      # WebSocket connection port
      websocket: {}
      # MQTT protocol port
      mqtt: {}
      # Cluster communication port
      cluster: {}
      # Gateway connection port
      gateway: {}
      # HTTP monitoring port
      monitor: {}
      # Go profiling port
      profiling: {}

    # Environment variables for the NATS container
    # Map with key as env var name, value can be string or map
    # Example:
    #   env:
    #     GOMEMLIMIT: 7GiB
    #     TOKEN:
    #       valueFrom:
    #         secretKeyRef:
    #           name: nats-auth
    #           key: token
    env: {}

    # Advanced container configuration merging and patching
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
  # Configuration reloader container for hot config updates
  ############################################################
  reloader:
    # Whether to enable the config reloader sidecar container
    enabled: true

    # Config reloader container image
    image:
      # Official NATS config reloader repository
      repository: natsio/nats-server-config-reloader
      # Config reloader version
      tag: 0.16.0
      # Image pull policy (leave empty for chart default)
      pullPolicy:
      # Custom registry URL (leave empty for Docker Hub)
      registry:

    # Environment variables for the reloader container
    env: {}

    # Volume mount prefixes from NATS container to share with reloader
    # All NATS container volume mounts with these prefixes will be mounted into the reloader
    natsVolumeMountPrefixes:
      - /etc/

    # Advanced reloader container configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
  # Prometheus metrics exporter container (optional)
  # Note: config.monitor must be enabled for this to work
  ############################################################
  promExporter:
    # Whether to enable Prometheus metrics exporter sidecar
    enabled: false

  ############################################################
  # Kubernetes Service for NATS access
  ############################################################
  service:
    # Whether to create a Kubernetes Service for NATS
    enabled: true

    # Service port configuration
    # Additional boolean field 'enabled' controls whether port is exposed in the service
    # Note: Ports must also be enabled in the config section above
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#serviceport-v1-core
    ports:
      # Main NATS client connection port
      nats:
        enabled: true
      # Leaf node connection port
      leafnodes:
        enabled: true
      # WebSocket connection port
      websocket:
        enabled: true
      # MQTT protocol port
      mqtt:
        enabled: true
      # Cluster communication port (typically internal only)
      cluster:
        enabled: false
      # Gateway connection port (typically internal only)
      gateway:
        enabled: false
      # HTTP monitoring port (typically internal only)
      monitor:
        enabled: false
      # Go profiling port (typically internal only)
      profiling:
        enabled: false

    # Advanced service configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
    # Service name (defaults to "{{ include "nats.fullname" $ }}")
    name:

  ############################################################
  # Advanced NATS Kubernetes resource configuration
  ############################################################

  # StatefulSet configuration for NATS server persistence
  statefulSet:
    # Advanced StatefulSet configuration merging and patching
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#statefulset-v1-apps
    merge: {}
    patch: []
    # StatefulSet name (defaults to "{{ include "nats.fullname" $ }}")
    name:

  # Pod template configuration for NATS StatefulSet
  podTemplate:
    # Whether to add a hash of the ConfigMap as a pod annotation
    # This will cause the StatefulSet to roll when the ConfigMap is updated
    configChecksumAnnotation: true

    # Pod topology spread constraints for better distribution across nodes
    # Map of topologyKey: topologySpreadConstraint
    # labelSelector will be added automatically to match StatefulSet pods
    # Example:
    #   topologySpreadConstraints:
    #     kubernetes.io/hostname:
    #       maxSkew: 1
    topologySpreadConstraints: {}

    # Advanced pod template configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#pod-v1-core
    merge:
      spec:
        # Node tolerations for NATS pods (allows scheduling on specific nodes)
        tolerations: []
    patch: []

  # Headless service for StatefulSet pod discovery
  headlessService:
    # Advanced headless service configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
    # Headless service name (defaults to "{{ include "nats.fullname" $ }}-headless")
    name:

  # ConfigMap for NATS server configuration
  configMap:
    # Advanced ConfigMap configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#configmap-v1-core
    merge: {}
    patch: []
    # ConfigMap name (defaults to "{{ include "nats.fullname" $ }}-config")
    name:

  # Pod Disruption Budget for controlled rolling updates
  podDisruptionBudget:
    # Whether to create a PodDisruptionBudget (recommended for production)
    enabled: true

  # Service Account for NATS server pods
  serviceAccount:
    # Whether to create and use a dedicated service account
    enabled: false

  ############################################################
  # NATS Box - CLI tools and debugging container
  # NATS Box provides CLI tools for interacting with NATS server
  ############################################################
  natsBox:
    # Whether to deploy NATS Box for CLI access and debugging
    enabled: false

    ############################################################
    # NATS client contexts for authentication and connection
    ############################################################
    contexts:
      # Default context configuration
      default:
        # Credentials-based authentication
        creds:
          # Inline credentials file contents (base64 encoded)
          contents:
          # Name of existing secret containing credentials file
          secretName:
          # Directory to mount credentials (defaults to /etc/nats-creds/<context-name>)
          dir:
          # Key name in secret for credentials file
          key: nats.creds

        # NKey-based authentication (public/private key pairs)
        nkey:
          # Inline NKey file contents (base64 encoded)
          contents:
          # Name of existing secret containing NKey file
          secretName:
          # Directory to mount NKey (defaults to /etc/nats-nkeys/<context-name>)
          dir:
          # Key name in secret for NKey file
          key: nats.nk

        # TLS client certificate authentication
        tls:
          # Name of existing secret containing TLS client certificates
          secretName:
          # Directory to mount certificates (defaults to /etc/nats-certs/<context-name>)
          dir:
          # Certificate file name in secret
          cert: tls.crt
          # Private key file name in secret
          key: tls.key

        # Advanced context configuration
        # For options see: https://docs.nats.io/using-nats/nats-tools/nats_cli#nats-contexts
        merge: {}
        patch: []

    # Name of context to select by default for NATS CLI operations
    defaultContextName: default

    ############################################################
    # NATS Box container configuration
    ############################################################
    container:
      # NATS Box container image
      image:
        # Official NATS Box repository with CLI tools
        repository: natsio/nats-box
        # NATS Box version
        tag: 0.14.5
        # Image pull policy (leave empty for chart default)
        pullPolicy:
        # Custom registry URL (leave empty for Docker Hub)
        registry:

      # Environment variables for NATS Box container
      env: {}

      # Advanced container configuration
      # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
      merge: {}
      patch: []

    # Service Account for NATS Box deployment
    serviceAccount:
      # Whether to create and use a dedicated service account for NATS Box
      enabled: false

    # Pod template configuration for NATS Box deployment
    podTemplate:
      merge:
        spec:
          # Node tolerations for NATS Box pods
          tolerations: []
      patch: []