# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Used to generate top-level secrets (overridden by custom-values.yaml)

# Subcharts configuration

# Dynamo operator configuration
dynamo-operator:
  # -- Whether to enable the Dynamo Kubernetes operator deployment
  enabled: true

  # -- NATS server address for operator communication (leave empty to use the bundled NATS chart). Format: "nats://hostname:port"
  natsAddr: ""

  # -- etcd server address for operator state storage (leave empty to use the bundled etcd chart). Format: "http://hostname:port" or "https://hostname:port"
  etcdAddr: ""

  # -- URL for the Model Express server if not deployed by this helm chart. This is ignored if Model Express server is installed by this helm chart (global.model-express.enabled is true).
  modelExpressURL: ""

  # -- Namespace access controls for the operator
  namespaceRestriction:
    # -- Whether to restrict operator to specific namespaces. By default, the operator will run with cluster-wide permissions. Only 1 instance of the operator should be deployed in the cluster. If you want to deploy multiple operator instances, you can set this to true and specify the target namespace (by default, the target namespace is the helm release namespace).
    enabled: false
    # -- Target namespace for operator deployment (leave empty for current namespace)
    targetNamespace:

  # Controller manager configuration
  controllerManager:
    # -- Node tolerations for controller manager pods
    tolerations: []

    # -- Affinity for controller manager pods (a Kubernetes Affinity object, hence a map rather than a list)
    affinity: {}

    # Leader election configuration for cluster-wide coordination
    leaderElection:
      # -- Leader election ID for cluster-wide coordination. WARNING: All cluster-wide operators must use the SAME ID to prevent split-brain. Different IDs would allow multiple leaders simultaneously.
      id: ""  # If empty, defaults to: dynamo.nvidia.com (shared across all cluster-wide operators)
      # -- Namespace for leader election leases (only used in cluster-wide mode). If empty, defaults to kube-system for cluster-wide coordination. All cluster-wide operators should use the SAME namespace for proper leader election.
      namespace: ""

    manager:
      # Container image configuration for the operator manager
      image:
        # -- Official NVIDIA Dynamo operator image repository
        repository: "nvcr.io/nvidia/ai-dynamo/kubernetes-operator"
        # -- Image tag (leave empty to use chart default)
        tag: ""
        # -- Image pull policy - when to pull the image
        pullPolicy: IfNotPresent

      # Command line arguments for the operator manager
      args:
        # -- Health probe endpoint for Kubernetes health checks
        - --health-probe-bind-address=:8081
        # -- Metrics endpoint for Prometheus scraping (localhost only for security)
        - --metrics-bind-address=127.0.0.1:8080

  # -- Secrets for pulling private container images
  imagePullSecrets: []

  # Core Dynamo platform configuration
  dynamo:
    # -- How long to wait before forcefully terminating Grove instances
    groveTerminationDelay: 4h

    # Internal utility images used by the platform
    internalImages:
      # -- Debugger image for troubleshooting deployments
      debugger: python:3.12-slim

    # -- Whether to enable restricted security contexts for enhanced security
    enableRestrictedSecurityContext: false

    # Docker registry configuration for private repositories
    dockerRegistry:
      # -- Whether to use Kubernetes secrets for registry authentication
      useKubernetesSecret: false
      # -- Docker registry server URL
      server:
      # -- Registry username
      username:
      # -- Registry password (consider using existingSecretName instead)
      password:
      # -- Name of existing Kubernetes secret containing registry credentials
      existingSecretName:
      # -- Whether the registry uses HTTPS
      secure: true

    # Ingress configuration for external access
    ingress:
      # -- Whether to create ingress resources
      enabled: false
      # -- Ingress class name (e.g., "nginx", "traefik")
      className:
      # -- Secret name containing TLS certificates
      tlsSecretName: my-tls-secret

    # Istio service mesh configuration
    istio:
      # -- Whether to enable Istio integration
      enabled: false
      # -- Istio gateway name for routing
      gateway:

    # -- Host suffix for generated ingress hostnames
    ingressHostSuffix: ""

    # -- Whether VirtualServices should support HTTPS routing
    virtualServiceSupportsHTTPS: false

    # Metrics configuration
    metrics:
      # -- Endpoint that services can use to retrieve metrics. If set, dynamo operator will automatically inject the PROMETHEUS_ENDPOINT environment variable into services it manages. Users can override the value of the PROMETHEUS_ENDPOINT environment variable by modifying the corresponding deployment's environment variables.
      prometheusEndpoint: ""

    # MPI Run configuration
    mpiRun:
      # -- Name of the secret containing the SSH key for MPI Run
      secretName: "mpi-run-ssh-secret"
      # SSH key generation configuration
      sshKeygen:
        # -- Whether to enable SSH key generation for MPI Run
        enabled: true

# Grove component - distributed inference orchestration
grove:
  # -- Whether to enable Grove for multi-node inference coordination. If enabled, the Grove operator will be deployed cluster-wide.
  enabled: false

# Kai Scheduler component - advanced workload scheduling
kai-scheduler:
  # -- Whether to enable Kai Scheduler for intelligent resource allocation. If enabled, the Kai Scheduler operator will be deployed cluster-wide.
  enabled: false

# etcd configuration - distributed key-value store for operator state
etcd:
  # -- Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd , all etcd settings should be prefixed with "etcd."
  enabled: true

  image:
    # -- Following the Bitnami brownout announcement (https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog), we need to use the legacy repository until we migrate to the new "secure" repository
    repository: bitnamilegacy/etcd
    tag: 3.5.18-debian-12-r5

  # Persistent storage configuration for etcd data
  persistence:
    # Whether to enable persistent storage (recommended for production)
    enabled: true
    # Use the cluster default storage-class or override with a named class
    storageClass: null
    # Size of persistent volume for etcd data
    size: 1Gi

  # Pre-upgrade job configuration
  preUpgradeJob:
    # Whether to run pre-upgrade validation jobs
    enabled: false

  # Number of etcd replicas (1 for single-node, 3+ for HA)
  replicaCount: 1

  # Authentication and authorization settings
  # Explicitly remove authentication for simplified internal communication
  auth:
    rbac:
      # Whether to create RBAC authentication (disabled for internal use)
      create: false

  # Health check configuration
  readinessProbe:
    # Whether to enable readiness probes (disabled to reduce startup complexity)
    enabled: false

  livenessProbe:
    # Whether to enable liveness probes (disabled to reduce startup complexity)
    enabled: false

  # Node tolerations for etcd pods (allows scheduling on specific nodes)
  tolerations: []

# NATS configuration - messaging system for operator communication
nats:
  # -- Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats , all nats settings should be prefixed with "nats."
  enabled: true

  # TLS Certificate Authority configuration for secure communication
  # Reference a common CA Certificate or Bundle in all nats config `tls` blocks and nats-box contexts
  # Note: `tls.verify` still must be set in the appropriate nats config `tls` blocks to require mTLS
  tlsCA:
    # Whether to enable TLS CA configuration
    enabled: false

  # Core NATS server configuration
  config:
    # NATS clustering for high availability (multiple NATS servers)
    cluster:
      # Whether to enable NATS clustering (disabled for single-node setups)
      enabled: false

    # JetStream - persistent messaging and streaming capabilities
    jetstream:
      # Whether to enable JetStream (recommended for persistent messaging)
      enabled: true

      # File-based storage for JetStream streams and consumers
      fileStore:
        # Whether to enable file storage (persistent across restarts)
        enabled: true
        # Directory path for JetStream file storage
        dir: /data

        ############################################################
        # Persistent Volume Claim for JetStream file storage
        ############################################################
        pvc:
          # Whether to create a PVC for JetStream storage
          enabled: true
          # Size of the persistent volume for JetStream data
          size: 10Gi
          # Storage class name (leave empty for default)
          storageClassName:

          # Advanced PVC configuration (merge additional fields)
          # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#persistentvolumeclaim-v1-core
          merge: {}
          patch: []
          # PVC name (defaults to "{{ include "nats.fullname" $ }}-js")
          name:

        # Maximum size for JetStream file storage (defaults to PVC size)
        maxSize:

      # Memory-based storage for JetStream (non-persistent)
      memoryStore:
        # Whether to enable memory storage (faster but not persistent)
        enabled: false

      # Advanced JetStream configuration
      # For options see: https://docs.nats.io/running-a-nats-service/configuration#jetstream
      merge: {}
      patch: []

    # Core NATS server settings
    nats:
      # Port for NATS client connections
      port: 4222

      # TLS configuration for encrypted connections
      tls:
        # Whether to enable TLS encryption
        enabled: false
        # Advanced TLS configuration
        # For options see: https://docs.nats.io/running-a-nats-service/configuration/securing_nats/tls
        merge: {}
        patch: []

    # Leaf nodes for creating NATS topologies and remote connections
    leafnodes:
      # Whether to enable leaf node connections
      enabled: false

    # WebSocket support for browser-based NATS clients
    websocket:
      # Whether to enable WebSocket protocol support
      enabled: false

    # MQTT protocol bridge for IoT device connectivity
    mqtt:
      # Whether to enable MQTT protocol support
      enabled: false

    # Gateway connections for multi-cluster NATS deployments
    gateway:
      # Whether to enable gateway connections
      enabled: false

    # HTTP monitoring endpoint for NATS server metrics
    monitor:
      # Whether to enable HTTP monitoring interface
      enabled: true
      # Port for monitoring HTTP endpoint
      port: 8222

      # TLS configuration for monitoring endpoint
      tls:
        # Whether to enable HTTPS for monitoring (requires config.nats.tls enabled)
        # When enabled, monitoring port will use HTTPS with the options from config.nats.tls
        enabled: false

    # Go pprof profiling endpoint for performance debugging
    profiling:
      # Whether to enable profiling endpoint (for debugging only)
      enabled: false
      # Port for profiling endpoint
      port: 65432

    # Account resolver for multi-tenant NATS deployments
    resolver:
      # Whether to enable account resolution (for advanced multi-tenancy)
      enabled: false

    # Server naming configuration
    # Adds a prefix to the server name, which defaults to the pod name
    # Helpful for ensuring server name is unique in a super cluster
    serverNamePrefix: ""

    # Advanced NATS configuration merging and patching
    # For complete options see: https://docs.nats.io/running-a-nats-service/configuration
    # Special rules apply:
    #  1. strings that start with << and end with >> will be unquoted
    #     use this for variables and numbers with units
    #  2. keys ending in $include will be switched to include directives
    #     keys are sorted alphabetically, use prefix before $includes to control includes ordering
    #     paths should be relative to /etc/nats-config/nats.conf
    # Example:
    #   merge:
    #     $include: ./my-config.conf
    #     zzz$include: ./my-config-last.conf
    #     server_name: nats
    #     authorization:
    #       token: << $TOKEN >>
    #     jetstream:
    #       max_memory_store: << 1GB >>
    merge:
      # 10 MiB (10485760 bytes), which allows for larger context sizes: the default NATS max payload size is 1MB,
      # and 256K tokens (each token being an int32, i.e. 4 bytes) tips over that 1MB limit.
      max_payload: 10485760
    patch: []

  ############################################################
  # NATS container configuration in StatefulSet
  ############################################################
  container:
    # NATS server container image configuration
    image:
      # Official NATS server repository
      repository: nats
      # NATS server version (Alpine-based for smaller size)
      tag: 2.10.21-alpine
      # Image pull policy (leave empty for chart default)
      pullPolicy:
      # Custom registry URL (leave empty for Docker Hub)
      registry:

    # Container port configuration
    # Note: Ports must also be enabled in the config section above
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#containerport-v1-core
    ports:
      # Main NATS client connection port
      nats: {}
      # Leaf node connection port
      leafnodes: {}
      # WebSocket connection port
      websocket: {}
      # MQTT protocol port
      mqtt: {}
      # Cluster communication port
      cluster: {}
      # Gateway connection port
      gateway: {}
      # HTTP monitoring port
      monitor: {}
      # Go profiling port
      profiling: {}

    # Environment variables for the NATS container
    # Map with key as env var name, value can be string or map
    # Example:
    #   env:
    #     GOMEMLIMIT: 7GiB
    #     TOKEN:
    #       valueFrom:
    #         secretKeyRef:
    #           name: nats-auth
    #           key: token
    env: {}

    # Advanced container configuration merging and patching
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
  # Configuration reloader container for hot config updates
  ############################################################
  reloader:
    # Whether to enable the config reloader sidecar container
    enabled: true

    # Config reloader container image
    image:
      # Official NATS config reloader repository
      repository: natsio/nats-server-config-reloader
      # Config reloader version
      tag: 0.16.0
      # Image pull policy (leave empty for chart default)
      pullPolicy:
      # Custom registry URL (leave empty for Docker Hub)
      registry:

    # Environment variables for the reloader container
    env: {}

    # Volume mount prefixes from NATS container to share with reloader
    # All NATS container volume mounts with these prefixes will be mounted into the reloader
    natsVolumeMountPrefixes:
      - /etc/

    # Advanced reloader container configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
  # Prometheus metrics exporter container (optional)
  # Note: config.monitor must be enabled for this to work
  ############################################################
  promExporter:
    # Whether to enable Prometheus metrics exporter sidecar
    enabled: false

  ############################################################
  # Kubernetes Service for NATS access
  ############################################################
  service:
    # Whether to create a Kubernetes Service for NATS
    enabled: true

    # Service port configuration
    # Additional boolean field 'enabled' controls whether port is exposed in the service
    # Note: Ports must also be enabled in the config section above
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#serviceport-v1-core
    ports:
      # Main NATS client connection port
      nats:
        enabled: true
      # Leaf node connection port
      leafnodes:
        enabled: true
      # WebSocket connection port
      websocket:
        enabled: true
      # MQTT protocol port
      mqtt:
        enabled: true
      # Cluster communication port (typically internal only)
      cluster:
        enabled: false
      # Gateway connection port (typically internal only)
      gateway:
        enabled: false
      # HTTP monitoring port (typically internal only)
      monitor:
        enabled: false
      # Go profiling port (typically internal only)
      profiling:
        enabled: false

    # Advanced service configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
    # Service name (defaults to "{{ include "nats.fullname" $ }}")
    name:

  ############################################################
  # Advanced NATS Kubernetes resource configuration
  ############################################################

  # StatefulSet configuration for NATS server persistence
  statefulSet:
    # Advanced StatefulSet configuration merging and patching
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#statefulset-v1-apps
    merge: {}
    patch: []
    # StatefulSet name (defaults to "{{ include "nats.fullname" $ }}")
    name:

  # Pod template configuration for NATS StatefulSet
  podTemplate:
    # Whether to add a hash of the ConfigMap as a pod annotation
    # This will cause the StatefulSet to roll when the ConfigMap is updated
    configChecksumAnnotation: true

    # Pod topology spread constraints for better distribution across nodes
    # Map of topologyKey: topologySpreadConstraint
    # labelSelector will be added automatically to match StatefulSet pods
    # Example:
    #   topologySpreadConstraints:
    #     kubernetes.io/hostname:
    #       maxSkew: 1
    topologySpreadConstraints: {}

    # Advanced pod template configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#pod-v1-core
    merge:
      spec:
        # Node tolerations for NATS pods (allows scheduling on specific nodes)
        tolerations: []
    patch: []

  # Headless service for StatefulSet pod discovery
  headlessService:
    # Advanced headless service configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
    # Headless service name (defaults to "{{ include "nats.fullname" $ }}-headless")
    name:

  # ConfigMap for NATS server configuration
  configMap:
    # Advanced ConfigMap configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#configmap-v1-core
    merge: {}
    patch: []
    # ConfigMap name (defaults to "{{ include "nats.fullname" $ }}-config")
    name:

  # Pod Disruption Budget for controlled rolling updates
  podDisruptionBudget:
    # Whether to create a PodDisruptionBudget (recommended for production)
    enabled: true

  # Service Account for NATS server pods
  serviceAccount:
    # Whether to create and use a dedicated service account
    enabled: false

  ############################################################
  # NATS Box - CLI tools and debugging container
  # NATS Box provides CLI tools for interacting with the NATS server
  ############################################################
  natsBox:
    # Whether to deploy NATS Box for CLI access and debugging
    enabled: false

    ############################################################
    # NATS client contexts for authentication and connection
    ############################################################
    contexts:
      # Default context configuration
      default:
        # Credentials-based authentication
        creds:
          # Inline credentials file contents (base64 encoded)
          contents:
          # Name of existing secret containing credentials file
          secretName:
          # Directory to mount credentials (defaults to /etc/nats-creds/<context-name>)
          dir:
          # Key name in secret for credentials file
          key: nats.creds

        # NKey-based authentication (public/private key pairs)
        nkey:
          # Inline NKey file contents (base64 encoded)
          contents:
          # Name of existing secret containing NKey file
          secretName:
          # Directory to mount NKey (defaults to /etc/nats-nkeys/<context-name>)
          dir:
          # Key name in secret for NKey file
          key: nats.nk

        # TLS client certificate authentication
        tls:
          # Name of existing secret containing TLS client certificates
          secretName:
          # Directory to mount certificates (defaults to /etc/nats-certs/<context-name>)
          dir:
          # Certificate file name in secret
          cert: tls.crt
          # Private key file name in secret
          key: tls.key

        # Advanced context configuration
        # For options see: https://docs.nats.io/using-nats/nats-tools/nats_cli#nats-contexts
        merge: {}
        patch: []

    # Name of context to select by default for NATS CLI operations
    defaultContextName: default

    ############################################################
    # NATS Box container configuration
    ############################################################
    container:
      # NATS Box container image
      image:
        # Official NATS Box repository with CLI tools
        repository: natsio/nats-box
        # NATS Box version
        tag: 0.14.5
        # Image pull policy (leave empty for chart default)
        pullPolicy:
        # Custom registry URL (leave empty for Docker Hub)
        registry:

      # Environment variables for NATS Box container
      env: {}

      # Advanced container configuration
      # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
      merge: {}
      patch: []

    # Service Account for NATS Box deployment
    serviceAccount:
      # Whether to create and use a dedicated service account for NATS Box
      enabled: false

    # Pod template configuration for NATS Box deployment
    podTemplate:
      merge:
        spec:
          # Node tolerations for NATS Box pods
          tolerations: []
      patch: []