values.yaml 24 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Used to generate top-level secrets (overridden by custom-values.yaml)
16

17
18
19
# Subcharts configuration

# Dynamo operator configuration
20
dynamo-operator:
21
  # -- Whether to enable the Dynamo Kubernetes operator deployment
22
  enabled: true
23
24

  # -- NATS server address for operator communication (leave empty to use the bundled NATS chart). Format: "nats://hostname:port"
25
  natsAddr: ""
26
27

  # -- etcd server address for operator state storage (leave empty to use the bundled etcd chart). Format: "http://hostname:port" or "https://hostname:port"
28
  etcdAddr: ""
29

30
31
32
  # -- URL for the Model Express server if not deployed by this helm chart. This is ignored if Model Express server is installed by this helm chart (global.model-express.enabled is true).
  modelExpressURL: ""
  # -- Namespace access controls for the operator
33
  namespaceRestriction:
34
35
    # -- Whether to restrict operator to specific namespaces. By default, the operator will run with cluster-wide permissions. Only 1 instance of the operator should be deployed in the cluster. If you want to deploy multiple operator instances, you can set this to true and specify the target namespace (by default, the target namespace is the helm release namespace).
    enabled: false
36
    # -- Target namespace for operator deployment (leave empty for current namespace)
37
    targetNamespace:
38
39
40
41
42
43
44
    # Namespace scope marker lease configuration. The lease is used to prevent
    # conflicts when running both cluster-wide and namespace-restricted operators.
    lease:
      # Duration before the namespace scope marker lease expires if not renewed
      # (namespace-restricted mode only).
      # When a namespace-restricted operator is running, it creates a lease in its
      # namespace. The cluster-wide operator detects this lease and excludes that
      # namespace from processing. If the namespace operator stops renewing the
      # lease (e.g., it crashes), the lease expires and the cluster-wide operator
      # automatically resumes processing that namespace.
      duration: 30s
      # Interval for renewing the namespace scope marker lease
      # (namespace-restricted mode only). The namespace-restricted operator renews
      # its lease at this interval to signal that it is still running.
      renewInterval: 10s

45
46

  # Controller manager configuration
47
  controllerManager:
48
    # -- Node tolerations for controller manager pods
49
    tolerations: []
50

51
52
53
    # -- Affinity for controller manager pods. This is a Kubernetes Affinity object (a map with optional nodeAffinity / podAffinity / podAntiAffinity keys), so the empty default is a map rather than a list.
    affinity: {}

54
55
56
57
58
59
60
    # Leader election configuration for cluster-wide coordination
    leaderElection:
      # -- Leader election ID for cluster-wide coordination. If empty, defaults to dynamo.nvidia.com (shared across all cluster-wide operators). WARNING: All cluster-wide operators must use the SAME ID to prevent split-brain. Different IDs would allow multiple leaders simultaneously.
      id: ""
      # -- Namespace for leader election leases (only used in cluster-wide mode). If empty, defaults to kube-system for cluster-wide coordination. All cluster-wide operators should use the SAME namespace for proper leader election.
      namespace: ""

61
    manager:
62
      # Container image configuration for the operator manager
63
      image:
64
        # -- Official NVIDIA Dynamo operator image repository
65
        repository: "nvcr.io/nvidia/ai-dynamo/kubernetes-operator"
66
        # -- Image tag (leave empty to use chart default)
67
        tag: ""
68
        # -- Image pull policy - when to pull the image
69
        pullPolicy: IfNotPresent
70
71

      # Command line arguments for the operator manager
72
      args:
73
        # -- Health probe endpoint for Kubernetes health checks
74
        - --health-probe-bind-address=:8081
75
        # -- Metrics endpoint for Prometheus scraping (localhost only for security)
76
        - --metrics-bind-address=127.0.0.1:8080
77
78

  # -- Secrets for pulling private container images
79
  imagePullSecrets: []
80
81

  # Core Dynamo platform configuration
82
  dynamo:
83
    # -- How long to wait before forcefully terminating Grove instances
84
    groveTerminationDelay: 4h
85
86

    # Internal utility images used by the platform
87
    internalImages:
88
      # -- Debugger image for troubleshooting deployments
89
      debugger: python:3.12-slim
90
91

    # -- Whether to enable restricted security contexts for enhanced security
92
    enableRestrictedSecurityContext: false
93
94

    # Docker registry configuration for private repositories
95
    dockerRegistry:
96
      # -- Whether to use Kubernetes secrets for registry authentication
97
      useKubernetesSecret: false
98
      # -- Docker registry server URL
99
      server:
100
      # -- Registry username
101
      username:
102
      # -- Registry password (consider using existingSecretName instead)
103
      password:
104
      # -- Name of existing Kubernetes secret containing registry credentials
105
      existingSecretName:
106
      # -- Whether the registry uses HTTPS
107
      secure: true
108
109

    # Ingress configuration for external access
110
    ingress:
111
      # -- Whether to create ingress resources
112
      enabled: false
113
      # -- Ingress class name (e.g., "nginx", "traefik")
114
      className:
115
      # -- Secret name containing TLS certificates
116
      tlsSecretName: my-tls-secret
117
118

    # Istio service mesh configuration
119
    istio:
120
      # -- Whether to enable Istio integration
121
      enabled: false
122
      # -- Istio gateway name for routing
123
      gateway:
124
125

    # -- Host suffix for generated ingress hostnames
126
    ingressHostSuffix: ""
127
128

    # -- Whether VirtualServices should support HTTPS routing
129
    virtualServiceSupportsHTTPS: false
130

131
132
133
134
135
    # Metrics configuration
    metrics:
      # -- Endpoint that services can use to retrieve metrics. If set, dynamo operator will automatically inject the PROMETHEUS_ENDPOINT environment variable into services it manages. Users can override the value of the PROMETHEUS_ENDPOINT environment variable by modifying the corresponding deployment's environment variables
      prometheusEndpoint: ""

136
137
138
139
140
141
142
143
144
    # MPI Run configuration
    mpiRun:
      # -- Name of the secret containing the SSH key for MPI Run
      secretName: "mpi-run-ssh-secret"
      # SSH key generation configuration
      sshKeygen:
        # -- Whether to enable SSH key generation for MPI Run
        enabled: true

145
146
147
148
149
150
151
152
153
154
155
156

# Grove component - distributed inference orchestration
grove:
  # -- Whether to enable Grove for multi-node inference coordination. If enabled, the Grove operator will be deployed cluster-wide.
  enabled: false

# Kai Scheduler component - advanced workload scheduling
kai-scheduler:
  # -- Whether to enable Kai Scheduler for intelligent resource allocation. If enabled, the Kai Scheduler operator will be deployed cluster-wide.
  enabled: false

# etcd configuration - distributed key-value store for operator state
157
etcd:
158

159
  # -- Whether to enable the etcd deployment; disable it if you want to use an external etcd instance. For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd (all etcd settings should be prefixed with "etcd.")
160
  enabled: true
161

162
  image:
163
    # -- etcd image repository. Following the Bitnami brownout announcement (https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog), we need to use the legacy repository until we migrate to the new "secure" repository.
164
    repository: bitnamilegacy/etcd
165
    tag: 3.5.18-debian-12-r5
166

167
  # Persistent storage configuration for etcd data
168
  persistence:
169
    # Whether to enable persistent storage (recommended for production)
170
171
172
    enabled: true
    # Use the cluster default storage-class or override with a named class
    storageClass: null
173
    # Size of persistent volume for etcd data
174
    size: 1Gi
175
176

  # Pre-upgrade job configuration
177
  preUpgradeJob:
178
    # Whether to run pre-upgrade validation jobs
179
    enabled: false
180
181

  # Number of etcd replicas (1 for single-node, 3+ for HA)
182
  replicaCount: 1
183
184
185

  # Authentication and authorization settings
  # Explicitly remove authentication for simplified internal communication
186
187
  auth:
    rbac:
188
      # Whether to create RBAC authentication (disabled for internal use)
189
190
      create: false

191
  # Health check configuration
192
  readinessProbe:
193
    # Whether to enable readiness probes (disabled to reduce startup complexity)
194
195
196
    enabled: false

  livenessProbe:
197
    # Whether to enable liveness probes (disabled to reduce startup complexity)
198
199
    enabled: false

200
  # Node tolerations for etcd pods (allows scheduling on specific nodes)
201
202
  tolerations: []

203
# NATS configuration - messaging system for operator communication
204
nats:
205
  # -- Whether to enable the NATS deployment; disable it if you want to use an external NATS instance. For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats (all nats settings should be prefixed with "nats.")
206
  enabled: true
207
208
209
210

  # TLS Certificate Authority configuration for secure communication
  # Reference a common CA Certificate or Bundle in all nats config `tls` blocks and nats-box contexts
  # Note: `tls.verify` still must be set in the appropriate nats config `tls` blocks to require mTLS
211
  tlsCA:
212
    # Whether to enable TLS CA configuration
213
214
    enabled: false

215
  # Core NATS server configuration
216
  config:
217
    # NATS clustering for high availability (multiple NATS servers)
218
    cluster:
219
      # Whether to enable NATS clustering (disabled for single-node setups)
220
221
      enabled: false

222
    # JetStream - persistent messaging and streaming capabilities
223
    jetstream:
224
      # Whether to enable JetStream (recommended for persistent messaging)
225
226
      enabled: true

227
      # File-based storage for JetStream streams and consumers
228
      fileStore:
229
        # Whether to enable file storage (persistent across restarts)
230
        enabled: true
231
        # Directory path for JetStream file storage
232
233
234
        dir: /data

        ############################################################
235
        # Persistent Volume Claim for JetStream file storage
236
237
        ############################################################
        pvc:
238
          # Whether to create a PVC for JetStream storage
239
          enabled: true
240
          # Size of the persistent volume for JetStream data
241
          size: 10Gi
242
          # Storage class name (leave empty for default)
243
244
          storageClassName:

245
          # Advanced PVC configuration (merge additional fields)
246
247
248
          # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#persistentvolumeclaim-v1-core
          merge: {}
          patch: []
249
          # PVC name (defaults to "{{ include "nats.fullname" $ }}-js")
250
251
          name:

252
        # Maximum size for JetStream file storage (defaults to PVC size)
253
254
        maxSize:

255
      # Memory-based storage for JetStream (non-persistent)
256
      memoryStore:
257
        # Whether to enable memory storage (faster but not persistent)
258
259
        enabled: false

260
261
      # Advanced JetStream configuration
      # For options see: https://docs.nats.io/running-a-nats-service/configuration#jetstream
262
263
264
      merge: {}
      patch: []

265
    # Core NATS server settings
266
    nats:
267
      # Port for NATS client connections
268
      port: 4222
269
270

      # TLS configuration for encrypted connections
271
      tls:
272
        # Whether to enable TLS encryption
273
        enabled: false
274
275
        # Advanced TLS configuration
        # For options see: https://docs.nats.io/running-a-nats-service/configuration/securing_nats/tls
276
277
278
        merge: {}
        patch: []

279
    # Leaf nodes for creating NATS topologies and remote connections
280
    leafnodes:
281
      # Whether to enable leaf node connections
282
283
      enabled: false

284
    # WebSocket support for browser-based NATS clients
285
    websocket:
286
      # Whether to enable WebSocket protocol support
287
288
      enabled: false

289
    # MQTT protocol bridge for IoT device connectivity
290
    mqtt:
291
      # Whether to enable MQTT protocol support
292
293
      enabled: false

294
    # Gateway connections for multi-cluster NATS deployments
295
    gateway:
296
      # Whether to enable gateway connections
297
298
      enabled: false

299
    # HTTP monitoring endpoint for NATS server metrics
300
    monitor:
301
      # Whether to enable HTTP monitoring interface
302
      enabled: true
303
      # Port for monitoring HTTP endpoint
304
      port: 8222
305
306

      # TLS configuration for monitoring endpoint
307
      tls:
308
309
        # Whether to enable HTTPS for monitoring (requires config.nats.tls enabled)
        # When enabled, monitoring port will use HTTPS with the options from config.nats.tls
310
311
        enabled: false

312
    # Go pprof profiling endpoint for performance debugging
313
    profiling:
314
      # Whether to enable profiling endpoint (for debugging only)
315
      enabled: false
316
      # Port for profiling endpoint
317
318
      port: 65432

319
    # Account resolver for multi-tenant NATS deployments
320
    resolver:
321
      # Whether to enable account resolution (for advanced multi-tenancy)
322
323
      enabled: false

324
325
326
    # Server naming configuration
    # Adds a prefix to the server name, which defaults to the pod name
    # Helpful for ensuring server name is unique in a super cluster
327
328
    serverNamePrefix: ""

329
330
331
    # Advanced NATS configuration merging and patching
    # For complete options see: https://docs.nats.io/running-a-nats-service/configuration
    # Special rules apply:
332
333
334
335
336
    #  1. strings that start with << and end with >> will be unquoted
    #     use this for variables and numbers with units
    #  2. keys ending in $include will be switched to include directives
    #     keys are sorted alphabetically, use prefix before $includes to control includes ordering
    #     paths should be relative to /etc/nats-config/nats.conf
337
    # Example:
338
339
340
341
342
343
344
345
    #   merge:
    #     $include: ./my-config.conf
    #     zzz$include: ./my-config-last.conf
    #     server_name: nats
    #     authorization:
    #       token: << $TOKEN >>
    #     jetstream:
    #       max_memory_store: << 1GB >>
346
347
348
    merge:
      # 10 MiB (10485760 bytes), which allows for larger context sizes. The default NATS max payload size is 1MB, and 256K tokens (int32, 4 bytes each) tip over that 1MB limit.
      max_payload: 10485760
349
350
351
    patch: []

  ############################################################
352
  # NATS container configuration in StatefulSet
353
354
  ############################################################
  container:
355
    # NATS server container image configuration
356
    image:
357
      # Official NATS server repository
358
      repository: nats
359
      # NATS server version (Alpine-based for smaller size)
360
      tag: 2.10.21-alpine
361
      # Image pull policy (leave empty for chart default)
362
      pullPolicy:
363
      # Custom registry URL (leave empty for Docker Hub)
364
365
      registry:

366
367
    # Container port configuration
    # Note: Ports must also be enabled in the config section above
368
369
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#containerport-v1-core
    ports:
370
      # Main NATS client connection port
371
      nats: {}
372
      # Leaf node connection port
373
      leafnodes: {}
374
      # WebSocket connection port
375
      websocket: {}
376
      # MQTT protocol port
377
      mqtt: {}
378
      # Cluster communication port
379
      cluster: {}
380
      # Gateway connection port
381
      gateway: {}
382
      # HTTP monitoring port
383
      monitor: {}
384
      # Go profiling port
385
386
      profiling: {}

387
388
389
    # Environment variables for the NATS container
    # Map with key as env var name, value can be string or map
    # Example:
390
391
392
393
394
395
396
397
398
    #   env:
    #     GOMEMLIMIT: 7GiB
    #     TOKEN:
    #       valueFrom:
    #         secretKeyRef:
    #           name: nats-auth
    #           key: token
    env: {}

399
    # Advanced container configuration merging and patching
400
401
402
403
404
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
405
  # Configuration reloader container for hot config updates
406
407
  ############################################################
  reloader:
408
    # Whether to enable the config reloader sidecar container
409
    enabled: true
410
411

    # Config reloader container image
412
    image:
413
      # Official NATS config reloader repository
414
      repository: natsio/nats-server-config-reloader
415
      # Config reloader version
416
      tag: 0.16.0
417
      # Image pull policy (leave empty for chart default)
418
      pullPolicy:
419
      # Custom registry URL (leave empty for Docker Hub)
420
421
      registry:

422
    # Environment variables for the reloader container
423
424
    env: {}

425
426
    # Volume mount prefixes from NATS container to share with reloader
    # All NATS container volume mounts with these prefixes will be mounted into the reloader
427
428
429
    natsVolumeMountPrefixes:
      - /etc/

430
    # Advanced reloader container configuration
431
432
433
434
435
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
436
  # Prometheus metrics exporter container (optional)
437
  ############################################################
438
  # Note: config.monitor must be enabled for this to work
439
  promExporter:
440
    # Whether to enable Prometheus metrics exporter sidecar
441
442
443
    enabled: false

  ############################################################
444
  # Kubernetes Service for NATS access
445
446
  ############################################################
  service:
447
    # Whether to create a Kubernetes Service for NATS
448
449
    enabled: true

450
451
452
    # Service port configuration
    # Additional boolean field 'enabled' controls whether port is exposed in the service
    # Note: Ports must also be enabled in the config section above
453
454
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#serviceport-v1-core
    ports:
455
      # Main NATS client connection port
456
457
      nats:
        enabled: true
458
      # Leaf node connection port
459
460
      leafnodes:
        enabled: true
461
      # WebSocket connection port
462
463
      websocket:
        enabled: true
464
      # MQTT protocol port
465
466
      mqtt:
        enabled: true
467
      # Cluster communication port (typically internal only)
468
469
      cluster:
        enabled: false
470
      # Gateway connection port (typically internal only)
471
472
      gateway:
        enabled: false
473
      # HTTP monitoring port (typically internal only)
474
475
      monitor:
        enabled: false
476
      # Go profiling port (typically internal only)
477
478
479
      profiling:
        enabled: false

480
    # Advanced service configuration
481
482
483
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
484
    # Service name (defaults to "{{ include "nats.fullname" $ }}")
485
486
487
    name:

  ############################################################
488
  # Advanced NATS Kubernetes resource configuration
489
490
  ############################################################

491
  # StatefulSet configuration for NATS server persistence
492
  statefulSet:
493
    # Advanced StatefulSet configuration merging and patching
494
495
496
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#statefulset-v1-apps
    merge: {}
    patch: []
497
    # StatefulSet name (defaults to "{{ include "nats.fullname" $ }}")
498
499
    name:

500
  # Pod template configuration for NATS StatefulSet
501
  podTemplate:
502
503
    # Whether to add a hash of the ConfigMap as a pod annotation
    # This will cause the StatefulSet to roll when the ConfigMap is updated
504
505
    configChecksumAnnotation: true

506
507
508
509
510
511
512
    # Pod topology spread constraints for better distribution across nodes
    # Map of topologyKey: topologySpreadConstraint
    # labelSelector will be added automatically to match StatefulSet pods
    # Example:
    #   topologySpreadConstraints:
    #     kubernetes.io/hostname:
    #       maxSkew: 1
513
514
    topologySpreadConstraints: {}

515
    # Advanced pod template configuration
516
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#pod-v1-core
517
518
    merge:
      spec:
519
        # Node tolerations for NATS pods (allows scheduling on specific nodes)
520
        tolerations: []
521
522
    patch: []

523
  # Headless service for StatefulSet pod discovery
524
  headlessService:
525
    # Advanced headless service configuration
526
527
528
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
529
    # Headless service name (defaults to "{{ include "nats.fullname" $ }}-headless")
530
531
    name:

532
  # ConfigMap for NATS server configuration
533
  configMap:
534
    # Advanced ConfigMap configuration
535
536
537
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#configmap-v1-core
    merge: {}
    patch: []
538
    # ConfigMap name (defaults to "{{ include "nats.fullname" $ }}-config")
539
540
    name:

541
  # Pod Disruption Budget for controlled rolling updates
542
  podDisruptionBudget:
543
    # Whether to create a PodDisruptionBudget (recommended for production)
544
545
    enabled: true

546
  # Service Account for NATS server pods
547
  serviceAccount:
548
    # Whether to create and use a dedicated service account
549
550
551
    enabled: false

  ############################################################
552
553
  # NATS Box - CLI tools and debugging container
  # NATS Box provides CLI tools for interacting with NATS server
554
555
  ############################################################
  natsBox:
556
    # Whether to deploy NATS Box for CLI access and debugging
557
    enabled: false
558
559

    ############################################################
560
    # NATS client contexts for authentication and connection
561
562
    ############################################################
    contexts:
563
      # Default context configuration
564
      default:
565
        # Credentials-based authentication
566
        creds:
567
          # Inline credentials file contents (base64 encoded)
568
          contents:
569
          # Name of existing secret containing credentials file
570
          secretName:
571
          # Directory to mount credentials (defaults to /etc/nats-creds/<context-name>)
572
          dir:
573
          # Key name in secret for credentials file
574
          key: nats.creds
575
576

        # NKey-based authentication (public/private key pairs)
577
        nkey:
578
          # Inline NKey file contents (base64 encoded)
579
          contents:
580
          # Name of existing secret containing NKey file
581
          secretName:
582
          # Directory to mount NKey (defaults to /etc/nats-nkeys/<context-name>)
583
          dir:
584
          # Key name in secret for NKey file
585
          key: nats.nk
586
587

        # TLS client certificate authentication
588
        tls:
589
          # Name of existing secret containing TLS client certificates
590
          secretName:
591
          # Directory to mount certificates (defaults to /etc/nats-certs/<context-name>)
592
          dir:
593
          # Certificate file name in secret
594
          cert: tls.crt
595
          # Private key file name in secret
596
597
          key: tls.key

598
599
        # Advanced context configuration
        # For options see: https://docs.nats.io/using-nats/nats-tools/nats_cli#nats-contexts
600
601
602
        merge: {}
        patch: []

603
    # Name of context to select by default for NATS CLI operations
604
605
606
    defaultContextName: default

    ############################################################
607
    # NATS Box container configuration
608
609
    ############################################################
    container:
610
      # NATS Box container image
611
      image:
612
        # Official NATS Box repository with CLI tools
613
        repository: natsio/nats-box
614
        # NATS Box version
615
        tag: 0.14.5
616
        # Image pull policy (leave empty for chart default)
617
        pullPolicy:
618
        # Custom registry URL (leave empty for Docker Hub)
619
620
        registry:

621
      # Environment variables for NATS Box container
622
623
      env: {}

624
      # Advanced container configuration
625
626
627
      # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
      merge: {}
      patch: []
628
629

    # Service Account for NATS Box deployment
630
    serviceAccount:
631
      # Whether to create and use a dedicated service account for NATS Box
632
      enabled: false
633

634
    # Pod template configuration for NATS Box deployment
635
636
637
    podTemplate:
      merge:
        spec:
638
          # Node tolerations for NATS Box pods
639
640
          tolerations: []
      patch: []