values.yaml 24.2 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Used to generate top-level secrets (overridden by custom-values.yaml)
16

17
18
19
# Subcharts configuration

# Dynamo operator configuration
20
dynamo-operator:
21
  # -- Whether to enable the Dynamo Kubernetes operator deployment
22
  enabled: true
23
24

  # -- NATS server address for operator communication (leave empty to use the bundled NATS chart). Format: "nats://hostname:port"
25
  natsAddr: ""
26
27

  # -- etcd server address for operator state storage (leave empty to use the bundled etcd chart). Format: "http://hostname:port" or "https://hostname:port"
28
  etcdAddr: ""
29

30
31
32
  # -- URL for the Model Express server if not deployed by this helm chart. This is ignored if Model Express server is installed by this helm chart (global.model-express.enabled is true).
  modelExpressURL: ""
  # -- Namespace access controls for the operator
33
  namespaceRestriction:
34
35
    # -- Whether to restrict operator to specific namespaces. By default, the operator will run with cluster-wide permissions. Only 1 instance of the operator should be deployed in the cluster. If you want to deploy multiple operator instances, you can set this to true and specify the target namespace (by default, the target namespace is the helm release namespace).
    enabled: false
36
    # -- Target namespace for operator deployment (leave empty for current namespace)
37
    targetNamespace:
38
39
40
41
42
43
44
    # Namespace scope marker lease configuration (used to prevent conflicts when running both cluster-wide and namespace-restricted operators)
    lease:
      # Duration before the namespace scope marker lease expires if not renewed (namespace-restricted mode only). When a namespace-restricted operator is running, it creates a lease in its namespace. The cluster-wide operator detects this lease and excludes that namespace from processing. If the namespace operator stops renewing the lease (e.g., crashes), the lease expires and the cluster-wide operator automatically resumes processing that namespace.
      duration: 30s
      # Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
      renewInterval: 10s

45
46
  # -- The Dynamo discovery backend to use. By default, etcd is used for discovery. Set to "kubernetes" to use the Kubernetes API for service discovery.
  discoveryBackend: ""
47
48

  # Controller manager configuration
49
  controllerManager:
50
    # -- Node tolerations for controller manager pods
51
    tolerations: []
52

53
54
55
    # -- Affinity rules for controller manager pods. Kubernetes affinity is a mapping (nodeAffinity / podAffinity / podAntiAffinity), so the empty default is an empty map rather than a list.
    affinity: {}

56
57
58
59
60
61
62
    # Leader election configuration for cluster-wide coordination
    leaderElection:
      # -- Leader election ID for cluster-wide coordination. WARNING: All cluster-wide operators must use the SAME ID to prevent split-brain. Different IDs would allow multiple leaders simultaneously.
      id: ""  # If empty, defaults to: dynamo.nvidia.com (shared across all cluster-wide operators)
      # -- Namespace for leader election leases (only used in cluster-wide mode). If empty, defaults to kube-system for cluster-wide coordination. All cluster-wide operators should use the SAME namespace for proper leader election.
      namespace: ""

63
    manager:
64
      # Container image configuration for the operator manager
65
      image:
66
        # -- Official NVIDIA Dynamo operator image repository
67
        repository: "nvcr.io/nvidia/ai-dynamo/kubernetes-operator"
68
        # -- Image tag (leave empty to use chart default)
69
        tag: ""
70
        # -- Image pull policy - when to pull the image
71
        pullPolicy: IfNotPresent
72
73

      # Command line arguments for the operator manager
74
      args:
75
        # -- Health probe endpoint for Kubernetes health checks
76
        - --health-probe-bind-address=:8081
77
        # -- Metrics endpoint for Prometheus scraping (localhost only for security)
78
        - --metrics-bind-address=127.0.0.1:8080
79
80

  # -- Secrets for pulling private container images
81
  imagePullSecrets: []
82
83

  # Core Dynamo platform configuration
84
  dynamo:
85
    # -- How long to wait before forcefully terminating Grove instances
86
    groveTerminationDelay: 4h
87
88

    # Internal utility images used by the platform
89
    internalImages:
90
      # -- Debugger image for troubleshooting deployments
91
      debugger: python:3.12-slim
92
93

    # -- Whether to enable restricted security contexts for enhanced security
94
    enableRestrictedSecurityContext: false
95
96

    # Docker registry configuration for private repositories
97
    dockerRegistry:
98
      # -- Whether to use Kubernetes secrets for registry authentication
99
      useKubernetesSecret: false
100
      # -- Docker registry server URL
101
      server:
102
      # -- Registry username
103
      username:
104
      # -- Registry password (consider using existingSecretName instead)
105
      password:
106
      # -- Name of existing Kubernetes secret containing registry credentials
107
      existingSecretName:
108
      # -- Whether the registry uses HTTPS
109
      secure: true
110
111

    # Ingress configuration for external access
112
    ingress:
113
      # -- Whether to create ingress resources
114
      enabled: false
115
      # -- Ingress class name (e.g., "nginx", "traefik")
116
      className:
117
      # -- Secret name containing TLS certificates
118
      tlsSecretName: my-tls-secret
119
120

    # Istio service mesh configuration
121
    istio:
122
      # -- Whether to enable Istio integration
123
      enabled: false
124
      # -- Istio gateway name for routing
125
      gateway:
126
127

    # -- Host suffix for generated ingress hostnames
128
    ingressHostSuffix: ""
129
130

    # -- Whether VirtualServices should support HTTPS routing
131
    virtualServiceSupportsHTTPS: false
132

133
134
135
136
137
    # Metrics configuration
    metrics:
      # -- Endpoint that services can use to retrieve metrics. If set, dynamo operator will automatically inject the PROMETHEUS_ENDPOINT environment variable into services it manages. Users can override the value of the PROMETHEUS_ENDPOINT environment variable by modifying the corresponding deployment's environment variables
      prometheusEndpoint: ""

138
139
140
141
142
143
144
145
146
    # MPI Run configuration
    mpiRun:
      # -- Name of the secret containing the SSH key for MPI Run
      secretName: "mpi-run-ssh-secret"
      # SSH key generation configuration
      sshKeygen:
        # -- Whether to enable SSH key generation for MPI Run
        enabled: true

147
148
149
150
151
152
153
154
155
156
157
158

# Grove component - distributed inference orchestration
grove:
  # -- Whether to enable Grove for multi-node inference coordination. If enabled, the Grove operator is deployed cluster-wide.
  enabled: false

# Kai Scheduler component - advanced workload scheduling
kai-scheduler:
  # -- Whether to enable Kai Scheduler for intelligent resource allocation. If enabled, the Kai Scheduler operator is deployed cluster-wide.
  enabled: false

# etcd configuration - distributed key-value store for operator state
159
etcd:
160

161
  # -- Whether to enable the etcd deployment. Disable it if you want to use an external etcd instance. For complete configuration options, see https://github.com/bitnami/charts/tree/main/bitnami/etcd (all etcd settings should be prefixed with "etcd.").
162
  enabled: true
163

164
  image:
165
    # -- etcd image repository. Following the Bitnami brownout announcement (https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog), the legacy repository must be used until we migrate to the new "secure" repository.
166
    repository: bitnamilegacy/etcd
167
    tag: 3.5.18-debian-12-r5
168

169
  # Persistent storage configuration for etcd data
170
  persistence:
171
    # Whether to enable persistent storage (recommended for production)
172
173
174
    enabled: true
    # Use the cluster default storage-class or override with a named class
    storageClass: null
175
    # Size of persistent volume for etcd data
176
    size: 1Gi
177
178

  # Pre-upgrade job configuration
179
  preUpgradeJob:
180
    # Whether to run pre-upgrade validation jobs
181
    enabled: false
182
183

  # Number of etcd replicas (1 for single-node, 3+ for HA)
184
  replicaCount: 1
185
186
187

  # Authentication and authorization settings
  # Explicitly remove authentication for simplified internal communication
188
189
  auth:
    rbac:
190
      # Whether to create RBAC authentication (disabled for internal use)
191
192
      create: false

193
  # Health check configuration
194
  readinessProbe:
195
    # Whether to enable readiness probes (disabled to reduce startup complexity)
196
197
198
    enabled: false

  livenessProbe:
199
    # Whether to enable liveness probes (disabled to reduce startup complexity)
200
201
    enabled: false

202
  # Node tolerations for etcd pods (allows scheduling on specific nodes)
203
204
  tolerations: []

205
# NATS configuration - messaging system for operator communication
206
nats:
207
  # -- Whether to enable the NATS deployment. Disable it if you want to use an external NATS instance. For complete configuration options, see https://github.com/nats-io/k8s/tree/main/helm/charts/nats (all NATS settings should be prefixed with "nats.").
208
  enabled: true
209
210
211
212

  # TLS Certificate Authority configuration for secure communication
  # Reference a common CA Certificate or Bundle in all nats config `tls` blocks and nats-box contexts
  # Note: `tls.verify` still must be set in the appropriate nats config `tls` blocks to require mTLS
213
  tlsCA:
214
    # Whether to enable TLS CA configuration
215
216
    enabled: false

217
  # Core NATS server configuration
218
  config:
219
    # NATS clustering for high availability (multiple NATS servers)
220
    cluster:
221
      # Whether to enable NATS clustering (disabled for single-node setups)
222
223
      enabled: false

224
    # JetStream - persistent messaging and streaming capabilities
225
    jetstream:
226
      # Whether to enable JetStream (recommended for persistent messaging)
227
228
      enabled: true

229
      # File-based storage for JetStream streams and consumers
230
      fileStore:
231
        # Whether to enable file storage (persistent across restarts)
232
        enabled: true
233
        # Directory path for JetStream file storage
234
235
236
        dir: /data

        ############################################################
237
        # Persistent Volume Claim for JetStream file storage
238
239
        ############################################################
        pvc:
240
          # Whether to create a PVC for JetStream storage
241
          enabled: true
242
          # Size of the persistent volume for JetStream data
243
          size: 10Gi
244
          # Storage class name (leave empty for default)
245
246
          storageClassName:

247
          # Advanced PVC configuration (merge additional fields)
248
249
250
          # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#persistentvolumeclaim-v1-core
          merge: {}
          patch: []
251
          # PVC name (defaults to "{{ include "nats.fullname" $ }}-js")
252
253
          name:

254
        # Maximum size for JetStream file storage (defaults to PVC size)
255
256
        maxSize:

257
      # Memory-based storage for JetStream (non-persistent)
258
      memoryStore:
259
        # Whether to enable memory storage (faster but not persistent)
260
261
        enabled: false

262
263
      # Advanced JetStream configuration
      # For options see: https://docs.nats.io/running-a-nats-service/configuration#jetstream
264
265
266
      merge: {}
      patch: []

267
    # Core NATS server settings
268
    nats:
269
      # Port for NATS client connections
270
      port: 4222
271
272

      # TLS configuration for encrypted connections
273
      tls:
274
        # Whether to enable TLS encryption
275
        enabled: false
276
277
        # Advanced TLS configuration
        # For options see: https://docs.nats.io/running-a-nats-service/configuration/securing_nats/tls
278
279
280
        merge: {}
        patch: []

281
    # Leaf nodes for creating NATS topologies and remote connections
282
    leafnodes:
283
      # Whether to enable leaf node connections
284
285
      enabled: false

286
    # WebSocket support for browser-based NATS clients
287
    websocket:
288
      # Whether to enable WebSocket protocol support
289
290
      enabled: false

291
    # MQTT protocol bridge for IoT device connectivity
292
    mqtt:
293
      # Whether to enable MQTT protocol support
294
295
      enabled: false

296
    # Gateway connections for multi-cluster NATS deployments
297
    gateway:
298
      # Whether to enable gateway connections
299
300
      enabled: false

301
    # HTTP monitoring endpoint for NATS server metrics
302
    monitor:
303
      # Whether to enable HTTP monitoring interface
304
      enabled: true
305
      # Port for monitoring HTTP endpoint
306
      port: 8222
307
308

      # TLS configuration for monitoring endpoint
309
      tls:
310
311
        # Whether to enable HTTPS for monitoring (requires config.nats.tls enabled)
        # When enabled, monitoring port will use HTTPS with the options from config.nats.tls
312
313
        enabled: false

314
    # Go pprof profiling endpoint for performance debugging
315
    profiling:
316
      # Whether to enable profiling endpoint (for debugging only)
317
      enabled: false
318
      # Port for profiling endpoint
319
320
      port: 65432

321
    # Account resolver for multi-tenant NATS deployments
322
    resolver:
323
      # Whether to enable account resolution (for advanced multi-tenancy)
324
325
      enabled: false

326
327
328
    # Server naming configuration
    # Adds a prefix to the server name, which defaults to the pod name
    # Helpful for ensuring server name is unique in a super cluster
329
330
    serverNamePrefix: ""

331
332
333
    # Advanced NATS configuration merging and patching
    # For complete options see: https://docs.nats.io/running-a-nats-service/configuration
    # Special rules apply:
334
335
336
337
338
    #  1. strings that start with << and end with >> will be unquoted
    #     use this for variables and numbers with units
    #  2. keys ending in $include will be switched to include directives
    #     keys are sorted alphabetically, use prefix before $includes to control includes ordering
    #     paths should be relative to /etc/nats-config/nats.conf
339
    # Example:
340
341
342
343
344
345
346
347
    #   merge:
    #     $include: ./my-config.conf
    #     zzz$include: ./my-config-last.conf
    #     server_name: nats
    #     authorization:
    #       token: << $TOKEN >>
    #     jetstream:
    #       max_memory_store: << 1GB >>
348
349
350
    merge:
      # 10MB (10485760 bytes), which allows for larger context sizes. The default NATS max payload size is 1MB, and 256K tokens (each token being an int32, i.e. 4 bytes) tips over that 1MB max.
      max_payload: 10485760
351
352
353
    patch: []

  ############################################################
354
  # NATS container configuration in StatefulSet
355
356
  ############################################################
  container:
357
    # NATS server container image configuration
358
    image:
359
      # Official NATS server repository
360
      repository: nats
361
      # NATS server version (Alpine-based for smaller size)
362
      tag: 2.10.21-alpine
363
      # Image pull policy (leave empty for chart default)
364
      pullPolicy:
365
      # Custom registry URL (leave empty for Docker Hub)
366
367
      registry:

368
369
    # Container port configuration
    # Note: Ports must also be enabled in the config section above
370
371
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#containerport-v1-core
    ports:
372
      # Main NATS client connection port
373
      nats: {}
374
      # Leaf node connection port
375
      leafnodes: {}
376
      # WebSocket connection port
377
      websocket: {}
378
      # MQTT protocol port
379
      mqtt: {}
380
      # Cluster communication port
381
      cluster: {}
382
      # Gateway connection port
383
      gateway: {}
384
      # HTTP monitoring port
385
      monitor: {}
386
      # Go profiling port
387
388
      profiling: {}

389
390
391
    # Environment variables for the NATS container
    # Map with key as env var name, value can be string or map
    # Example:
392
393
394
395
396
397
398
399
400
    #   env:
    #     GOMEMLIMIT: 7GiB
    #     TOKEN:
    #       valueFrom:
    #         secretKeyRef:
    #           name: nats-auth
    #           key: token
    env: {}

401
    # Advanced container configuration merging and patching
402
403
404
405
406
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
407
  # Configuration reloader container for hot config updates
408
409
  ############################################################
  reloader:
410
    # Whether to enable the config reloader sidecar container
411
    enabled: true
412
413

    # Config reloader container image
414
    image:
415
      # Official NATS config reloader repository
416
      repository: natsio/nats-server-config-reloader
417
      # Config reloader version
418
      tag: 0.16.0
419
      # Image pull policy (leave empty for chart default)
420
      pullPolicy:
421
      # Custom registry URL (leave empty for Docker Hub)
422
423
      registry:

424
    # Environment variables for the reloader container
425
426
    env: {}

427
428
    # Volume mount prefixes from NATS container to share with reloader
    # All NATS container volume mounts with these prefixes will be mounted into the reloader
429
430
431
    # Share every NATS container volume mount whose path starts with /etc/
    natsVolumeMountPrefixes:
      - /etc/

432
    # Advanced reloader container configuration
433
434
435
436
437
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
438
  # Prometheus metrics exporter container (optional)
439
  ############################################################
440
  # Note: config.monitor must be enabled for this to work
441
  promExporter:
442
    # Whether to enable Prometheus metrics exporter sidecar
443
444
445
    enabled: false

  ############################################################
446
  # Kubernetes Service for NATS access
447
448
  ############################################################
  service:
449
    # Whether to create a Kubernetes Service for NATS
450
451
    enabled: true

452
453
454
    # Service port configuration
    # Additional boolean field 'enabled' controls whether port is exposed in the service
    # Note: Ports must also be enabled in the config section above
455
456
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#serviceport-v1-core
    ports:
457
      # Main NATS client connection port
458
459
      nats:
        enabled: true
460
      # Leaf node connection port
461
462
      leafnodes:
        enabled: true
463
      # WebSocket connection port
464
465
      websocket:
        enabled: true
466
      # MQTT protocol port
467
468
      mqtt:
        enabled: true
469
      # Cluster communication port (typically internal only)
470
471
      cluster:
        enabled: false
472
      # Gateway connection port (typically internal only)
473
474
      gateway:
        enabled: false
475
      # HTTP monitoring port (typically internal only)
476
477
      monitor:
        enabled: false
478
      # Go profiling port (typically internal only)
479
480
481
      profiling:
        enabled: false

482
    # Advanced service configuration
483
484
485
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
486
    # Service name (defaults to "{{ include "nats.fullname" $ }}")
487
488
489
    name:

  ############################################################
490
  # Advanced NATS Kubernetes resource configuration
491
492
  ############################################################

493
  # StatefulSet configuration for NATS server persistence
494
  statefulSet:
495
    # Advanced StatefulSet configuration merging and patching
496
497
498
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#statefulset-v1-apps
    merge: {}
    patch: []
499
    # StatefulSet name (defaults to "{{ include "nats.fullname" $ }}")
500
501
    name:

502
  # Pod template configuration for NATS StatefulSet
503
  podTemplate:
504
505
    # Whether to add a hash of the ConfigMap as a pod annotation
    # This will cause the StatefulSet to roll when the ConfigMap is updated
506
507
    configChecksumAnnotation: true

508
509
510
511
512
513
514
    # Pod topology spread constraints for better distribution across nodes
    # Map of topologyKey: topologySpreadConstraint
    # labelSelector will be added automatically to match StatefulSet pods
    # Example:
    #   topologySpreadConstraints:
    #     kubernetes.io/hostname:
    #       maxSkew: 1
515
516
    topologySpreadConstraints: {}

517
    # Advanced pod template configuration
518
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#pod-v1-core
519
520
    merge:
      spec:
521
        # Node tolerations for NATS pods (allows scheduling on specific nodes)
522
        tolerations: []
523
524
    patch: []

525
  # Headless service for StatefulSet pod discovery
526
  headlessService:
527
    # Advanced headless service configuration
528
529
530
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
531
    # Headless service name (defaults to "{{ include "nats.fullname" $ }}-headless")
532
533
    name:

534
  # ConfigMap for NATS server configuration
535
  configMap:
536
    # Advanced ConfigMap configuration
537
538
539
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#configmap-v1-core
    merge: {}
    patch: []
540
    # ConfigMap name (defaults to "{{ include "nats.fullname" $ }}-config")
541
542
    name:

543
  # Pod Disruption Budget for controlled rolling updates
544
  podDisruptionBudget:
545
    # Whether to create a PodDisruptionBudget (recommended for production)
546
547
    enabled: true

548
  # Service Account for NATS server pods
549
  serviceAccount:
550
    # Whether to create and use a dedicated service account
551
552
553
    enabled: false

  ############################################################
554
555
  # NATS Box - CLI tools and debugging container
  # NATS Box provides CLI tools for interacting with NATS server
556
557
  ############################################################
  natsBox:
558
    # Whether to deploy NATS Box for CLI access and debugging
559
    enabled: false
560
561

    ############################################################
562
    # NATS client contexts for authentication and connection
563
564
    ############################################################
    contexts:
565
      # Default context configuration
566
      default:
567
        # Credentials-based authentication
568
        creds:
569
          # Inline credentials file contents (base64 encoded)
570
          contents:
571
          # Name of existing secret containing credentials file
572
          secretName:
573
          # Directory to mount credentials (defaults to /etc/nats-creds/<context-name>)
574
          dir:
575
          # Key name in secret for credentials file
576
          key: nats.creds
577
578

        # NKey-based authentication (public/private key pairs)
579
        nkey:
580
          # Inline NKey file contents (base64 encoded)
581
          contents:
582
          # Name of existing secret containing NKey file
583
          secretName:
584
          # Directory to mount NKey (defaults to /etc/nats-nkeys/<context-name>)
585
          dir:
586
          # Key name in secret for NKey file
587
          key: nats.nk
588
589

        # TLS client certificate authentication
590
        tls:
591
          # Name of existing secret containing TLS client certificates
592
          secretName:
593
          # Directory to mount certificates (defaults to /etc/nats-certs/<context-name>)
594
          dir:
595
          # Certificate file name in secret
596
          cert: tls.crt
597
          # Private key file name in secret
598
599
          key: tls.key

600
601
        # Advanced context configuration
        # For options see: https://docs.nats.io/using-nats/nats-tools/nats_cli#nats-contexts
602
603
604
        merge: {}
        patch: []

605
    # Name of context to select by default for NATS CLI operations
606
607
608
    defaultContextName: default

    ############################################################
609
    # NATS Box container configuration
610
611
    ############################################################
    container:
612
      # NATS Box container image
613
      image:
614
        # Official NATS Box repository with CLI tools
615
        repository: natsio/nats-box
616
        # NATS Box version
617
        tag: 0.14.5
618
        # Image pull policy (leave empty for chart default)
619
        pullPolicy:
620
        # Custom registry URL (leave empty for Docker Hub)
621
622
        registry:

623
      # Environment variables for NATS Box container
624
625
      env: {}

626
      # Advanced container configuration
627
628
629
      # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
      merge: {}
      patch: []
630
631

    # Service Account for NATS Box deployment
632
    serviceAccount:
633
      # Whether to create and use a dedicated service account for NATS Box
634
      enabled: false
635

636
    # Pod template configuration for NATS Box deployment
637
638
639
    podTemplate:
      merge:
        spec:
640
          # Node tolerations for NATS Box pods
641
642
          tolerations: []
      patch: []