# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Used to generate top-level secrets (overridden by custom-values.yaml)

# Subcharts configuration

# Dynamo operator configuration
dynamo-operator:
  # -- Whether to enable the Dynamo Kubernetes operator deployment
  enabled: true

  # -- NATS server address for operator communication (leave empty to use the bundled NATS chart). Format: "nats://hostname:port"
  natsAddr: ""

  # -- etcd server address for operator state storage (leave empty to use the bundled etcd chart). Format: "http://hostname:port" or "https://hostname:port"
  etcdAddr: ""

  # Namespace access controls for the operator
  namespaceRestriction:
    # -- Whether to restrict operator to specific namespaces
    enabled: true
    # -- Target namespace for operator deployment (leave empty for current namespace)
    targetNamespace:

  # Controller manager configuration
  controllerManager:
    # -- Node tolerations for controller manager pods
    tolerations: []

    manager:
      # Container image configuration for the operator manager
      image:
        # -- Official NVIDIA Dynamo operator image repository
        repository: "nvcr.io/nvidia/ai-dynamo/kubernetes-operator"
        # -- Image tag (leave empty to use chart default)
        tag: ""
        # -- Image pull policy - when to pull the image
        pullPolicy: IfNotPresent

      # Command line arguments for the operator manager
      args:
        # -- Health probe endpoint for Kubernetes health checks
        - --health-probe-bind-address=:8081
        # -- Metrics endpoint for Prometheus scraping (localhost only for security)
        - --metrics-bind-address=127.0.0.1:8080

  # -- Secrets for pulling private container images
  imagePullSecrets: []

  # Core Dynamo platform configuration
  dynamo:
    # -- How long to wait before forcefully terminating Grove instances
    groveTerminationDelay: 15m

    # Internal utility images used by the platform
    internalImages:
      # -- Debugger image for troubleshooting deployments
      debugger: python:3.12-slim

    # -- Whether to enable restricted security contexts for enhanced security
    enableRestrictedSecurityContext: false

    # Docker registry configuration for private repositories
    dockerRegistry:
      # -- Whether to use Kubernetes secrets for registry authentication
      useKubernetesSecret: false
      # -- Docker registry server URL
      server:
      # -- Registry username
      username:
      # -- Registry password (consider using existingSecretName instead)
      password:
      # -- Name of existing Kubernetes secret containing registry credentials
      existingSecretName:
      # -- Whether the registry uses HTTPS
      secure: true

    # Ingress configuration for external access
    ingress:
      # -- Whether to create ingress resources
      enabled: false
      # -- Ingress class name (e.g., "nginx", "traefik")
      className:
      # -- Secret name containing TLS certificates
      tlsSecretName: my-tls-secret

    # Istio service mesh configuration
    istio:
      # -- Whether to enable Istio integration
      enabled: false
      # -- Istio gateway name for routing
      gateway:

    # -- Host suffix for generated ingress hostnames
    ingressHostSuffix: ""

    # -- Whether VirtualServices should support HTTPS routing
    virtualServiceSupportsHTTPS: false

# Grove component - distributed inference orchestration
grove:
  # -- Whether to enable Grove for multi-node inference coordination. When enabled, the Grove operator is deployed cluster-wide.
  enabled: false

# Kai Scheduler component - advanced workload scheduling
kai-scheduler:
  # -- Whether to enable Kai Scheduler for intelligent resource allocation. When enabled, the Kai Scheduler operator is deployed cluster-wide.
  enabled: false

# etcd configuration - distributed key-value store for operator state
# For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd
etcd:
  # -- Whether to enable etcd deployment, disable if you want to use an external etcd instance
  enabled: true

  # Persistent storage configuration for etcd data
  persistence:
    # Whether to enable persistent storage (recommended for production)
    enabled: true
    # Use the cluster default storage-class or override with a named class
    storageClass: null
    # Size of persistent volume for etcd data
    size: 1Gi

  # Pre-upgrade job configuration
  preUpgrade:
    # Whether to run pre-upgrade validation jobs
    enabled: false

  # Number of etcd replicas (1 for single-node, 3+ for HA)
  replicaCount: 1

  # Authentication and authorization settings
  # Explicitly remove authentication for simplified internal communication
  auth:
    rbac:
      # Whether to create RBAC authentication (disabled for internal use)
      create: false

  # Health check configuration
  readinessProbe:
    # Whether to enable readiness probes (disabled to reduce startup complexity)
    enabled: false

  livenessProbe:
    # Whether to enable liveness probes (disabled to reduce startup complexity)
    enabled: false

  # Node tolerations for etcd pods (allows scheduling on specific nodes)
  tolerations: []

# NATS configuration - messaging system for operator communication
# For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats
nats:
  # -- Whether to enable NATS deployment, disable if you want to use an external NATS instance
  enabled: true

  # TLS Certificate Authority configuration for secure communication
  # Reference a common CA Certificate or Bundle in all nats config `tls` blocks and nats-box contexts
  # Note: `tls.verify` still must be set in the appropriate nats config `tls` blocks to require mTLS
  tlsCA:
    # Whether to enable TLS CA configuration
    enabled: false

  # Core NATS server configuration
  config:
    # NATS clustering for high availability (multiple NATS servers)
    cluster:
      # Whether to enable NATS clustering (disabled for single-node setups)
      enabled: false

    # JetStream - persistent messaging and streaming capabilities
    jetstream:
      # Whether to enable JetStream (recommended for persistent messaging)
      enabled: true

      # File-based storage for JetStream streams and consumers
      fileStore:
        # Whether to enable file storage (persistent across restarts)
        enabled: true
        # Directory path for JetStream file storage
        dir: /data

        ############################################################
        # Persistent Volume Claim for JetStream file storage
        ############################################################
        pvc:
          # Whether to create a PVC for JetStream storage
          enabled: true
          # Size of the persistent volume for JetStream data
          size: 10Gi
          # Storage class name (leave empty for default)
          storageClassName:

          # Advanced PVC configuration (merge additional fields)
          # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#persistentvolumeclaim-v1-core
          merge: {}
          patch: []
          # PVC name (defaults to "{{ include "nats.fullname" $ }}-js")
          name:

        # Maximum size for JetStream file storage (defaults to PVC size)
        maxSize:

      # Memory-based storage for JetStream (non-persistent)
      memoryStore:
        # Whether to enable memory storage (faster but not persistent)
        enabled: false

      # Advanced JetStream configuration
      # For options see: https://docs.nats.io/running-a-nats-service/configuration#jetstream
      merge: {}
      patch: []

    # Core NATS server settings
    nats:
      # Port for NATS client connections
      port: 4222

      # TLS configuration for encrypted connections
      tls:
        # Whether to enable TLS encryption
        enabled: false
        # Advanced TLS configuration
        # For options see: https://docs.nats.io/running-a-nats-service/configuration/securing_nats/tls
        merge: {}
        patch: []

    # Leaf nodes for creating NATS topologies and remote connections
    leafnodes:
      # Whether to enable leaf node connections
      enabled: false

    # WebSocket support for browser-based NATS clients
    websocket:
      # Whether to enable WebSocket protocol support
      enabled: false

    # MQTT protocol bridge for IoT device connectivity
    mqtt:
      # Whether to enable MQTT protocol support
      enabled: false

    # Gateway connections for multi-cluster NATS deployments
    gateway:
      # Whether to enable gateway connections
      enabled: false

    # HTTP monitoring endpoint for NATS server metrics
    monitor:
      # Whether to enable HTTP monitoring interface
      enabled: true
      # Port for monitoring HTTP endpoint
      port: 8222

      # TLS configuration for monitoring endpoint
      tls:
        # Whether to enable HTTPS for monitoring (requires config.nats.tls enabled)
        # When enabled, monitoring port will use HTTPS with the options from config.nats.tls
        enabled: false

    # Go pprof profiling endpoint for performance debugging
    profiling:
      # Whether to enable profiling endpoint (for debugging only)
      enabled: false
      # Port for profiling endpoint
      port: 65432

    # Account resolver for multi-tenant NATS deployments
    resolver:
      # Whether to enable account resolution (for advanced multi-tenancy)
      enabled: false

    # Server naming configuration
    # Adds a prefix to the server name, which defaults to the pod name
    # Helpful for ensuring server name is unique in a super cluster
    serverNamePrefix: ""

    # Advanced NATS configuration merging and patching
    # For complete options see: https://docs.nats.io/running-a-nats-service/configuration
    # Special rules apply:
    #  1. strings that start with << and end with >> will be unquoted
    #     use this for variables and numbers with units
    #  2. keys ending in $include will be switched to include directives
    #     keys are sorted alphabetically, use prefix before $includes to control includes ordering
    #     paths should be relative to /etc/nats-config/nats.conf
    # Example:
    #   merge:
    #     $include: ./my-config.conf
    #     zzz$include: ./my-config-last.conf
    #     server_name: nats
    #     authorization:
    #       token: << $TOKEN >>
    #     jetstream:
    #       max_memory_store: << 1GB >>
    merge: {}
    patch: []

  ############################################################
  # NATS container configuration in StatefulSet
  ############################################################
  container:
    # NATS server container image configuration
    image:
      # Official NATS server repository
      repository: nats
      # NATS server version (Alpine-based for smaller size)
      tag: 2.10.21-alpine
      # Image pull policy (leave empty for chart default)
      pullPolicy:
      # Custom registry URL (leave empty for Docker Hub)
      registry:

    # Container port configuration
    # Note: Ports must also be enabled in the config section above
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#containerport-v1-core
    ports:
      # Main NATS client connection port
      nats: {}
      # Leaf node connection port
      leafnodes: {}
      # WebSocket connection port
      websocket: {}
      # MQTT protocol port
      mqtt: {}
      # Cluster communication port
      cluster: {}
      # Gateway connection port
      gateway: {}
      # HTTP monitoring port
      monitor: {}
      # Go profiling port
      profiling: {}

    # Environment variables for the NATS container
    # Map with key as env var name, value can be string or map
    # Example:
    #   env:
    #     GOMEMLIMIT: 7GiB
    #     TOKEN:
    #       valueFrom:
    #         secretKeyRef:
    #           name: nats-auth
    #           key: token
    env: {}

    # Advanced container configuration merging and patching
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
  # Configuration reloader container for hot config updates
  ############################################################
  reloader:
    # Whether to enable the config reloader sidecar container
    enabled: true

    # Config reloader container image
    image:
      # Official NATS config reloader repository
      repository: natsio/nats-server-config-reloader
      # Config reloader version
      tag: 0.16.0
      # Image pull policy (leave empty for chart default)
      pullPolicy:
      # Custom registry URL (leave empty for Docker Hub)
      registry:

    # Environment variables for the reloader container
    env: {}

    # Volume mount prefixes from NATS container to share with reloader
    # All NATS container volume mounts with these prefixes will be mounted into the reloader
    natsVolumeMountPrefixes:
      - /etc/

    # Advanced reloader container configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
  # Prometheus metrics exporter container (optional)
  ############################################################
  # Note: config.monitor must be enabled for this to work
  promExporter:
    # Whether to enable Prometheus metrics exporter sidecar
    enabled: false

  ############################################################
  # Kubernetes Service for NATS access
  ############################################################
  service:
    # Whether to create a Kubernetes Service for NATS
    enabled: true

    # Service port configuration
    # Additional boolean field 'enabled' controls whether port is exposed in the service
    # Note: Ports must also be enabled in the config section above
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#serviceport-v1-core
    ports:
      # Main NATS client connection port
      nats:
        enabled: true
      # Leaf node connection port
      leafnodes:
        enabled: true
      # WebSocket connection port
      websocket:
        enabled: true
      # MQTT protocol port
      mqtt:
        enabled: true
      # Cluster communication port (typically internal only)
      cluster:
        enabled: false
      # Gateway connection port (typically internal only)
      gateway:
        enabled: false
      # HTTP monitoring port (typically internal only)
      monitor:
        enabled: false
      # Go profiling port (typically internal only)
      profiling:
        enabled: false

    # Advanced service configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
    # Service name (defaults to "{{ include "nats.fullname" $ }}")
    name:

  ############################################################
  # Advanced NATS Kubernetes resource configuration
  ############################################################

  # StatefulSet configuration for NATS server persistence
  statefulSet:
    # Advanced StatefulSet configuration merging and patching
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#statefulset-v1-apps
    merge: {}
    patch: []
    # StatefulSet name (defaults to "{{ include "nats.fullname" $ }}")
    name:

  # Pod template configuration for NATS StatefulSet
  podTemplate:
    # Whether to add a hash of the ConfigMap as a pod annotation
    # This will cause the StatefulSet to roll when the ConfigMap is updated
    configChecksumAnnotation: true

    # Pod topology spread constraints for better distribution across nodes
    # Map of topologyKey: topologySpreadConstraint
    # labelSelector will be added automatically to match StatefulSet pods
    # Example:
    #   topologySpreadConstraints:
    #     kubernetes.io/hostname:
    #       maxSkew: 1
    topologySpreadConstraints: {}

    # Advanced pod template configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#pod-v1-core
    merge:
      spec:
        # Node tolerations for NATS pods (allows scheduling on specific nodes)
        tolerations: []
    patch: []

  # Headless service for StatefulSet pod discovery
  headlessService:
    # Advanced headless service configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
    # Headless service name (defaults to "{{ include "nats.fullname" $ }}-headless")
    name:

  # ConfigMap for NATS server configuration
  configMap:
    # Advanced ConfigMap configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#configmap-v1-core
    merge: {}
    patch: []
    # ConfigMap name (defaults to "{{ include "nats.fullname" $ }}-config")
    name:

  # Pod Disruption Budget for controlled rolling updates
  podDisruptionBudget:
    # Whether to create a PodDisruptionBudget (recommended for production)
    enabled: true

  # Service Account for NATS server pods
  serviceAccount:
    # Whether to create and use a dedicated service account
    enabled: false

  ############################################################
  # NATS Box - CLI tools and debugging container
  # NATS Box provides CLI tools for interacting with NATS server
  ############################################################
  natsBox:
    # Whether to deploy NATS Box for CLI access and debugging
    enabled: true

    ############################################################
    # NATS client contexts for authentication and connection
    ############################################################
    contexts:
      # Default context configuration
      default:
        # Credentials-based authentication
        creds:
          # Inline credentials file contents (base64 encoded)
          contents:
          # Name of existing secret containing credentials file
          secretName:
          # Directory to mount credentials (defaults to /etc/nats-creds/<context-name>)
          dir:
          # Key name in secret for credentials file
          key: nats.creds

        # NKey-based authentication (public/private key pairs)
        nkey:
          # Inline NKey file contents (base64 encoded)
          contents:
          # Name of existing secret containing NKey file
          secretName:
          # Directory to mount NKey (defaults to /etc/nats-nkeys/<context-name>)
          dir:
          # Key name in secret for NKey file
          key: nats.nk

        # TLS client certificate authentication
        tls:
          # Name of existing secret containing TLS client certificates
          secretName:
          # Directory to mount certificates (defaults to /etc/nats-certs/<context-name>)
          dir:
          # Certificate file name in secret
          cert: tls.crt
          # Private key file name in secret
          key: tls.key

        # Advanced context configuration
        # For options see: https://docs.nats.io/using-nats/nats-tools/nats_cli#nats-contexts
        merge: {}
        patch: []

    # Name of context to select by default for NATS CLI operations
    defaultContextName: default

    ############################################################
    # NATS Box container configuration
    ############################################################
    container:
      # NATS Box container image
      image:
        # Official NATS Box repository with CLI tools
        repository: natsio/nats-box
        # NATS Box version
        tag: 0.14.5
        # Image pull policy (leave empty for chart default)
        pullPolicy:
        # Custom registry URL (leave empty for Docker Hub)
        registry:

      # Environment variables for NATS Box container
      env: {}

      # Advanced container configuration
      # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
      merge: {}
      patch: []

    # Service Account for NATS Box deployment
    serviceAccount:
      # Whether to create and use a dedicated service account for NATS Box
      enabled: false

    # Pod template configuration for NATS Box deployment
    podTemplate:
      merge:
        spec:
          # Node tolerations for NATS Box pods
          tolerations: []
      patch: []