# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Used to generate top-level secrets (overridden by custom-values.yaml)

# Subcharts configuration

# Dynamo operator configuration
dynamo-operator:
  # -- Whether to enable the Dynamo Kubernetes operator deployment
  enabled: true

  # -- NATS server address for operator communication (leave empty to use the bundled NATS chart). Format: "nats://hostname:port"
  natsAddr: ""

  # -- etcd server address for operator state storage (leave empty to use the bundled etcd chart). Format: "http://hostname:port" or "https://hostname:port"
  etcdAddr: ""

  # -- URL for the Model Express server if not deployed by this helm chart. This is ignored if Model Express server is installed by this helm chart (global.model-express.enabled is true).
  modelExpressURL: ""

  # -- Namespace access controls for the operator
  namespaceRestriction:
    # -- Whether to restrict operator to specific namespaces
    enabled: true
    # -- Target namespace for operator deployment (leave empty for current namespace)
    targetNamespace:

  # Controller manager configuration
  controllerManager:
    # -- Node tolerations for controller manager pods
    tolerations: []

    manager:
      # Container image configuration for the operator manager
      image:
        # -- Official NVIDIA Dynamo operator image repository
        repository: "nvcr.io/nvidia/ai-dynamo/kubernetes-operator"
        # -- Image tag (leave empty to use chart default)
        tag: ""
        # -- Image pull policy - when to pull the image
        pullPolicy: IfNotPresent

      # Command line arguments for the operator manager
      args:
        # -- Health probe endpoint for Kubernetes health checks
        - --health-probe-bind-address=:8081
        # -- Metrics endpoint for Prometheus scraping (localhost only for security)
        - --metrics-bind-address=127.0.0.1:8080

  # -- Secrets for pulling private container images
  imagePullSecrets: []

  # Core Dynamo platform configuration
  dynamo:
    # -- How long to wait before forcefully terminating Grove instances
    groveTerminationDelay: 15m

    # Internal utility images used by the platform
    internalImages:
      # -- Debugger image for troubleshooting deployments
      debugger: python:3.12-slim

    # -- Whether to enable restricted security contexts for enhanced security
    enableRestrictedSecurityContext: false

    # Docker registry configuration for private repositories
    dockerRegistry:
      # -- Whether to use Kubernetes secrets for registry authentication
      useKubernetesSecret: false
      # -- Docker registry server URL
      server:
      # -- Registry username
      username:
      # -- Registry password (consider using existingSecretName instead)
      password:
      # -- Name of existing Kubernetes secret containing registry credentials
      existingSecretName:
      # -- Whether the registry uses HTTPS
      secure: true

    # Ingress configuration for external access
    ingress:
      # -- Whether to create ingress resources
      enabled: false
      # -- Ingress class name (e.g., "nginx", "traefik")
      className:
      # -- Secret name containing TLS certificates
      tlsSecretName: my-tls-secret

    # Istio service mesh configuration
    istio:
      # -- Whether to enable Istio integration
      enabled: false
      # -- Istio gateway name for routing
      gateway:

    # -- Host suffix for generated ingress hostnames
    ingressHostSuffix: ""

    # -- Whether VirtualServices should support HTTPS routing
    virtualServiceSupportsHTTPS: false

# Grove component - distributed inference orchestration
grove:
  # -- Whether to enable Grove for multi-node inference coordination. If enabled, the Grove operator will be deployed cluster-wide.
  enabled: false

# Kai Scheduler component - advanced workload scheduling
kai-scheduler:
  # -- Whether to enable Kai Scheduler for intelligent resource allocation. If enabled, the Kai Scheduler operator will be deployed cluster-wide.
  enabled: false

# etcd configuration - distributed key-value store for operator state
# For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd
etcd:
  # -- Whether to enable etcd deployment, disable if you want to use an external etcd instance
  enabled: true

  # Persistent storage configuration for etcd data
  persistence:
    # Whether to enable persistent storage (recommended for production)
    enabled: true
    # Use the cluster default storage-class or override with a named class
    storageClass: null
    # Size of persistent volume for etcd data
    size: 1Gi

  # Pre-upgrade job configuration
  preUpgrade:
    # Whether to run pre-upgrade validation jobs
    enabled: false

  # Number of etcd replicas (1 for single-node, 3+ for HA)
  replicaCount: 1

  # Authentication and authorization settings
  # Explicitly remove authentication for simplified internal communication
  auth:
    rbac:
      # Whether to create RBAC authentication (disabled for internal use)
      create: false

  # Health check configuration
  readinessProbe:
    # Whether to enable readiness probes (disabled to reduce startup complexity)
    enabled: false

  livenessProbe:
    # Whether to enable liveness probes (disabled to reduce startup complexity)
    enabled: false

  # Node tolerations for etcd pods (allows scheduling on specific nodes)
  tolerations: []

# NATS configuration - messaging system for operator communication
# For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats
nats:
  # -- Whether to enable NATS deployment, disable if you want to use an external NATS instance
  enabled: true

  # TLS Certificate Authority configuration for secure communication
  # Reference a common CA Certificate or Bundle in all nats config `tls` blocks and nats-box contexts
  # Note: `tls.verify` still must be set in the appropriate nats config `tls` blocks to require mTLS
  tlsCA:
    # Whether to enable TLS CA configuration
    enabled: false

  # Core NATS server configuration
  config:
    # NATS clustering for high availability (multiple NATS servers)
    cluster:
      # Whether to enable NATS clustering (disabled for single-node setups)
      enabled: false

    # JetStream - persistent messaging and streaming capabilities
    jetstream:
      # Whether to enable JetStream (recommended for persistent messaging)
      enabled: true

      # File-based storage for JetStream streams and consumers
      fileStore:
        # Whether to enable file storage (persistent across restarts)
        enabled: true
        # Directory path for JetStream file storage
        dir: /data

        ############################################################
        # Persistent Volume Claim for JetStream file storage
        ############################################################
        pvc:
          # Whether to create a PVC for JetStream storage
          enabled: true
          # Size of the persistent volume for JetStream data
          size: 10Gi
          # Storage class name (leave empty for default)
          storageClassName:

          # Advanced PVC configuration (merge additional fields)
          # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#persistentvolumeclaim-v1-core
          merge: {}
          patch: []
          # PVC name (defaults to "{{ include "nats.fullname" $ }}-js")
          name:

        # Maximum size for JetStream file storage (defaults to PVC size)
        maxSize:

      # Memory-based storage for JetStream (non-persistent)
      memoryStore:
        # Whether to enable memory storage (faster but not persistent)
        enabled: false

      # Advanced JetStream configuration
      # For options see: https://docs.nats.io/running-a-nats-service/configuration#jetstream
      merge: {}
      patch: []

    # Core NATS server settings
    nats:
      # Port for NATS client connections
      port: 4222

      # TLS configuration for encrypted connections
      tls:
        # Whether to enable TLS encryption
        enabled: false
        # Advanced TLS configuration
        # For options see: https://docs.nats.io/running-a-nats-service/configuration/securing_nats/tls
        merge: {}
        patch: []

    # Leaf nodes for creating NATS topologies and remote connections
    leafnodes:
      # Whether to enable leaf node connections
      enabled: false

    # WebSocket support for browser-based NATS clients
    websocket:
      # Whether to enable WebSocket protocol support
      enabled: false

    # MQTT protocol bridge for IoT device connectivity
    mqtt:
      # Whether to enable MQTT protocol support
      enabled: false

    # Gateway connections for multi-cluster NATS deployments
    gateway:
      # Whether to enable gateway connections
      enabled: false

    # HTTP monitoring endpoint for NATS server metrics
    monitor:
      # Whether to enable HTTP monitoring interface
      enabled: true
      # Port for monitoring HTTP endpoint
      port: 8222

      # TLS configuration for monitoring endpoint
      tls:
        # Whether to enable HTTPS for monitoring (requires config.nats.tls enabled)
        # When enabled, monitoring port will use HTTPS with the options from config.nats.tls
        enabled: false

    # Go pprof profiling endpoint for performance debugging
    profiling:
      # Whether to enable profiling endpoint (for debugging only)
      enabled: false
      # Port for profiling endpoint
      port: 65432

    # Account resolver for multi-tenant NATS deployments
    resolver:
      # Whether to enable account resolution (for advanced multi-tenancy)
      enabled: false

    # Server naming configuration
    # Adds a prefix to the server name, which defaults to the pod name
    # Helpful for ensuring server name is unique in a super cluster
    serverNamePrefix: ""

    # Advanced NATS configuration merging and patching
    # For complete options see: https://docs.nats.io/running-a-nats-service/configuration
    # Special rules apply:
    #  1. strings that start with << and end with >> will be unquoted
    #     use this for variables and numbers with units
    #  2. keys ending in $include will be switched to include directives
    #     keys are sorted alphabetically, use prefix before $includes to control includes ordering
    #     paths should be relative to /etc/nats-config/nats.conf
    # Example:
    #   merge:
    #     $include: ./my-config.conf
    #     zzz$include: ./my-config-last.conf
    #     server_name: nats
    #     authorization:
    #       token: << $TOKEN >>
    #     jetstream:
    #       max_memory_store: << 1GB >>
    merge: {}
    patch: []

  ############################################################
  # NATS container configuration in StatefulSet
  ############################################################
  container:
    # NATS server container image configuration
    image:
      # Official NATS server repository
      repository: nats
      # NATS server version (Alpine-based for smaller size)
      tag: 2.10.21-alpine
      # Image pull policy (leave empty for chart default)
      pullPolicy:
      # Custom registry URL (leave empty for Docker Hub)
      registry:

    # Container port configuration
    # Note: Ports must also be enabled in the config section above
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#containerport-v1-core
    ports:
      # Main NATS client connection port
      nats: {}
      # Leaf node connection port
      leafnodes: {}
      # WebSocket connection port
      websocket: {}
      # MQTT protocol port
      mqtt: {}
      # Cluster communication port
      cluster: {}
      # Gateway connection port
      gateway: {}
      # HTTP monitoring port
      monitor: {}
      # Go profiling port
      profiling: {}

    # Environment variables for the NATS container
    # Map with key as env var name, value can be string or map
    # Example:
    #   env:
    #     GOMEMLIMIT: 7GiB
    #     TOKEN:
    #       valueFrom:
    #         secretKeyRef:
    #           name: nats-auth
    #           key: token
    env: {}

    # Advanced container configuration merging and patching
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
  # Configuration reloader container for hot config updates
  ############################################################
  reloader:
    # Whether to enable the config reloader sidecar container
    enabled: true

    # Config reloader container image
    image:
      # Official NATS config reloader repository
      repository: natsio/nats-server-config-reloader
      # Config reloader version
      tag: 0.16.0
      # Image pull policy (leave empty for chart default)
      pullPolicy:
      # Custom registry URL (leave empty for Docker Hub)
      registry:

    # Environment variables for the reloader container
    env: {}

    # Volume mount prefixes from NATS container to share with reloader
    # All NATS container volume mounts with these prefixes will be mounted into the reloader
    natsVolumeMountPrefixes:
      - /etc/

    # Advanced reloader container configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
  # Prometheus metrics exporter container (optional)
  ############################################################
  # Note: config.monitor must be enabled for this to work
  promExporter:
    # Whether to enable Prometheus metrics exporter sidecar
    enabled: false

  ############################################################
  # Kubernetes Service for NATS access
  ############################################################
  service:
    # Whether to create a Kubernetes Service for NATS
    enabled: true

    # Service port configuration
    # Additional boolean field 'enabled' controls whether port is exposed in the service
    # Note: Ports must also be enabled in the config section above
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#serviceport-v1-core
    ports:
      # Main NATS client connection port
      nats:
        enabled: true
      # Leaf node connection port
      leafnodes:
        enabled: true
      # WebSocket connection port
      websocket:
        enabled: true
      # MQTT protocol port
      mqtt:
        enabled: true
      # Cluster communication port (typically internal only)
      cluster:
        enabled: false
      # Gateway connection port (typically internal only)
      gateway:
        enabled: false
      # HTTP monitoring port (typically internal only)
      monitor:
        enabled: false
      # Go profiling port (typically internal only)
      profiling:
        enabled: false

    # Advanced service configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
    # Service name (defaults to "{{ include "nats.fullname" $ }}")
    name:

  ############################################################
  # Advanced NATS Kubernetes resource configuration
  ############################################################

  # StatefulSet configuration for NATS server persistence
  statefulSet:
    # Advanced StatefulSet configuration merging and patching
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#statefulset-v1-apps
    merge: {}
    patch: []
    # StatefulSet name (defaults to "{{ include "nats.fullname" $ }}")
    name:

  # Pod template configuration for NATS StatefulSet
  podTemplate:
    # Whether to add a hash of the ConfigMap as a pod annotation
    # This will cause the StatefulSet to roll when the ConfigMap is updated
    configChecksumAnnotation: true

    # Pod topology spread constraints for better distribution across nodes
    # Map of topologyKey: topologySpreadConstraint
    # labelSelector will be added automatically to match StatefulSet pods
    # Example:
    #   topologySpreadConstraints:
    #     kubernetes.io/hostname:
    #       maxSkew: 1
    topologySpreadConstraints: {}

    # Advanced pod template configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#pod-v1-core
    merge:
      spec:
        # Node tolerations for NATS pods (allows scheduling on specific nodes)
        tolerations: []
    patch: []

  # Headless service for StatefulSet pod discovery
  headlessService:
    # Advanced headless service configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
    # Headless service name (defaults to "{{ include "nats.fullname" $ }}-headless")
    name:

  # ConfigMap for NATS server configuration
  configMap:
    # Advanced ConfigMap configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#configmap-v1-core
    merge: {}
    patch: []
    # ConfigMap name (defaults to "{{ include "nats.fullname" $ }}-config")
    name:

  # Pod Disruption Budget for controlled rolling updates
  podDisruptionBudget:
    # Whether to create a PodDisruptionBudget (recommended for production)
    enabled: true

  # Service Account for NATS server pods
  serviceAccount:
    # Whether to create and use a dedicated service account
    enabled: false

  ############################################################
  # NATS Box - CLI tools and debugging container
  # NATS Box provides CLI tools for interacting with NATS server
  ############################################################
  natsBox:
    # Whether to deploy NATS Box for CLI access and debugging
    enabled: true

    ############################################################
    # NATS client contexts for authentication and connection
    ############################################################
    contexts:
      # Default context configuration
      default:
        # Credentials-based authentication
        creds:
          # Inline credentials file contents (base64 encoded)
          contents:
          # Name of existing secret containing credentials file
          secretName:
          # Directory to mount credentials (defaults to /etc/nats-creds/<context-name>)
          dir:
          # Key name in secret for credentials file
          key: nats.creds

        # NKey-based authentication (public/private key pairs)
        nkey:
          # Inline NKey file contents (base64 encoded)
          contents:
          # Name of existing secret containing NKey file
          secretName:
          # Directory to mount NKey (defaults to /etc/nats-nkeys/<context-name>)
          dir:
          # Key name in secret for NKey file
          key: nats.nk

        # TLS client certificate authentication
        tls:
          # Name of existing secret containing TLS client certificates
          secretName:
          # Directory to mount certificates (defaults to /etc/nats-certs/<context-name>)
          dir:
          # Certificate file name in secret
          cert: tls.crt
          # Private key file name in secret
          key: tls.key

        # Advanced context configuration
        # For options see: https://docs.nats.io/using-nats/nats-tools/nats_cli#nats-contexts
        merge: {}
        patch: []

    # Name of context to select by default for NATS CLI operations
    defaultContextName: default

    ############################################################
    # NATS Box container configuration
    ############################################################
    container:
      # NATS Box container image
      image:
        # Official NATS Box repository with CLI tools
        repository: natsio/nats-box
        # NATS Box version
        tag: 0.14.5
        # Image pull policy (leave empty for chart default)
        pullPolicy:
        # Custom registry URL (leave empty for Docker Hub)
        registry:

      # Environment variables for NATS Box container
      env: {}

      # Advanced container configuration
      # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
      merge: {}
      patch: []

    # Service Account for NATS Box deployment
    serviceAccount:
      # Whether to create and use a dedicated service account for NATS Box
      enabled: false

    # Pod template configuration for NATS Box deployment
    podTemplate:
      merge:
        spec:
          # Node tolerations for NATS Box pods
          tolerations: []
      patch: []