# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Used to generate top-level secrets (overridden by custom-values.yaml)
global:
  etcd:
    # -- Whether this chart should install the bundled etcd subchart.
    # When true, deploys etcd and auto-configures the operator with its address.
    # When false, etcd is not deployed. Use dynamo-operator.etcdAddr to point at an external instance if you are bringing your own etcd.
    install: false

  kai-scheduler:
    # -- Whether this chart should install the bundled kai-scheduler subchart.
    # When true, deploys kai-scheduler and its CRDs. Integration is automatically enabled.
    # NOTE: For production environments, it is recommended to install kai-scheduler separately.
    install: false
    # -- Whether to enable Kai Scheduler integration (queue creation, schedulerName injection).
    # Set to true when kai-scheduler is available in the cluster (installed externally).
    # Automatically enabled when install=true. The operator uses this to decide whether to
    # inject schedulerName and queue labels into pod templates.
    enabled: false

  grove:
    # -- Whether this chart should install the bundled Grove subchart.
    # When true, deploys the Grove operator cluster-wide. Integration is automatically enabled.
    # NOTE: For production environments, it is recommended to install Grove separately.
    install: false
    # -- Whether to enable Grove integration (multinode orchestration via PodCliqueSets).
    # Set to true when Grove is available in the cluster (installed externally).
    # Automatically true when install=true. The operator uses this to decide whether to
    # create PodCliqueSets for multinode deployments.
    enabled: false

# Subcharts configuration

# Dynamo operator configuration
dynamo-operator:
  # -- Whether to enable the Dynamo Kubernetes operator deployment
  enabled: true

  # -- Whether to manage CRDs via a pre-install/pre-upgrade hook Job.
  # The Job runs the operator image with the crd-apply tool to apply CRDs via server-side apply.
  upgradeCRD: true

  # -- NATS server address for operator communication (leave empty to use the bundled NATS chart). Format: "nats://hostname:port"
  natsAddr: ""

  # -- etcd server address for an external etcd instance. Only needed when using external etcd without the bundled subchart. Format: "http://hostname:port" or "https://hostname:port"
  etcdAddr: ""

  nats:
    # -- Whether the NATS is enabled
    enabled: true

  # -- URL for the Model Express server if not deployed by this helm chart. This is ignored if Model Express server is installed by this helm chart (global.model-express.enabled is true).
  modelExpressURL: ""

  # -- Namespace access controls for the operator
  namespaceRestriction:
    # -- Whether to restrict operator to specific namespaces. By default, the operator will run with cluster-wide permissions. Only 1 instance of the operator should be deployed in the cluster. If you want to deploy multiple operator instances, you can set this to true and specify the target namespace (by default, the target namespace is the helm release namespace).
    enabled: false
    # -- Target namespace for operator deployment (leave empty for current namespace)
    targetNamespace:
    # Namespace scope marker lease configuration (used to prevent conflicts when running both cluster-wide and namespace-restricted operators)
    lease:
      # Duration before the namespace scope marker lease expires if not renewed (namespace-restricted mode only). When a namespace-restricted operator is running, it creates a lease in its namespace. The cluster-wide operator detects this lease and excludes that namespace from processing. If the namespace operator stops renewing the lease (e.g., crashes), the lease expires and the cluster-wide operator automatically resumes processing that namespace.
      duration: 30s
      # Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
      renewInterval: 10s

  # -- GPU discovery configuration (only applies when namespaceRestriction.enabled=true)
  gpuDiscovery:
    # -- Whether to provision a ClusterRole for the namespace-scoped operator to read GPU node labels.
    # When true (default), Helm creates a ClusterRole/ClusterRoleBinding granting node read access.
    # Set to false if your installer lacks ClusterRole creation permissions.
    enabled: true

  # -- The Dynamo discovery backend to use. Default is "kubernetes" for Kubernetes API service discovery. Set to "etcd" to use ETCD for discovery.
  discoveryBackend: "kubernetes"

  # Controller manager configuration
  controllerManager:
    # -- Node tolerations for controller manager pods
    tolerations: []

    # -- Affinity for controller manager pods
    affinity: {}

    # Leader election configuration for cluster-wide coordination
    leaderElection:
      # -- Leader election ID for cluster-wide coordination. WARNING: All cluster-wide operators must use the SAME ID to prevent split-brain. Different IDs would allow multiple leaders simultaneously.
      id: ""  # If empty, defaults to: dynamo.nvidia.com (shared across all cluster-wide operators)
      # -- Namespace for leader election leases (only used in cluster-wide mode). If empty, defaults to kube-system for cluster-wide coordination. All cluster-wide operators should use the SAME namespace for proper leader election.
      namespace: ""

    manager:
      # Container image configuration for the operator manager
      image:
        # -- Official NVIDIA Dynamo operator image repository
        repository: "nvcr.io/nvidia/ai-dynamo/kubernetes-operator"
        # -- Image tag (leave empty to use chart default)
        tag: ""
        # -- Image pull policy - when to pull the image
        pullPolicy: IfNotPresent

      # Command line arguments for the operator manager
      args:
        # -- Health probe endpoint for Kubernetes health checks
        - --health-probe-bind-address=:8081
        # -- Metrics endpoint for Prometheus scraping (localhost only for security)
        - --metrics-bind-address=127.0.0.1:8080

  # -- Secrets for pulling private container images
  imagePullSecrets: []

  # Core Dynamo platform configuration
  dynamo:
    # -- How long to wait before forcefully terminating Grove instances
    groveTerminationDelay: 4h

    # Docker registry configuration for private repositories
    dockerRegistry:
      # -- Whether to use Kubernetes secrets for registry authentication
      useKubernetesSecret: false
      # -- Docker registry server URL
      server:
      # -- Registry username
      username:
      # -- Registry password (consider using existingSecretName instead)
      password:
      # -- Name of existing Kubernetes secret containing registry credentials
      existingSecretName:
      # -- Whether the registry uses HTTPS
      secure: true

    # Ingress configuration for external access
    ingress:
      # -- Whether to create ingress resources
      enabled: false
      # -- Ingress class name (e.g., "nginx", "traefik")
      className:
      # -- Secret name containing TLS certificates
      tlsSecretName: my-tls-secret

    # Istio service mesh configuration
    istio:
      # -- Whether to enable Istio integration
      enabled: false
      # -- Istio gateway name for routing
      gateway:

    # -- Host suffix for generated ingress hostnames
    ingressHostSuffix: ""

    # -- Whether VirtualServices should support HTTPS routing
    virtualServiceSupportsHTTPS: false

    # Metrics configuration
    metrics:
      # -- Endpoint that services can use to retrieve metrics. If set, dynamo operator will automatically inject the PROMETHEUS_ENDPOINT environment variable into services it manages. Users can override the value of the PROMETHEUS_ENDPOINT environment variable by modifying the corresponding deployment's environment variables
      prometheusEndpoint: ""

    # MPI Run configuration
    mpiRun:
      # -- Name of the secret containing the SSH key for MPI Run
      secretName: "mpi-run-ssh-secret"

  # Webhook configuration for admission control and validation
  webhook:
    # Certificate configuration for webhook TLS
    certificateSecret:
      # -- Name of the Kubernetes secret containing webhook TLS certificates. The secret must contain three keys: tls.crt (server certificate), tls.key (server private key), and ca.crt (Certificate Authority certificate).
      name: webhook-server-cert
      # -- Whether to manage the certificate secret externally. When false (default), the operator's built-in cert-controller generates and rotates certificates automatically. When true, you must create the secret manually before installing the chart.
      external: false

    # -- CA bundle (base64 encoded) for webhook validation. Only used when certificateSecret.external=true. For automatic certificate generation or cert-manager integration, leave this empty as it will be injected automatically.
    caBundle: ""

    # -- Webhook failure policy controls how Kubernetes handles requests when the webhook is unavailable. 'Fail' (recommended for production) rejects requests if the webhook cannot be reached, ensuring strict validation. 'Ignore' allows requests through if the webhook is unavailable, providing availability over validation guarantees.
    failurePolicy: Fail

    # -- Timeout in seconds for webhook validation calls. If the webhook doesn't respond within this time, the request will be handled according to the failurePolicy.
    timeoutSeconds: 10

    # Namespace selector for webhook scope control
    # -- Custom namespace selector for webhook validation. Use this to include or exclude specific namespaces from webhook validation. For CLUSTER-WIDE operators, you can exclude namespaces managed by namespace-restricted operators by using: matchExpressions: [{ key: "dynamo-operator", operator: "NotIn", values: ["namespace-restricted"] }]. For NAMESPACE-RESTRICTED operators, leave empty as it will be auto-configured to match only the operator's namespace.
    namespaceSelector: {}

    # cert-manager integration for automated certificate lifecycle management
    certManager:
      # -- Whether to use cert-manager for automatic certificate management. Requires cert-manager to be installed in the cluster. When enabled, cert-manager will provision and rotate certificates instead of the operator's built-in cert-controller.
      enabled: false

      # Certificate configuration for cert-manager
      certificate:
        # -- Certificate duration for webhook certificates managed by cert-manager (e.g., "8760h" for 1 year). cert-manager will automatically renew the certificate before it expires.
        duration: "8760h"
        # -- Time before certificate expiration to trigger renewal (e.g., "360h" for 15 days). cert-manager will attempt to renew the certificate when this threshold is reached.
        renewBefore: "360h"

        # Root CA configuration for cert-manager
        # NOTE(review): rootCA is placed under certManager.certificate here — confirm against the operator chart templates.
        rootCA:
          # -- Duration for the root CA certificate (e.g., "87600h" for 10 years). The root CA typically has a much longer lifetime than the leaf certificates it signs.
          duration: "87600h"
          # -- Time before root CA expiration to trigger renewal (e.g., "720h" for 30 days). Renewing a CA can be disruptive as all signed certificates must be reissued.
          renewBefore: "720h"

  # Checkpoint configuration for fast pod restore using CRIU/cuda-checkpoint
  # NOTE: The checkpoint infrastructure (PVC + DaemonSet) must be installed separately
  # using the chrek Helm chart in each namespace where checkpointing is needed.
  checkpoint:
    # -- Whether to enable checkpoint/restore functionality
    enabled: false

    # -- Path written by worker when model is loaded and ready for checkpointing
    readyForCheckpointFilePath: "/tmp/ready-for-checkpoint"

    # Storage configuration
    # These settings tell the operator where to find checkpoint storage
    # Must match the configuration in the chrek chart
    storage:
      # -- Storage backend type: pvc, s3, or oci
      type: pvc

      # PVC storage configuration (used when type=pvc)
      pvc:
        # -- Name of the PVC created by the chrek chart
        pvcName: "chrek-pvc"
        # -- Base path within the PVC for storing checkpoints
        basePath: "/checkpoints"

      # S3 storage configuration (used when type=s3)
      s3:
        # -- S3 URI in format: s3://[endpoint/]bucket/prefix
        uri: ""
        # -- Reference to a secret containing AWS credentials
        credentialsSecretRef: ""

      # OCI registry storage configuration (used when type=oci)
      oci:
        # -- OCI URI in format: oci://registry/repository
        uri: ""
        # -- Reference to a docker config secret for registry authentication
        credentialsSecretRef: ""

# Grove component - distributed inference orchestration
# Installation is controlled by global.grove.install above.
grove:
  # -- Node tolerations for Grove pods
  tolerations: []

  # -- Affinity for Grove pods
  affinity: {}

# Kai Scheduler component - advanced workload scheduling
# Installation is controlled by global.kai-scheduler.install above.
# Integration is controlled by global.kai-scheduler.enabled above.
kai-scheduler:
  # Global configuration for kai-scheduler (applies to all components including crd-upgrader)
  global:
    # -- Node tolerations for kai-scheduler pods
    tolerations: []

    # -- Affinity for kai-scheduler pods
    affinity: {}

# etcd configuration - distributed key-value store
# Installation is controlled by global.etcd.install above.
etcd:
  image:
    # -- following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog, we need to use the legacy repository until we migrate to the new "secure" repository
    repository: bitnamilegacy/etcd
    tag: 3.5.18-debian-12-r5

  # Persistent storage configuration for etcd data
  persistence:
    # Whether to enable persistent storage (recommended for production)
    enabled: true
    # Use the cluster default storage-class or override with a named class
    storageClass: null
    # Size of persistent volume for etcd data
    size: 1Gi

  # Pre-upgrade job configuration
  preUpgradeJob:
    # Whether to run pre-upgrade validation jobs
    enabled: false

  # Number of etcd replicas (1 for single-node, 3+ for HA)
  replicaCount: 1

  # Authentication and authorization settings
  # Explicitly remove authentication for simplified internal communication
  auth:
    rbac:
      # Whether to create RBAC authentication (disabled for internal use)
      create: false

  # Health check configuration
  readinessProbe:
    # Whether to enable readiness probes (disabled to reduce startup complexity)
    enabled: false

  livenessProbe:
    # Whether to enable liveness probes (disabled to reduce startup complexity)
    enabled: false

  # Pod Disruption Budget configuration
  # Should be enabled for HA deployments with 3+ replicas
  pdb:
    # Whether to create a PodDisruptionBudget (disabled for single-node deployments)
    create: false

  # Node tolerations for etcd pods (allows scheduling on specific nodes)
  tolerations: []

  # Affinity for etcd pods
  affinity: {}

# NATS configuration - messaging system for operator communication
nats:
  # -- Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats , all nats settings should be prefixed with "nats."
  enabled: true

  # TLS Certificate Authority configuration for secure communication
  # Reference a common CA Certificate or Bundle in all nats config `tls` blocks and nats-box contexts
  # Note: `tls.verify` still must be set in the appropriate nats config `tls` blocks to require mTLS
  tlsCA:
    # Whether to enable TLS CA configuration
    enabled: false

  # Core NATS server configuration
  config:
    # NATS clustering for high availability (multiple NATS servers)
    cluster:
      # Whether to enable NATS clustering (disabled for single-node setups)
      enabled: false

    # JetStream - persistent messaging and streaming capabilities
    jetstream:
      # Whether to enable JetStream (recommended for persistent messaging)
      enabled: true

      # File-based storage for JetStream streams and consumers
      fileStore:
        # Whether to enable file storage (persistent across restarts)
        enabled: true
        # Directory path for JetStream file storage
        dir: /data

        ############################################################
        # Persistent Volume Claim for JetStream file storage
        ############################################################
        pvc:
          # Whether to create a PVC for JetStream storage
          enabled: true
          # Size of the persistent volume for JetStream data
          size: 10Gi
          # Storage class name (leave empty for default)
          storageClassName:

          # Advanced PVC configuration (merge additional fields)
          # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#persistentvolumeclaim-v1-core
          merge: {}
          patch: []
          # PVC name (defaults to "{{ include "nats.fullname" $ }}-js")
          name:

        # Maximum size for JetStream file storage (defaults to PVC size)
        maxSize:

      # Memory-based storage for JetStream (non-persistent)
      memoryStore:
        # Whether to enable memory storage (faster but not persistent)
        enabled: false

      # Advanced JetStream configuration
      # For options see: https://docs.nats.io/running-a-nats-service/configuration#jetstream
      merge: {}
      patch: []

    # Core NATS server settings
    nats:
      # Port for NATS client connections
      port: 4222

      # TLS configuration for encrypted connections
      tls:
        # Whether to enable TLS encryption
        enabled: false

        # Advanced TLS configuration
        # For options see: https://docs.nats.io/running-a-nats-service/configuration/securing_nats/tls
        merge: {}
        patch: []

    # Leaf nodes for creating NATS topologies and remote connections
    leafnodes:
      # Whether to enable leaf node connections
      enabled: false

    # WebSocket support for browser-based NATS clients
    websocket:
      # Whether to enable WebSocket protocol support
      enabled: false

    # MQTT protocol bridge for IoT device connectivity
    mqtt:
      # Whether to enable MQTT protocol support
      enabled: false

    # Gateway connections for multi-cluster NATS deployments
    gateway:
      # Whether to enable gateway connections
      enabled: false

    # HTTP monitoring endpoint for NATS server metrics
    monitor:
      # Whether to enable HTTP monitoring interface
      enabled: true
      # Port for monitoring HTTP endpoint
      port: 8222

      # TLS configuration for monitoring endpoint
      tls:
        # Whether to enable HTTPS for monitoring (requires config.nats.tls enabled)
        # When enabled, monitoring port will use HTTPS with the options from config.nats.tls
        enabled: false

    # Go pprof profiling endpoint for performance debugging
    profiling:
      # Whether to enable profiling endpoint (for debugging only)
      enabled: false
      # Port for profiling endpoint
      port: 65432

    # Account resolver for multi-tenant NATS deployments
    resolver:
      # Whether to enable account resolution (for advanced multi-tenancy)
      enabled: false

    # Server naming configuration
    # Adds a prefix to the server name, which defaults to the pod name
    # Helpful for ensuring server name is unique in a super cluster
    serverNamePrefix: ""

    # Advanced NATS configuration merging and patching
    # For complete options see: https://docs.nats.io/running-a-nats-service/configuration
    # Special rules apply:
    #   1. strings that start with << and end with >> will be unquoted
    #      use this for variables and numbers with units
    #   2. keys ending in $include will be switched to include directives
    #      keys are sorted alphabetically, use prefix before $includes to control includes ordering
    #      paths should be relative to /etc/nats-config/nats.conf
    # Example:
    #   merge:
    #     $include: ./my-config.conf
    #     zzz$include: ./my-config-last.conf
    #     server_name: nats
    #     authorization:
    #       token: << $TOKEN >>
    #     jetstream:
    #       max_memory_store: << 1GB >>
    merge:
      # 15MB to accommodate prompt embeddings: 10MB decoded → ~13.3MB base64-encoded + metadata
      # Also allows larger context: 256K tokens (int32 - 4 bytes each) = 1MB
      max_payload: 15728640
    patch: []

  ############################################################
  # NATS container configuration in StatefulSet
  ############################################################
  container:
    # NATS server container image configuration
    image:
      # Official NATS server repository
      repository: nats
      # NATS server version (Alpine-based for smaller size)
      tag: 2.10.21-alpine
      # Image pull policy (leave empty for chart default)
      pullPolicy:
      # Custom registry URL (leave empty for Docker Hub)
      registry:

    # Container port configuration
    # Note: Ports must also be enabled in the config section above
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#containerport-v1-core
    ports:
      # Main NATS client connection port
      nats: {}
      # Leaf node connection port
      leafnodes: {}
      # WebSocket connection port
      websocket: {}
      # MQTT protocol port
      mqtt: {}
      # Cluster communication port
      cluster: {}
      # Gateway connection port
      gateway: {}
      # HTTP monitoring port
      monitor: {}
      # Go profiling port
      profiling: {}

    # Environment variables for the NATS container
    # Map with key as env var name, value can be string or map
    # Example:
    #   env:
    #     GOMEMLIMIT: 7GiB
    #     TOKEN:
    #       valueFrom:
    #         secretKeyRef:
    #           name: nats-auth
    #           key: token
    env: {}

    # Advanced container configuration merging and patching
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
  # Configuration reloader container for hot config updates
  ############################################################
  reloader:
    # Whether to enable the config reloader sidecar container
    enabled: true

    # Config reloader container image
    image:
      # Official NATS config reloader repository
      repository: natsio/nats-server-config-reloader
      # Config reloader version
      tag: 0.16.0
      # Image pull policy (leave empty for chart default)
      pullPolicy:
      # Custom registry URL (leave empty for Docker Hub)
      registry:

    # Environment variables for the reloader container
    env: {}

    # Volume mount prefixes from NATS container to share with reloader
    # All NATS container volume mounts with these prefixes will be mounted into the reloader
    natsVolumeMountPrefixes:
      - /etc/

    # Advanced reloader container configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
    merge: {}
    patch: []

  ############################################################
  # Prometheus metrics exporter container (optional)
  ############################################################
  # Note: config.monitor must be enabled for this to work
  promExporter:
    # Whether to enable Prometheus metrics exporter sidecar
    enabled: false

  ############################################################
  # Kubernetes Service for NATS access
  ############################################################
  service:
    # Whether to create a Kubernetes Service for NATS
    enabled: true

    # Service port configuration
    # Additional boolean field 'enabled' controls whether port is exposed in the service
    # Note: Ports must also be enabled in the config section above
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#serviceport-v1-core
    ports:
      # Main NATS client connection port
      nats:
        enabled: true
      # Leaf node connection port
      leafnodes:
        enabled: true
      # WebSocket connection port
      websocket:
        enabled: true
      # MQTT protocol port
      mqtt:
        enabled: true
      # Cluster communication port (typically internal only)
      cluster:
        enabled: false
      # Gateway connection port (typically internal only)
      gateway:
        enabled: false
      # HTTP monitoring port (typically internal only)
      monitor:
        enabled: false
      # Go profiling port (typically internal only)
      profiling:
        enabled: false

    # Advanced service configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
    # Service name (defaults to "{{ include "nats.fullname" $ }}")
    name:

  ############################################################
  # Advanced NATS Kubernetes resource configuration
  ############################################################

  # StatefulSet configuration for NATS server persistence
  statefulSet:
    # Advanced StatefulSet configuration merging and patching
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#statefulset-v1-apps
    merge: {}
    patch: []
    # StatefulSet name (defaults to "{{ include "nats.fullname" $ }}")
    name:

  # Pod template configuration for NATS StatefulSet
  podTemplate:
    # Whether to add a hash of the ConfigMap as a pod annotation
    # This will cause the StatefulSet to roll when the ConfigMap is updated
    configChecksumAnnotation: true

    # Pod topology spread constraints for better distribution across nodes
    # Map of topologyKey: topologySpreadConstraint
    # labelSelector will be added automatically to match StatefulSet pods
    # Example:
    #   topologySpreadConstraints:
    #     kubernetes.io/hostname:
    #       maxSkew: 1
    topologySpreadConstraints: {}

    # Advanced pod template configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#pod-v1-core
    merge:
      spec:
        # Node tolerations for NATS pods (allows scheduling on specific nodes)
        tolerations: []
        # Affinity for NATS pods
        affinity: {}
    patch: []

  # Headless service for StatefulSet pod discovery
  headlessService:
    # Advanced headless service configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#service-v1-core
    merge: {}
    patch: []
    # Headless service name (defaults to "{{ include "nats.fullname" $ }}-headless")
    name:

  # ConfigMap for NATS server configuration
  configMap:
    # Advanced ConfigMap configuration
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#configmap-v1-core
    merge: {}
    patch: []
    # ConfigMap name (defaults to "{{ include "nats.fullname" $ }}-config")
    name:

  # Pod Disruption Budget for controlled rolling updates
  podDisruptionBudget:
    # Whether to create a PodDisruptionBudget (recommended for production)
    enabled: true

  # Service Account for NATS server pods
  serviceAccount:
    # Whether to create and use a dedicated service account
    enabled: false

  ############################################################
  # NATS Box - CLI tools and debugging container
  # NATS Box provides CLI tools for interacting with NATS server
  ############################################################
  natsBox:
    # Whether to deploy NATS Box for CLI access and debugging
    enabled: false

    ############################################################
    # NATS client contexts for authentication and connection
    ############################################################
    contexts:
      # Default context configuration
      default:
        # Credentials-based authentication
        creds:
          # Inline credentials file contents (base64 encoded)
          contents:
          # Name of existing secret containing credentials file
          secretName:
          # Directory to mount credentials (defaults to /etc/nats-creds/)
          dir:
          # Key name in secret for credentials file
          key: nats.creds

        # NKey-based authentication (public/private key pairs)
        nkey:
          # Inline NKey file contents (base64 encoded)
          contents:
          # Name of existing secret containing NKey file
          secretName:
          # Directory to mount NKey (defaults to /etc/nats-nkeys/)
          dir:
          # Key name in secret for NKey file
          key: nats.nk

        # TLS client certificate authentication
        tls:
          # Name of existing secret containing TLS client certificates
          secretName:
          # Directory to mount certificates (defaults to /etc/nats-certs/)
          dir:
          # Certificate file name in secret
          cert: tls.crt
          # Private key file name in secret
          key: tls.key

        # Advanced context configuration
        # For options see: https://docs.nats.io/using-nats/nats-tools/nats_cli#nats-contexts
        merge: {}
        patch: []

    # Name of context to select by default for NATS CLI operations
    defaultContextName: default

    ############################################################
    # NATS Box container configuration
    ############################################################
    container:
      # NATS Box container image
      image:
        # Official NATS Box repository with CLI tools
        repository: natsio/nats-box
        # NATS Box version
        tag: 0.14.5
        # Image pull policy (leave empty for chart default)
        pullPolicy:
        # Custom registry URL (leave empty for Docker Hub)
        registry:

      # Environment variables for NATS Box container
      env: {}

      # Advanced container configuration
      # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#container-v1-core
      merge: {}
      patch: []

    # Service Account for NATS Box deployment
    serviceAccount:
      # Whether to create and use a dedicated service account for NATS Box
      enabled: false

    # Pod template configuration for NATS Box deployment
    podTemplate:
      merge:
        spec:
          # Node tolerations for NATS Box pods
          tolerations: []
          # Affinity for NATS Box pods
          affinity: {}
      patch: []