Unverified commit 36ce39bd, authored by Rohan Varma and committed by GitHub
Browse files

fix: Add tolerations and affinity support for all platform components (#5561)

parent 2ef408ff
...@@ -149,10 +149,20 @@ The chart includes built-in validation to prevent all operator conflicts: ...@@ -149,10 +149,20 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.webhook.certManager.certificate.rootCA.duration | string | `"87600h"` | Duration for the root CA certificate (e.g., "87600h" for 10 years). The root CA typically has a much longer lifetime than the leaf certificates it signs. | | dynamo-operator.webhook.certManager.certificate.rootCA.duration | string | `"87600h"` | Duration for the root CA certificate (e.g., "87600h" for 10 years). The root CA typically has a much longer lifetime than the leaf certificates it signs. |
| dynamo-operator.webhook.certManager.certificate.rootCA.renewBefore | string | `"720h"` | Time before root CA expiration to trigger renewal (e.g., "720h" for 30 days). Renewing a CA can be disruptive as all signed certificates must be reissued. | | dynamo-operator.webhook.certManager.certificate.rootCA.renewBefore | string | `"720h"` | Time before root CA expiration to trigger renewal (e.g., "720h" for 30 days). Renewing a CA can be disruptive as all signed certificates must be reissued. |
| grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide | | grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide |
| grove.tolerations | list | `[]` | Node tolerations for Grove pods |
| grove.affinity | object | `{}` | Affinity rules for Grove pods |
| kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide | | kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide |
| kai-scheduler.global.tolerations | list | `[]` | Node tolerations for kai-scheduler pods |
| kai-scheduler.global.affinity | object | `{}` | Affinity rules for kai-scheduler pods |
| etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd , all etcd settings should be prefixed with "etcd." | | etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd , all etcd settings should be prefixed with "etcd." |
| etcd.image.repository | string | `"bitnamilegacy/etcd"` | following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog, we need to use the legacy repository until we migrate to the new "secure" repository | | etcd.image.repository | string | `"bitnamilegacy/etcd"` | following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog, we need to use the legacy repository until we migrate to the new "secure" repository |
| etcd.tolerations | list | `[]` | Node tolerations for etcd pods |
| etcd.affinity | object | `{}` | Affinity rules for etcd pods |
| nats.enabled | bool | `true` | Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats , all nats settings should be prefixed with "nats." | | nats.enabled | bool | `true` | Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats , all nats settings should be prefixed with "nats." |
| nats.podTemplate.merge.spec.tolerations | list | `[]` | Node tolerations for NATS pods |
| nats.podTemplate.merge.spec.affinity | object | `{}` | Affinity rules for NATS pods |
| nats.natsBox.podTemplate.merge.spec.tolerations | list | `[]` | Node tolerations for NATS Box pods |
| nats.natsBox.podTemplate.merge.spec.affinity | object | `{}` | Affinity rules for NATS Box pods |
### NATS Configuration ### NATS Configuration
......
...@@ -149,6 +149,14 @@ spec: ...@@ -149,6 +149,14 @@ spec:
spec: spec:
serviceAccountName: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject serviceAccountName: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject
restartPolicy: OnFailure restartPolicy: OnFailure
{{- with .Values.controllerManager.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.controllerManager.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
containers: containers:
- name: ca-injector - name: ca-injector
image: {{ .Values.webhook.certGenerator.image.repository }}:{{ .Values.webhook.certGenerator.image.tag }} image: {{ .Values.webhook.certGenerator.image.repository }}:{{ .Values.webhook.certGenerator.image.tag }}
......
...@@ -102,6 +102,14 @@ spec: ...@@ -102,6 +102,14 @@ spec:
spec: spec:
serviceAccountName: {{ include "dynamo-operator.fullname" . }}-webhook-cert-gen serviceAccountName: {{ include "dynamo-operator.fullname" . }}-webhook-cert-gen
restartPolicy: OnFailure restartPolicy: OnFailure
{{- with .Values.controllerManager.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.controllerManager.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
containers: containers:
- name: cert-generator - name: cert-generator
image: {{ .Values.webhook.certGenerator.image.repository }}:{{ .Values.webhook.certGenerator.image.tag }} image: {{ .Values.webhook.certGenerator.image.repository }}:{{ .Values.webhook.certGenerator.image.tag }}
......
...@@ -33,6 +33,7 @@ namespaceRestriction: ...@@ -33,6 +33,7 @@ namespaceRestriction:
renewInterval: 10s renewInterval: 10s
controllerManager: controllerManager:
tolerations: [] tolerations: []
affinity: {}
# Leader election configuration # Leader election configuration
leaderElection: leaderElection:
......
...@@ -214,11 +214,21 @@ dynamo-operator: ...@@ -214,11 +214,21 @@ dynamo-operator:
grove: grove:
# -- Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide # -- Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide
enabled: false enabled: false
# -- Node tolerations for Grove pods
tolerations: []
# -- Affinity rules for Grove pods
affinity: {}
# Kai Scheduler component - advanced workload scheduling # Kai Scheduler component - advanced workload scheduling
kai-scheduler: kai-scheduler:
# -- Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide # -- Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide
enabled: false enabled: false
# Global configuration for kai-scheduler (applies to all components including crd-upgrader)
global:
# -- Node tolerations for kai-scheduler pods
tolerations: []
# -- Affinity rules for kai-scheduler pods
affinity: {}
# etcd configuration - distributed key-value store for operator state # etcd configuration - distributed key-value store for operator state
etcd: etcd:
...@@ -273,6 +283,9 @@ etcd: ...@@ -273,6 +283,9 @@ etcd:
# Node tolerations for etcd pods (allows scheduling on specific nodes) # Node tolerations for etcd pods (allows scheduling on specific nodes)
tolerations: [] tolerations: []
# Affinity for etcd pods
affinity: {}
# NATS configuration - messaging system for operator communication # NATS configuration - messaging system for operator communication
nats: nats:
# -- Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats , all nats settings should be prefixed with "nats." # -- Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats , all nats settings should be prefixed with "nats."
...@@ -592,6 +605,8 @@ nats: ...@@ -592,6 +605,8 @@ nats:
spec: spec:
# Node tolerations for NATS pods (allows scheduling on specific nodes) # Node tolerations for NATS pods (allows scheduling on specific nodes)
tolerations: [] tolerations: []
# Affinity for NATS pods
affinity: {}
patch: [] patch: []
# Headless service for StatefulSet pod discovery # Headless service for StatefulSet pod discovery
...@@ -711,4 +726,6 @@ nats: ...@@ -711,4 +726,6 @@ nats:
spec: spec:
# Node tolerations for NATS Box pods # Node tolerations for NATS Box pods
tolerations: [] tolerations: []
# Affinity for NATS Box pods
affinity: {}
patch: [] patch: []
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment