Unverified commit 73b0cdb4, authored by Julien Mancuso, committed by GitHub
Browse files

fix: fix some defaults (#3417)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent d809906e
...@@ -113,7 +113,7 @@ The chart includes built-in validation to prevent all operator conflicts: ...@@ -113,7 +113,7 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.controllerManager.manager.args[0] | string | `"--health-probe-bind-address=:8081"` | Health probe endpoint for Kubernetes health checks | | dynamo-operator.controllerManager.manager.args[0] | string | `"--health-probe-bind-address=:8081"` | Health probe endpoint for Kubernetes health checks |
| dynamo-operator.controllerManager.manager.args[1] | string | `"--metrics-bind-address=127.0.0.1:8080"` | Metrics endpoint for Prometheus scraping (localhost only for security) | | dynamo-operator.controllerManager.manager.args[1] | string | `"--metrics-bind-address=127.0.0.1:8080"` | Metrics endpoint for Prometheus scraping (localhost only for security) |
| dynamo-operator.imagePullSecrets | list | `[]` | Secrets for pulling private container images | | dynamo-operator.imagePullSecrets | list | `[]` | Secrets for pulling private container images |
| dynamo-operator.dynamo.groveTerminationDelay | string | `"15m"` | How long to wait before forcefully terminating Grove instances | | dynamo-operator.dynamo.groveTerminationDelay | string | `"4h"` | How long to wait before forcefully terminating Grove instances |
| dynamo-operator.dynamo.internalImages.debugger | string | `"python:3.12-slim"` | Debugger image for troubleshooting deployments | | dynamo-operator.dynamo.internalImages.debugger | string | `"python:3.12-slim"` | Debugger image for troubleshooting deployments |
| dynamo-operator.dynamo.enableRestrictedSecurityContext | bool | `false` | Whether to enable restricted security contexts for enhanced security | | dynamo-operator.dynamo.enableRestrictedSecurityContext | bool | `false` | Whether to enable restricted security contexts for enhanced security |
| dynamo-operator.dynamo.dockerRegistry.useKubernetesSecret | bool | `false` | Whether to use Kubernetes secrets for registry authentication | | dynamo-operator.dynamo.dockerRegistry.useKubernetesSecret | bool | `false` | Whether to use Kubernetes secrets for registry authentication |
...@@ -134,9 +134,9 @@ The chart includes built-in validation to prevent all operator conflicts: ...@@ -134,9 +134,9 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.dynamo.mpiRun.sshKeygen.enabled | bool | `true` | Whether to enable SSH key generation for MPI Run | | dynamo-operator.dynamo.mpiRun.sshKeygen.enabled | bool | `true` | Whether to enable SSH key generation for MPI Run |
| grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide | | grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide |
| kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide | | kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide |
| etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance | | etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd , all etcd settings should be prefixed with "etcd." |
| etcd.image.repository | string | `"bitnamilegacy/etcd"` | following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog, we need to use the legacy repository until we migrate to the new "secure" repository | | etcd.image.repository | string | `"bitnamilegacy/etcd"` | following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog, we need to use the legacy repository until we migrate to the new "secure" repository |
| nats.enabled | bool | `true` | Whether to enable NATS deployment, disable if you want to use an external NATS instance | | nats.enabled | bool | `true` | Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats , all nats settings should be prefixed with "nats." |
### NATS Configuration ### NATS Configuration
......
...@@ -74,7 +74,7 @@ dynamo-operator: ...@@ -74,7 +74,7 @@ dynamo-operator:
# Core Dynamo platform configuration # Core Dynamo platform configuration
dynamo: dynamo:
# -- How long to wait before forcefully terminating Grove instances # -- How long to wait before forcefully terminating Grove instances
groveTerminationDelay: 15m groveTerminationDelay: 4h
# Internal utility images used by the platform # Internal utility images used by the platform
internalImages: internalImages:
...@@ -147,10 +147,9 @@ kai-scheduler: ...@@ -147,10 +147,9 @@ kai-scheduler:
enabled: false enabled: false
# etcd configuration - distributed key-value store for operator state # etcd configuration - distributed key-value store for operator state
# For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd
etcd: etcd:
# -- Whether to enable etcd deployment, disable if you want to use an external etcd instance # -- Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd , all etcd settings should be prefixed with "etcd."
enabled: true enabled: true
image: image:
...@@ -195,9 +194,8 @@ etcd: ...@@ -195,9 +194,8 @@ etcd:
tolerations: [] tolerations: []
# NATS configuration - messaging system for operator communication # NATS configuration - messaging system for operator communication
# For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats
nats: nats:
# -- Whether to enable NATS deployment, disable if you want to use an external NATS instance # -- Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats , all nats settings should be prefixed with "nats."
enabled: true enabled: true
# TLS Certificate Authority configuration for secure communication # TLS Certificate Authority configuration for secure communication
...@@ -338,7 +336,9 @@ nats: ...@@ -338,7 +336,9 @@ nats:
# token: << $TOKEN >> # token: << $TOKEN >>
# jetstream: # jetstream:
# max_memory_store: << 1GB >> # max_memory_store: << 1GB >>
merge: {} merge:
# 10 MiB (10485760 bytes), which allows larger context sizes: the default NATS max payload is 1 MB, and a 256K-token message (tokens are int32, 4 bytes each) tips over that 1 MB limit.
max_payload: 10485760
patch: [] patch: []
############################################################ ############################################################
......
...@@ -887,7 +887,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -887,7 +887,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
TimeoutSeconds: 5, TimeoutSeconds: 5,
PeriodSeconds: 10, PeriodSeconds: 10,
SuccessThreshold: 0, SuccessThreshold: 0,
FailureThreshold: 60, FailureThreshold: 720,
}, },
}, },
}, },
......
...@@ -67,7 +67,7 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont ...@@ -67,7 +67,7 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont
}, },
PeriodSeconds: 10, PeriodSeconds: 10,
TimeoutSeconds: 5, TimeoutSeconds: 5,
FailureThreshold: 60, FailureThreshold: 720, // 10s * 720 = 7200s = 2h
} }
container.Env = append(container.Env, []corev1.EnvVar{ container.Env = append(container.Env, []corev1.EnvVar{
......
...@@ -1937,7 +1937,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1937,7 +1937,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
TimeoutSeconds: 5, TimeoutSeconds: 5,
PeriodSeconds: 10, PeriodSeconds: 10,
SuccessThreshold: 0, SuccessThreshold: 0,
FailureThreshold: 60, FailureThreshold: 720,
}, },
}, },
}, },
...@@ -4721,7 +4721,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { ...@@ -4721,7 +4721,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
}, },
PeriodSeconds: 10, PeriodSeconds: 10,
TimeoutSeconds: 5, TimeoutSeconds: 5,
FailureThreshold: 60, FailureThreshold: 720,
}, },
Ports: []corev1.ContainerPort{ Ports: []corev1.ContainerPort{
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment