"lib/llm/vscode:/vscode.git/clone" did not exist on "e3d00b899f0c8a417cdfc45106c94bba5e3836a3"
Unverified Commit 872900f1 authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

feat: add validation webhooks (#4416)


Signed-off-by: default avatarJulien Mancuso <jmancuso@nvidia.com>
parent b2605a8e
......@@ -19,7 +19,7 @@ limitations under the License.
A Helm chart for NVIDIA Dynamo Platform.
![Version: 0.5.0](https://img.shields.io/badge/Version-0.5.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
![Version: 0.7.0](https://img.shields.io/badge/Version-0.7.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
## 🚀 Overview
......@@ -86,10 +86,10 @@ The chart includes built-in validation to prevent all operator conflicts:
| Repository | Name | Version |
|------------|------|---------|
| file://components/operator | dynamo-operator | 0.5.0 |
| file://components/operator | dynamo-operator | 0.6.1 |
| https://charts.bitnami.com/bitnami | etcd | 12.0.18 |
| https://nats-io.github.io/k8s/helm/charts/ | nats | 1.3.2 |
| oci://ghcr.io/nvidia/grove | grove(grove-charts) | v0.1.0-alpha.2 |
| oci://ghcr.io/nvidia/grove | grove(grove-charts) | v0.1.0-alpha.3 |
| oci://ghcr.io/nvidia/kai-scheduler | kai-scheduler | v0.9.4 |
## Values
......@@ -100,7 +100,7 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.natsAddr | string | `""` | NATS server address for operator communication (leave empty to use the bundled NATS chart). Format: "nats://hostname:port" |
| dynamo-operator.etcdAddr | string | `""` | etcd server address for operator state storage (leave empty to use the bundled etcd chart). Format: "http://hostname:port" or "https://hostname:port" |
| dynamo-operator.modelExpressURL | string | `""` | URL for the Model Express server if not deployed by this helm chart. This is ignored if Model Express server is installed by this helm chart (global.model-express.enabled is true). |
| dynamo-operator.namespaceRestriction | object | `{"enabled":false,"targetNamespace":null}` | Namespace access controls for the operator |
| dynamo-operator.namespaceRestriction | object | `{"enabled":false,"lease":{"duration":"30s","renewInterval":"10s"},"targetNamespace":null}` | Namespace access controls for the operator |
| dynamo-operator.namespaceRestriction.enabled | bool | `false` | Whether to restrict operator to specific namespaces. By default, the operator will run with cluster-wide permissions. Only 1 instance of the operator should be deployed in the cluster. If you want to deploy multiple operator instances, you can set this to true and specify the target namespace (by default, the target namespace is the helm release namespace). |
| dynamo-operator.namespaceRestriction.targetNamespace | string | `nil` | Target namespace for operator deployment (leave empty for current namespace) |
| dynamo-operator.controllerManager.tolerations | list | `[]` | Node tolerations for controller manager pods |
......@@ -132,7 +132,22 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.dynamo.metrics.prometheusEndpoint | string | `""` | Endpoint that services can use to retrieve metrics. If set, dynamo operator will automatically inject the PROMETHEUS_ENDPOINT environment variable into services it manages. Users can override the value of the PROMETHEUS_ENDPOINT environment variable by modifying the corresponding deployment's environment variables |
| dynamo-operator.dynamo.mpiRun.secretName | string | `"mpi-run-ssh-secret"` | Name of the secret containing the SSH key for MPI Run |
| dynamo-operator.dynamo.mpiRun.sshKeygen.enabled | bool | `true` | Whether to enable SSH key generation for MPI Run |
| dynamo-operator.dynamo.dgdr.profilerImage | string | `""` | Container image to use for profiling jobs (both online and offline/AIC) |
| dynamo-operator.webhook.enabled | bool | `true` | Whether to enable admission webhooks for resource validation. When enabled, the operator will validate DynamoComponentDeployment, DynamoGraphDeployment, DynamoModel and DynamoGraphDeploymentRequest resources before they are created or updated in the cluster. Enabled by default for production-ready validation and better error reporting. |
| dynamo-operator.webhook.certificateSecret.name | string | `"webhook-server-cert"` | Name of the Kubernetes secret containing webhook TLS certificates. The secret must contain three keys: tls.crt (server certificate), tls.key (server private key), and ca.crt (Certificate Authority certificate). |
| dynamo-operator.webhook.certificateSecret.external | bool | `false` | Whether to manage the certificate secret externally. When false (default), certificates are automatically generated via Helm hooks during installation. When true, you must create the secret manually before installing the chart. |
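| dynamo-operator.webhook.certificateValidity | int | `365` | Certificate validity duration in days for auto-generated certificates. Only used when certManager.enabled=false and certificateSecret.external=false. The certificate-generation hook runs on every install and upgrade and regenerates the certificates when they are missing, invalid, or have fewer than 30 days of validity left. If no upgrade happens in that window, certificates will expire after this duration and need to be regenerated. |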
| dynamo-operator.webhook.certGenerator.image.repository | string | `"bitnami/kubectl"` | Container image repository for certificate generation jobs. This image must contain both openssl and kubectl commands. |
| dynamo-operator.webhook.certGenerator.image.tag | string | `"latest"` | Container image tag for certificate generation jobs |
| dynamo-operator.webhook.certGenerator.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy for certificate generation jobs |
| dynamo-operator.webhook.caBundle | string | `""` | CA bundle (base64 encoded) for webhook validation. Only used when certificateSecret.external=true and certManager.enabled=false. For automatic certificate generation or cert-manager integration, leave this empty as it will be injected automatically. |
| dynamo-operator.webhook.failurePolicy | string | `"Fail"` | Webhook failure policy controls how Kubernetes handles requests when the webhook is unavailable. 'Fail' (recommended for production) rejects requests if the webhook cannot be reached, ensuring strict validation. 'Ignore' allows requests through if the webhook is unavailable, providing availability over validation guarantees. |
| dynamo-operator.webhook.timeoutSeconds | int | `10` | Timeout in seconds for webhook validation calls. If the webhook doesn't respond within this time, the request will be handled according to the failurePolicy. |
| dynamo-operator.webhook.namespaceSelector | object | `{}` | Custom namespace selector for webhook validation. Use this to include or exclude specific namespaces from webhook validation. For CLUSTER-WIDE operators, you can exclude namespaces managed by namespace-restricted operators by using: matchExpressions: [{ key: "dynamo-operator", operator: "NotIn", values: ["namespace-restricted"] }]. For NAMESPACE-RESTRICTED operators, leave empty as it will be auto-configured to match only the operator's namespace. |
| dynamo-operator.webhook.certManager.enabled | bool | `false` | Whether to use cert-manager for automatic certificate management. Requires cert-manager to be installed in the cluster. When enabled, cert-manager will automatically generate, renew, and rotate certificates, and the automatic certificate generation via Helm hooks will be disabled. |
| dynamo-operator.webhook.certManager.certificate.duration | string | `"8760h"` | Certificate duration for webhook certificates managed by cert-manager (e.g., "8760h" for 1 year). cert-manager will automatically renew the certificate before it expires. |
| dynamo-operator.webhook.certManager.certificate.renewBefore | string | `"360h"` | Time before certificate expiration to trigger renewal (e.g., "360h" for 15 days). cert-manager will attempt to renew the certificate when this threshold is reached. |
| dynamo-operator.webhook.certManager.certificate.rootCA.duration | string | `"87600h"` | Duration for the root CA certificate (e.g., "87600h" for 10 years). The root CA typically has a much longer lifetime than the leaf certificates it signs. |
| dynamo-operator.webhook.certManager.certificate.rootCA.renewBefore | string | `"720h"` | Time before root CA expiration to trigger renewal (e.g., "720h" for 30 days). Renewing a CA can be disruptive as all signed certificates must be reissued. |
| grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide |
| kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide |
| etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd , all etcd settings should be prefixed with "etcd." |
......
......@@ -142,6 +142,9 @@ spec:
{{- end }}
{{- end }}
- --operator-version={{ .Values.controllerManager.manager.image.tag | default .Chart.AppVersion }}
{{- if .Values.webhook.enabled }}
- --enable-webhooks=true
{{- end }}
command:
- /manager
env:
......@@ -159,6 +162,12 @@ spec:
initialDelaySeconds: 15
periodSeconds: 20
name: manager
{{- if .Values.webhook.enabled }}
ports:
- containerPort: 9443
name: webhook-server
protocol: TCP
{{- end }}
readinessProbe:
httpGet:
path: /readyz
......@@ -169,7 +178,20 @@ spec:
10 }}
securityContext: {{- toYaml .Values.controllerManager.manager.containerSecurityContext
| nindent 10 }}
{{- if .Values.webhook.enabled }}
volumeMounts:
- mountPath: /tmp/k8s-webhook-server/serving-certs
name: cert
readOnly: true
{{- end }}
securityContext:
runAsNonRoot: true
serviceAccountName: {{ include "dynamo-operator.fullname" . }}-controller-manager
terminationGracePeriodSeconds: 30
{{- if .Values.webhook.enabled }}
volumes:
- name: cert
secret:
defaultMode: 420
secretName: {{ .Values.webhook.certificateSecret.name }}
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- if and .Values.webhook.enabled (not .Values.webhook.certManager.enabled) (not .Values.webhook.certificateSecret.external) }}
---
# Identity for the hook Job that copies the CA bundle from the webhook
# certificate secret into the ValidatingWebhookConfiguration.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject
  annotations:
    # Weight 1 so this exists before the injection Job (weight 2) starts.
    helm.sh/hook: "post-install,post-upgrade"
    helm.sh/hook-weight: "1"
    helm.sh/hook-delete-policy: "before-hook-creation"
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
---
# Namespaced Role: read-only ("get") access to exactly one secret, the one
# named by webhook.certificateSecret.name.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject
  annotations:
    helm.sh/hook: "post-install,post-upgrade"
    helm.sh/hook-weight: "1"
    helm.sh/hook-delete-policy: "before-hook-creation"
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
rules:
  - verbs:
      - get
    apiGroups:
      - ""
    resources:
      - secrets
    resourceNames:
      - "{{ .Values.webhook.certificateSecret.name }}"
---
# Binds the namespaced webhook-ca-inject Role to the webhook-ca-inject
# ServiceAccount in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject
  annotations:
    helm.sh/hook: "post-install,post-upgrade"
    helm.sh/hook-weight: "1"
    helm.sh/hook-delete-policy: "before-hook-creation"
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
subjects:
  - kind: ServiceAccount
    namespace: {{ .Release.Namespace }}
    name: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject
roleRef:
  kind: Role
  apiGroup: rbac.authorization.k8s.io
  name: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject
---
# ClusterRole to patch ValidatingWebhookConfiguration.
#
# ClusterRole / ClusterRoleBinding are cluster-scoped, so their names must be
# unique across every operator instance in the cluster. In namespace-restricted
# mode several releases may share the same fullname in different namespaces;
# the release namespace is therefore appended to the name, mirroring how the
# ValidatingWebhookConfiguration itself is named in that mode.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject{{ if .Values.namespaceRestriction.enabled }}-{{ .Release.Namespace }}{{ end }}
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": post-install,post-upgrade
    "helm.sh/hook-weight": "1"
    "helm.sh/hook-delete-policy": before-hook-creation
rules:
  # Only this release's webhook configuration may be read or patched.
  - apiGroups: ["admissionregistration.k8s.io"]
    resources: ["validatingwebhookconfigurations"]
    {{- if .Values.namespaceRestriction.enabled }}
    resourceNames: ["{{ include "dynamo-operator.fullname" . }}-validating-{{ .Release.Namespace }}"]
    {{- else }}
    resourceNames: ["{{ include "dynamo-operator.fullname" . }}-validating"]
    {{- end }}
    verbs: ["get", "patch"]
---
# Binds the ClusterRole above to the CA-injection ServiceAccount of this release.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject{{ if .Values.namespaceRestriction.enabled }}-{{ .Release.Namespace }}{{ end }}
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": post-install,post-upgrade
    "helm.sh/hook-weight": "1"
    "helm.sh/hook-delete-policy": before-hook-creation
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  # Must match the ClusterRole name, including the namespace suffix.
  name: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject{{ if .Values.namespaceRestriction.enabled }}-{{ .Release.Namespace }}{{ end }}
subjects:
  - kind: ServiceAccount
    name: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject
    namespace: {{ .Release.Namespace }}
---
# Job to inject CA bundle into ValidatingWebhookConfiguration.
#
# Runs as a post-install/post-upgrade hook at weight 2, i.e. after the RBAC
# hook objects at weight 1. It waits for the certificate secret, reads the
# base64 "ca.crt" value from it and writes that value into
# clientConfig.caBundle of every webhook entry of this release's
# ValidatingWebhookConfiguration.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject-{{ .Release.Revision }}
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": post-install,post-upgrade
    "helm.sh/hook-weight": "2"
    "helm.sh/hook-delete-policy": before-hook-creation
spec:
  backoffLimit: 5
  template:
    metadata:
      name: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject
      labels:
        app.kubernetes.io/component: webhook
        app.kubernetes.io/created-by: dynamo-operator
        app.kubernetes.io/part-of: dynamo-operator
        {{- include "dynamo-operator.labels" . | nindent 8 }}
    spec:
      serviceAccountName: {{ include "dynamo-operator.fullname" . }}-webhook-ca-inject
      restartPolicy: OnFailure
      containers:
        - name: ca-injector
          image: {{ .Values.webhook.certGenerator.image.repository }}:{{ .Values.webhook.certGenerator.image.tag }}
          imagePullPolicy: {{ .Values.webhook.certGenerator.image.pullPolicy }}
          env:
            # NOTE(review): this script never invokes openssl and does not
            # create /tmp/openssl.cnf; the variable looks carried over from the
            # certificate-generation Job. Kept unchanged.
            - name: OPENSSL_CONF
              value: /tmp/openssl.cnf
          command:
            - /bin/bash
            - -c
            - |
              set -e
              echo "🔐 Injecting CA bundle into webhook configuration..."
              # Configuration
              SECRET_NAME="{{ .Values.webhook.certificateSecret.name }}"
              NAMESPACE="{{ .Release.Namespace }}"
              {{- if .Values.namespaceRestriction.enabled }}
              WEBHOOK_NAME="{{ include "dynamo-operator.fullname" . }}-validating-{{ .Release.Namespace }}"
              {{- else }}
              WEBHOOK_NAME="{{ include "dynamo-operator.fullname" . }}-validating"
              {{- end }}
              echo "⏳ Waiting for certificate secret to be available..."
              # Wait for secret (up to 5 minutes: 60 attempts, 5 seconds apart)
              for i in $(seq 1 60); do
                if kubectl get secret "${SECRET_NAME}" -n "${NAMESPACE}" >/dev/null 2>&1; then
                  echo "✅ Secret found!"
                  break
                fi
                if [ "$i" -eq 60 ]; then
                  echo "❌ ERROR: Secret ${SECRET_NAME} not found after 5 minutes"
                  exit 1
                fi
                echo " Waiting for secret... ($i/60)"
                sleep 5
              done
              echo "📝 Extracting CA bundle from secret..."
              # Secret data is already base64, which is what caBundle expects.
              CA_BUNDLE=$(kubectl get secret "${SECRET_NAME}" \
                -n "${NAMESPACE}" \
                -o jsonpath='{.data.ca\.crt}')
              if [ -z "$CA_BUNDLE" ]; then
                echo "❌ ERROR: ca.crt not found in secret ${SECRET_NAME}"
                exit 1
              fi
              echo "📝 Patching ValidatingWebhookConfiguration..."
              # Count the webhook entries at runtime instead of hard-coding
              # indices, so that adding or removing a webhook neither breaks the
              # patch nor leaves an entry without a CA bundle.
              WEBHOOK_COUNT=$(kubectl get validatingwebhookconfiguration "${WEBHOOK_NAME}" \
                -o jsonpath='{.webhooks[*].name}' | wc -w | tr -d '[:space:]')
              if [ "${WEBHOOK_COUNT}" -eq 0 ]; then
                echo "❌ ERROR: no webhooks found in ${WEBHOOK_NAME}"
                exit 1
              fi
              # Build one JSON-patch "add" operation per webhook entry.
              PATCH_OPS=""
              for idx in $(seq 0 $((WEBHOOK_COUNT - 1))); do
                PATCH_OPS="${PATCH_OPS:+${PATCH_OPS},}{\"op\":\"add\",\"path\":\"/webhooks/${idx}/clientConfig/caBundle\",\"value\":\"${CA_BUNDLE}\"}"
              done
              kubectl patch validatingwebhookconfiguration "${WEBHOOK_NAME}" \
                --type='json' -p="[${PATCH_OPS}]"
              echo "✅ CA bundle injected successfully!"
              echo "🎉 Webhook configuration complete!"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
            runAsNonRoot: true
            runAsUser: 1001
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- if and .Values.webhook.enabled (not .Values.webhook.certManager.enabled) (not .Values.webhook.certificateSecret.external) }}
---
# Identity for the pre-install/pre-upgrade hook Job that generates the webhook
# TLS certificates and stores them in a secret.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ include "dynamo-operator.fullname" . }}-webhook-cert-gen
  annotations:
    # Weight -5 so this exists before the generation Job (weight -4) starts.
    helm.sh/hook: "pre-install,pre-upgrade"
    helm.sh/hook-weight: "-5"
    helm.sh/hook-delete-policy: "before-hook-creation"
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
---
# Role to create/update the certificate secret.
#
# Least privilege: reading and modifying is limited to the one secret named by
# webhook.certificateSecret.name. "create" is granted separately because
# Kubernetes RBAC cannot restrict create requests by resourceNames.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-webhook-cert-gen
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-5"
    "helm.sh/hook-delete-policy": before-hook-creation
rules:
  # First-time creation of the certificate secret.
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["create"]
  # Inspecting and refreshing the existing certificate secret only.
  - apiGroups: [""]
    resources: ["secrets"]
    resourceNames: ["{{ .Values.webhook.certificateSecret.name }}"]
    verbs: ["get", "update", "patch"]
---
# Binds the namespaced webhook-cert-gen Role to the webhook-cert-gen
# ServiceAccount in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ include "dynamo-operator.fullname" . }}-webhook-cert-gen
  annotations:
    helm.sh/hook: "pre-install,pre-upgrade"
    helm.sh/hook-weight: "-5"
    helm.sh/hook-delete-policy: "before-hook-creation"
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
subjects:
  - kind: ServiceAccount
    namespace: {{ .Release.Namespace }}
    name: {{ include "dynamo-operator.fullname" . }}-webhook-cert-gen
roleRef:
  kind: Role
  apiGroup: rbac.authorization.k8s.io
  name: {{ include "dynamo-operator.fullname" . }}-webhook-cert-gen
---
# Job to generate certificates and create secret.
#
# Runs as a pre-install/pre-upgrade hook at weight -4, i.e. after the RBAC hook
# objects at weight -5. The script:
#   1. reuses the existing secret when its tls.crt is valid for 30+ days,
#      carries the expected service SAN and chains to the stored ca.crt;
#   2. otherwise generates a self-signed CA plus a server certificate with
#      openssl and stores tls.crt / tls.key / ca.crt in the secret.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-webhook-cert-gen-{{ .Release.Revision }}
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-4"
    "helm.sh/hook-delete-policy": before-hook-creation
spec:
  backoffLimit: 3
  template:
    metadata:
      name: {{ include "dynamo-operator.fullname" . }}-webhook-cert-gen
      labels:
        app.kubernetes.io/component: webhook
        app.kubernetes.io/created-by: dynamo-operator
        app.kubernetes.io/part-of: dynamo-operator
        {{- include "dynamo-operator.labels" . | nindent 8 }}
    spec:
      serviceAccountName: {{ include "dynamo-operator.fullname" . }}-webhook-cert-gen
      restartPolicy: OnFailure
      containers:
        - name: cert-generator
          image: {{ .Values.webhook.certGenerator.image.repository }}:{{ .Values.webhook.certGenerator.image.tag }}
          imagePullPolicy: {{ .Values.webhook.certGenerator.image.pullPolicy }}
          env:
            # Points openssl at the config file the script writes first.
            - name: OPENSSL_CONF
              value: /tmp/openssl.cnf
          command:
            - /bin/bash
            - -c
            - |
              set -e
              echo "🔐 Generating webhook certificates..."
              # Create OpenSSL config without FIPS
              cat > /tmp/openssl.cnf <<'OPENSSLEOF'
              openssl_conf = openssl_init
              [openssl_init]
              providers = provider_sect
              [provider_sect]
              default = default_sect
              [default_sect]
              activate = 1
              OPENSSLEOF
              # Configuration
              SERVICE_NAME="{{ include "dynamo-operator.fullname" . }}-webhook-service"
              NAMESPACE="{{ .Release.Namespace }}"
              SECRET_NAME="{{ .Values.webhook.certificateSecret.name }}"
              CERT_VALIDITY_DAYS="{{ .Values.webhook.certificateValidity }}"
              # Check if valid certificates already exist
              echo "🔍 Checking if valid certificates already exist..."
              NEED_NEW_CERTS=false
              if kubectl get secret "${SECRET_NAME}" -n "${NAMESPACE}" >/dev/null 2>&1; then
                echo "Secret exists, checking certificate validity..."
                # Extract and verify the certificate
                mkdir -p /tmp/verify
                kubectl get secret "${SECRET_NAME}" -n "${NAMESPACE}" -o jsonpath='{.data.tls\.crt}' | base64 -d > /tmp/verify/tls.crt
                kubectl get secret "${SECRET_NAME}" -n "${NAMESPACE}" -o jsonpath='{.data.ca\.crt}' | base64 -d > /tmp/verify/ca.crt
                # Check if certificate is valid for at least 30 more days
                if openssl x509 -checkend 2592000 -noout -in /tmp/verify/tls.crt 2>/dev/null; then
                  echo "✅ Certificate is valid for at least 30 more days"
                  # Verify SANs match the service name
                  CERT_SANS=$(openssl x509 -in /tmp/verify/tls.crt -noout -text 2>/dev/null | grep -A1 "Subject Alternative Name" | tail -1)
                  # -F: match the DNS name literally; with a regex the dots
                  # would match any character.
                  if ! echo "$CERT_SANS" | grep -qF "${SERVICE_NAME}.${NAMESPACE}.svc"; then
                    echo "⚠️ Certificate SANs don't match, need to regenerate"
                    NEED_NEW_CERTS=true
                  # The CA bundle handed to the API server comes from ca.crt in
                  # this secret, so tls.crt must actually chain to it.
                  elif ! openssl verify -CAfile /tmp/verify/ca.crt /tmp/verify/tls.crt >/dev/null 2>&1; then
                    echo "⚠️ Certificate is not signed by the stored CA, need to regenerate"
                    NEED_NEW_CERTS=true
                  else
                    echo "✅ Certificate has correct SANs"
                    echo "🎉 Existing certificates are valid, skipping generation"
                  fi
                else
                  echo "⚠️ Certificate expires soon or is invalid, need to regenerate"
                  NEED_NEW_CERTS=true
                fi
                rm -rf /tmp/verify
              else
                echo "Secret doesn't exist, need to generate certificates"
                NEED_NEW_CERTS=true
              fi
              # Only generate certificates if needed
              if [ "$NEED_NEW_CERTS" = false ]; then
                echo "✅ Using existing valid certificates"
                exit 0
              fi
              echo "🔐 Generating new webhook certificates..."
              # Create working directory
              mkdir -p /tmp/certs
              cd /tmp/certs
              echo "📝 Generating CA key and certificate..."
              # Generate CA key (4096-bit RSA)
              openssl genrsa -out ca.key 4096
              # Generate self-signed CA certificate
              cat > ca.cnf <<EOF
              [req]
              prompt = no
              distinguished_name = dn
              x509_extensions = v3_ca
              [dn]
              CN = Dynamo Webhook CA
              O = NVIDIA
              [v3_ca]
              basicConstraints = critical,CA:TRUE
              keyUsage = critical,digitalSignature,keyCertSign,cRLSign
              subjectKeyIdentifier = hash
              EOF
              openssl req -x509 -new -nodes \
                -key ca.key \
                -sha256 \
                -days "${CERT_VALIDITY_DAYS}" \
                -out ca.crt \
                -config ca.cnf
              echo "📝 Generating server key and certificate..."
              # Generate server key (4096-bit RSA)
              openssl genrsa -out tls.key 4096
              # Generate server certificate signing request with SANs.
              # The X.509 commonName is limited to 64 characters, so the CN is
              # truncated; the full names are carried by the SANs below.
              cat > server.cnf <<EOF
              [req]
              prompt = no
              distinguished_name = dn
              req_extensions = v3_req
              [dn]
              CN = ${SERVICE_NAME:0:64}
              O = NVIDIA
              [v3_req]
              basicConstraints = CA:FALSE
              keyUsage = critical,digitalSignature,keyEncipherment
              extendedKeyUsage = serverAuth
              subjectAltName = @alt_names
              [alt_names]
              DNS.1 = ${SERVICE_NAME}
              DNS.2 = ${SERVICE_NAME}.${NAMESPACE}
              DNS.3 = ${SERVICE_NAME}.${NAMESPACE}.svc
              DNS.4 = ${SERVICE_NAME}.${NAMESPACE}.svc.{{ .Values.kubernetesClusterDomain }}
              EOF
              openssl req -new -key tls.key -out server.csr -config server.cnf
              # Sign server certificate with CA
              openssl x509 -req \
                -in server.csr \
                -CA ca.crt \
                -CAkey ca.key \
                -CAcreateserial \
                -out tls.crt \
                -days "${CERT_VALIDITY_DAYS}" \
                -extensions v3_req \
                -extfile server.cnf
              echo "✅ Certificates generated successfully!"
              echo "📦 Creating Kubernetes secret..."
              # Check if secret exists
              if kubectl get secret "${SECRET_NAME}" -n "${NAMESPACE}" >/dev/null 2>&1; then
                echo "Secret exists, updating..."
                # Pass the namespace explicitly on both sides of the pipe, like
                # every other kubectl call in this script, rather than relying
                # on the pod's default namespace.
                kubectl create secret generic "${SECRET_NAME}" \
                  --from-file=tls.crt=tls.crt \
                  --from-file=tls.key=tls.key \
                  --from-file=ca.crt=ca.crt \
                  -n "${NAMESPACE}" \
                  --dry-run=client -o yaml | \
                  kubectl apply -n "${NAMESPACE}" -f -
              else
                echo "Creating new secret..."
                kubectl create secret generic "${SECRET_NAME}" \
                  --from-file=tls.crt=tls.crt \
                  --from-file=tls.key=tls.key \
                  --from-file=ca.crt=ca.crt \
                  -n "${NAMESPACE}"
              fi
              echo "✅ Secret '${SECRET_NAME}' created successfully!"
              echo "🎉 Webhook certificate generation complete!"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            # The script writes its working files under /tmp.
            readOnlyRootFilesystem: false
            runAsNonRoot: true
            runAsUser: 1001
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- if and .Values.webhook.enabled .Values.webhook.certManager.enabled }}
---
# Bootstrap issuer: a cert-manager self-signed Issuer, referenced by the root
# CA Certificate of this chart.
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ include "dynamo-operator.fullname" . }}-selfsigned-issuer
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
spec:
  selfSigned: {}
---
# Root CA Certificate.
# Issued by the self-signed issuer and stored in "<fullname>-root-ca-secret",
# which the CA issuer then uses to sign the webhook serving certificate.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-root-ca
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
spec:
  isCA: true
  # cert-manager rejects a commonName longer than 64 characters; truncate so a
  # long fullname cannot make the Certificate invalid. No-op for shorter names.
  commonName: {{ printf "%s-root-ca" (include "dynamo-operator.fullname" .) | trunc 64 | trimSuffix "-" }}
  secretName: {{ include "dynamo-operator.fullname" . }}-root-ca-secret
  duration: {{ .Values.webhook.certManager.certificate.rootCA.duration }}
  renewBefore: {{ .Values.webhook.certManager.certificate.rootCA.renewBefore }}
  privateKey:
    algorithm: RSA
    size: 4096
  issuerRef:
    name: {{ include "dynamo-operator.fullname" . }}-selfsigned-issuer
    kind: Issuer
---
# CA-backed issuer: signs certificates with the key pair stored in the root CA
# secret ("<fullname>-root-ca-secret").
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ include "dynamo-operator.fullname" . }}-ca-issuer
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
spec:
  ca:
    secretName: {{ include "dynamo-operator.fullname" . }}-root-ca-secret
---
# Webhook Server Certificate.
# Signed by the CA issuer and written to the secret named by
# webhook.certificateSecret.name.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-serving-cert
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/component: webhook
    app.kubernetes.io/created-by: dynamo-operator
    app.kubernetes.io/part-of: dynamo-operator
    {{- include "dynamo-operator.labels" . | nindent 4 }}
spec:
  secretName: {{ .Values.webhook.certificateSecret.name }}
  duration: {{ .Values.webhook.certManager.certificate.duration }}
  renewBefore: {{ .Values.webhook.certManager.certificate.renewBefore }}
  # cert-manager rejects a commonName longer than 64 characters, and
  # "<fullname>-webhook-service.<namespace>.svc" can exceed that with ordinary
  # release and namespace names. The value is truncated (no-op when it already
  # fits); the full host names are listed in dnsNames below.
  commonName: {{ printf "%s-webhook-service.%s.svc" (include "dynamo-operator.fullname" .) .Release.Namespace | trunc 64 | trimSuffix "." }}
  dnsNames:
    - {{ include "dynamo-operator.fullname" . }}-webhook-service
    - {{ include "dynamo-operator.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc
    - {{ include "dynamo-operator.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc.{{ .Values.kubernetesClusterDomain }}
  issuerRef:
    name: {{ include "dynamo-operator.fullname" . }}-ca-issuer
    kind: Issuer
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- if .Values.webhook.enabled }}
---
# ValidatingWebhookConfiguration registering the operator's validation webhooks.
# The whole manifest is rendered only when webhook.enabled is set.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
# Namespace-restricted installs append the release namespace to the name
# (presumably to keep this cluster-scoped object unique per install - confirm).
{{- if .Values.namespaceRestriction.enabled }}
name: {{ include "dynamo-operator.fullname" . }}-validating-{{ .Release.Namespace }}
{{- else }}
name: {{ include "dynamo-operator.fullname" . }}-validating
{{- end }}
labels:
app.kubernetes.io/component: webhook
app.kubernetes.io/created-by: dynamo-operator
app.kubernetes.io/part-of: dynamo-operator
{{- include "dynamo-operator.labels" . | nindent 4 }}
{{- if .Values.webhook.certManager.enabled }}
# With cert-manager enabled, the CA is injected from the serving certificate
# in the release namespace instead of being set through caBundle below.
annotations:
cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "dynamo-operator.fullname" . }}-serving-cert
{{- end }}
webhooks:
# Webhook 1/4: validates CREATE and UPDATE of dynamocomponentdeployments (nvidia.com/v1alpha1).
- admissionReviewVersions:
- v1
clientConfig:
# A static caBundle is rendered only when cert-manager is disabled, the
# certificate secret is managed externally, and webhook.caBundle is non-empty.
{{- if and (not .Values.webhook.certManager.enabled) .Values.webhook.certificateSecret.external }}
{{- if .Values.webhook.caBundle }}
caBundle: {{ .Values.webhook.caBundle }}
{{- end }}
{{- end }}
service:
name: {{ include "dynamo-operator.fullname" . }}-webhook-service
namespace: {{ .Release.Namespace }}
path: /validate-nvidia-com-v1alpha1-dynamocomponentdeployment
failurePolicy: {{ .Values.webhook.failurePolicy }}
name: vdynamocomponentdeployment.kb.io
# An explicit webhook.namespaceSelector takes precedence; otherwise a
# namespace-restricted install matches only the release namespace, and a
# cluster-wide install renders no selector at all.
{{- if .Values.webhook.namespaceSelector }}
namespaceSelector:
{{- toYaml .Values.webhook.namespaceSelector | nindent 4 }}
{{- else if .Values.namespaceRestriction.enabled }}
namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: {{ .Release.Namespace }}
{{- end }}
rules:
- apiGroups:
- nvidia.com
apiVersions:
- v1alpha1
operations:
- CREATE
- UPDATE
resources:
- dynamocomponentdeployments
sideEffects: None
timeoutSeconds: {{ .Values.webhook.timeoutSeconds }}
# Webhook 2/4: validates CREATE and UPDATE of dynamographdeployments (nvidia.com/v1alpha1).
# clientConfig and namespaceSelector follow the same rules as the first webhook.
- admissionReviewVersions:
- v1
clientConfig:
{{- if and (not .Values.webhook.certManager.enabled) .Values.webhook.certificateSecret.external }}
{{- if .Values.webhook.caBundle }}
caBundle: {{ .Values.webhook.caBundle }}
{{- end }}
{{- end }}
service:
name: {{ include "dynamo-operator.fullname" . }}-webhook-service
namespace: {{ .Release.Namespace }}
path: /validate-nvidia-com-v1alpha1-dynamographdeployment
failurePolicy: {{ .Values.webhook.failurePolicy }}
name: vdynamographdeployment.kb.io
{{- if .Values.webhook.namespaceSelector }}
namespaceSelector:
{{- toYaml .Values.webhook.namespaceSelector | nindent 4 }}
{{- else if .Values.namespaceRestriction.enabled }}
namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: {{ .Release.Namespace }}
{{- end }}
rules:
- apiGroups:
- nvidia.com
apiVersions:
- v1alpha1
operations:
- CREATE
- UPDATE
resources:
- dynamographdeployments
sideEffects: None
timeoutSeconds: {{ .Values.webhook.timeoutSeconds }}
# Webhook 3/4: validates CREATE and UPDATE of dynamomodels (nvidia.com/v1alpha1).
# clientConfig and namespaceSelector follow the same rules as the first webhook.
- admissionReviewVersions:
- v1
clientConfig:
{{- if and (not .Values.webhook.certManager.enabled) .Values.webhook.certificateSecret.external }}
{{- if .Values.webhook.caBundle }}
caBundle: {{ .Values.webhook.caBundle }}
{{- end }}
{{- end }}
service:
name: {{ include "dynamo-operator.fullname" . }}-webhook-service
namespace: {{ .Release.Namespace }}
path: /validate-nvidia-com-v1alpha1-dynamomodel
failurePolicy: {{ .Values.webhook.failurePolicy }}
name: vdynamomodel.kb.io
{{- if .Values.webhook.namespaceSelector }}
namespaceSelector:
{{- toYaml .Values.webhook.namespaceSelector | nindent 4 }}
{{- else if .Values.namespaceRestriction.enabled }}
namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: {{ .Release.Namespace }}
{{- end }}
rules:
- apiGroups:
- nvidia.com
apiVersions:
- v1alpha1
operations:
- CREATE
- UPDATE
resources:
- dynamomodels
sideEffects: None
timeoutSeconds: {{ .Values.webhook.timeoutSeconds }}
# Webhook 4/4: validates CREATE and UPDATE of dynamographdeploymentrequests (nvidia.com/v1alpha1).
# clientConfig and namespaceSelector follow the same rules as the first webhook.
- admissionReviewVersions:
- v1
clientConfig:
{{- if and (not .Values.webhook.certManager.enabled) .Values.webhook.certificateSecret.external }}
{{- if .Values.webhook.caBundle }}
caBundle: {{ .Values.webhook.caBundle }}
{{- end }}
{{- end }}
service:
name: {{ include "dynamo-operator.fullname" . }}-webhook-service
namespace: {{ .Release.Namespace }}
path: /validate-nvidia-com-v1alpha1-dynamographdeploymentrequest
failurePolicy: {{ .Values.webhook.failurePolicy }}
name: vdynamographdeploymentrequest.kb.io
{{- if .Values.webhook.namespaceSelector }}
namespaceSelector:
{{- toYaml .Values.webhook.namespaceSelector | nindent 4 }}
{{- else if .Values.namespaceRestriction.enabled }}
namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: {{ .Release.Namespace }}
{{- end }}
rules:
- apiGroups:
- nvidia.com
apiVersions:
- v1alpha1
operations:
- CREATE
- UPDATE
resources:
- dynamographdeploymentrequests
sideEffects: None
timeoutSeconds: {{ .Values.webhook.timeoutSeconds }}
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- if .Values.webhook.enabled }}
---
# Role to read the webhook certificate secret
# Rendered only when webhook.enabled is set; scoped to the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: {{ include "dynamo-operator.fullname" . }}-webhook-cert-reader
namespace: {{ .Release.Namespace }}
labels:
app.kubernetes.io/component: webhook
app.kubernetes.io/created-by: dynamo-operator
app.kubernetes.io/part-of: dynamo-operator
{{- include "dynamo-operator.labels" . | nindent 4 }}
rules:
- apiGroups:
- ""
resources:
- secrets
# Access is limited to the single secret named by webhook.certificateSecret.name.
resourceNames:
- {{ .Values.webhook.certificateSecret.name }}
# NOTE(review): Kubernetes only honors resourceNames for list/watch when the
# request uses a metadata.name field selector - confirm the consumer does so.
verbs:
- get
- list
- watch
---
# Binds the cert-reader Role above to the operator's controller-manager ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: {{ include "dynamo-operator.fullname" . }}-webhook-cert-reader
namespace: {{ .Release.Namespace }}
labels:
app.kubernetes.io/component: webhook
app.kubernetes.io/created-by: dynamo-operator
app.kubernetes.io/part-of: dynamo-operator
{{- include "dynamo-operator.labels" . | nindent 4 }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ include "dynamo-operator.fullname" . }}-webhook-cert-reader
subjects:
- kind: ServiceAccount
name: {{ include "dynamo-operator.fullname" . }}-controller-manager
namespace: {{ .Release.Namespace }}
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- if .Values.webhook.enabled }}
---
# Service referenced by the ValidatingWebhookConfiguration clientConfig
# (the "-webhook-service" suffix must stay in sync with that template).
# Rendered only when webhook.enabled is set.
apiVersion: v1
kind: Service
metadata:
name: {{ include "dynamo-operator.fullname" . }}-webhook-service
namespace: {{ .Release.Namespace }}
labels:
app.kubernetes.io/component: webhook
app.kubernetes.io/created-by: dynamo-operator
app.kubernetes.io/part-of: dynamo-operator
{{- include "dynamo-operator.labels" . | nindent 4 }}
spec:
ports:
# Port 443 is exposed on the Service; traffic is forwarded to 9443, the port
# the operator's webhook server is configured to listen on.
- port: 443
protocol: TCP
targetPort: 9443
name: webhook
# Selects the controller-manager pods of this release.
selector:
control-plane: controller-manager
{{- include "dynamo-operator.selectorLabels" . | nindent 4 }}
{{- end }}
......@@ -138,3 +138,80 @@ natsAddr: ""
etcdAddr: ""
modelExpressURL: ""
# Webhook configuration
webhook:
# Enable admission webhooks for validation
# Enabled by default for production-ready validation and better error reporting
enabled: true
# Certificate configuration
certificateSecret:
# Name of the secret containing webhook TLS certificates
# The secret must contain: tls.crt, tls.key, and ca.crt
name: webhook-server-cert
# Set to true if you're managing the certificate secret externally
# When false (default), certificates are auto-generated via Helm hooks
external: false
# Certificate validity duration (in days) for auto-generated certificates
# Only used when certManager.enabled=false and certificateSecret.external=false
certificateValidity: 365
# Container image for certificate generation and CA injection jobs
# Only used when certManager.enabled=false and certificateSecret.external=false
certGenerator:
image:
repository: bitnami/kubectl
tag: latest
pullPolicy: IfNotPresent
# CA bundle for webhook (base64 encoded)
# Only used when certManager.enabled=false and certificateSecret.external=true
# For automatic mode or cert-manager, leave empty
caBundle: ""
# Webhook failure policy
# Fail: Reject requests if webhook is unavailable (recommended for production)
# Ignore: Allow requests if webhook is unavailable
failurePolicy: Fail
# Timeout for webhook calls (in seconds)
timeoutSeconds: 10
# Namespace selector for webhooks
# Use this to exclude certain namespaces from validation
#
# For CLUSTER-WIDE operators: Exclude namespaces with namespace-restricted operators
# namespaceSelector:
# matchExpressions:
# - key: dynamo-operator
# operator: NotIn
# values: ["namespace-restricted"]
#
# For NAMESPACE-RESTRICTED operators: Leave empty (auto-configured)
namespaceSelector: {}
# cert-manager integration (optional)
certManager:
# Enable cert-manager for automatic certificate management
# Requires cert-manager to be installed in the cluster
# When enabled, disables automatic certificate generation
enabled: false
# Certificate configuration for cert-manager
certificate:
# Certificate duration (e.g., "8760h" for 1 year)
duration: "8760h"
# Time before expiration to renew certificate (e.g., "360h" for 15 days)
renewBefore: "360h"
# Root CA configuration
rootCA:
# Duration for root CA certificate (e.g., "87600h" for 10 years)
duration: "87600h"
# Time before expiration to renew root CA (e.g., "720h" for 30 days)
renewBefore: "720h"
......@@ -144,6 +144,67 @@ dynamo-operator:
# -- Whether to enable SSH key generation for MPI Run
enabled: true
# Webhook configuration for admission control and validation
webhook:
# -- Whether to enable admission webhooks for resource validation. When enabled, the operator will validate DynamoComponentDeployment, DynamoGraphDeployment, DynamoModel, and DynamoGraphDeploymentRequest resources before they are created or updated in the cluster. Enabled by default for production-ready validation and better error reporting.
enabled: true
# Certificate configuration for webhook TLS
certificateSecret:
# -- Name of the Kubernetes secret containing webhook TLS certificates. The secret must contain three keys: tls.crt (server certificate), tls.key (server private key), and ca.crt (Certificate Authority certificate).
name: webhook-server-cert
# -- Whether to manage the certificate secret externally. When false (default), certificates are automatically generated via Helm hooks during installation. When true, you must create the secret manually before installing the chart.
external: false
# -- Certificate validity duration in days for auto-generated certificates. Only used when certManager.enabled=false and certificateSecret.external=false. After this duration, certificates will expire and need to be regenerated.
certificateValidity: 365
# Container image for certificate generation and CA injection jobs
# Only used when certManager.enabled=false and certificateSecret.external=false
certGenerator:
image:
# -- Container image repository for certificate generation jobs. This image must contain both openssl and kubectl commands.
repository: bitnami/kubectl
# -- Container image tag for certificate generation jobs
tag: latest
# -- Image pull policy for certificate generation jobs
pullPolicy: IfNotPresent
# -- CA bundle (base64 encoded) for webhook validation. Only used when certManager.enabled=false and certificateSecret.external=true. For automatic certificate generation or cert-manager integration, leave this empty as it will be injected automatically.
caBundle: ""
# -- Webhook failure policy controls how Kubernetes handles requests when the webhook is unavailable. 'Fail' (recommended for production) rejects requests if the webhook cannot be reached, ensuring strict validation. 'Ignore' allows requests through if the webhook is unavailable, providing availability over validation guarantees.
failurePolicy: Fail
# -- Timeout in seconds for webhook validation calls. If the webhook doesn't respond within this time, the request will be handled according to the failurePolicy.
timeoutSeconds: 10
# Namespace selector for webhook scope control
# -- Custom namespace selector for webhook validation. Use this to include or exclude specific namespaces from webhook validation. For CLUSTER-WIDE operators, you can exclude namespaces managed by namespace-restricted operators by using: matchExpressions: [{ key: "dynamo-operator", operator: "NotIn", values: ["namespace-restricted"] }]. For NAMESPACE-RESTRICTED operators, leave empty as it will be auto-configured to match only the operator's namespace.
namespaceSelector: {}
# cert-manager integration for automated certificate lifecycle management
certManager:
# -- Whether to use cert-manager for automatic certificate management. Requires cert-manager to be installed in the cluster. When enabled, cert-manager will automatically generate, renew, and rotate certificates, and the automatic certificate generation via Helm hooks will be disabled.
enabled: false
# Certificate configuration for cert-manager
certificate:
# -- Certificate duration for webhook certificates managed by cert-manager (e.g., "8760h" for 1 year). cert-manager will automatically renew the certificate before it expires.
duration: "8760h"
# -- Time before certificate expiration to trigger renewal (e.g., "360h" for 15 days). cert-manager will attempt to renew the certificate when this threshold is reached.
renewBefore: "360h"
# Root CA configuration for cert-manager
rootCA:
# -- Duration for the root CA certificate (e.g., "87600h" for 10 years). The root CA typically has a much longer lifetime than the leaf certificates it signs.
duration: "87600h"
# -- Time before root CA expiration to trigger renewal (e.g., "720h" for 30 days). Renewing a CA can be disruptive as all signed certificates must be reissued.
renewBefore: "720h"
# Grove component - distributed inference orchestration
grove:
......
......@@ -65,6 +65,8 @@ import (
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/rbac"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/secret"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/secrets"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
webhookvalidation "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook/validation"
istioclientsetscheme "istio.io/client-go/pkg/clientset/versioned/scheme"
//+kubebuilder:scaffold:imports
)
......@@ -147,6 +149,7 @@ func main() {
var namespaceScopeLeaseRenewInterval time.Duration
var operatorVersion string
var discoveryBackend string
var enableWebhooks bool
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
......@@ -156,6 +159,9 @@ func main() {
"If set the metrics endpoint is served securely")
flag.BoolVar(&enableHTTP2, "enable-http2", false,
"If set, HTTP/2 will be enabled for the metrics and webhook servers")
flag.BoolVar(&enableWebhooks, "enable-webhooks", false,
"Enable admission webhooks for validation. When enabled, controllers skip validation "+
"(webhooks handle it). When disabled, controllers perform validation.")
flag.StringVar(&restrictedNamespace, "restrictedNamespace", "",
"Enable resources filtering, only the resources belonging to the given namespace will be handled.")
flag.StringVar(&leaderElectionID, "leader-election-id", "", "Leader election id"+
......@@ -290,6 +296,12 @@ func main() {
}
webhookServer := webhook.NewServer(webhook.Options{
// Bind to all interfaces so the Service can reach the webhook server
Host: "0.0.0.0",
// Must match the port exposed by the manager container and targeted by the Service.
Port: 9443,
// Must match the mountPath of the webhook certificate secret in the Deployment.
CertDir: "/tmp/k8s-webhook-server/serving-certs",
TLSOpts: tlsOpts,
})
......@@ -580,10 +592,67 @@ func main() {
Client: mgr.GetClient(),
Recorder: mgr.GetEventRecorderFor("dynamomodel"),
EndpointClient: modelendpoint.NewClient(),
Config: ctrlConfig,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DynamoModel")
os.Exit(1)
}
// Set webhooks enabled flag in config
ctrlConfig.WebhooksEnabled = enableWebhooks
if enableWebhooks {
setupLog.Info("Webhooks are enabled - webhooks will validate, controllers will skip validation")
} else {
setupLog.Info("Webhooks are disabled - controllers will validate (defense in depth)")
}
// Configure webhooks with lease-based namespace exclusion (only if enabled)
// In cluster-wide mode, inject ctrlConfig.ExcludedNamespaces (leaseWatcher) so webhooks can defer
// to namespace-restricted operators. In namespace-restricted mode, webhooks validate without checking
// leases (ExcludedNamespaces is nil). The webhooks use LeaseAwareValidator wrapper to add coordination.
if enableWebhooks {
if ctrlConfig.RestrictedNamespace == "" {
// Cluster-wide mode: inject the same ExcludedNamespaces used by controllers
setupLog.Info("Configuring webhooks with lease-based namespace exclusion for cluster-wide mode")
internalwebhook.SetExcludedNamespaces(ctrlConfig.ExcludedNamespaces)
} else {
// Namespace-restricted mode: no exclusion checking needed (validators not wrapped)
setupLog.Info("Configuring webhooks for namespace-restricted mode (no lease checking)",
"restrictedNamespace", ctrlConfig.RestrictedNamespace)
internalwebhook.SetExcludedNamespaces(nil)
}
// Register validation webhook handlers
setupLog.Info("Registering validation webhooks")
dcdHandler := webhookvalidation.NewDynamoComponentDeploymentHandler()
if err = dcdHandler.RegisterWithManager(mgr); err != nil {
setupLog.Error(err, "unable to register webhook", "webhook", "DynamoComponentDeployment")
os.Exit(1)
}
dgdHandler := webhookvalidation.NewDynamoGraphDeploymentHandler()
if err = dgdHandler.RegisterWithManager(mgr); err != nil {
setupLog.Error(err, "unable to register webhook", "webhook", "DynamoGraphDeployment")
os.Exit(1)
}
dmHandler := webhookvalidation.NewDynamoModelHandler()
if err = dmHandler.RegisterWithManager(mgr); err != nil {
setupLog.Error(err, "unable to register webhook", "webhook", "DynamoModel")
os.Exit(1)
}
isClusterWide := ctrlConfig.RestrictedNamespace == ""
dgdrHandler := webhookvalidation.NewDynamoGraphDeploymentRequestHandler(isClusterWide)
if err = dgdrHandler.RegisterWithManager(mgr); err != nil {
setupLog.Error(err, "unable to register webhook", "webhook", "DynamoGraphDeploymentRequest")
os.Exit(1)
}
setupLog.Info("Validation webhooks registered successfully")
}
//+kubebuilder:scaffold:builder
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
......
......@@ -35,6 +35,7 @@ processor:
# - "DynamoComponentDeploymentOverridesSpec$"
- "DynamoComponentDeploymentStatus$"
- "BaseStatus$"
- "ExcludedNamespacesChecker"
render:
# Output format - use markdown instead of default asciidoc
......
......@@ -42,6 +42,7 @@ import (
commoncontroller "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/dynamo"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/modelendpoint"
webhookvalidation "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook/validation"
)
const (
......@@ -69,6 +70,7 @@ type DynamoModelReconciler struct {
client.Client
Recorder record.EventRecorder
EndpointClient *modelendpoint.Client
Config commoncontroller.Config
}
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamomodels,verbs=get;list;watch;create;update;patch;delete
......@@ -95,6 +97,45 @@ func (r *DynamoModelReconciler) Reconcile(ctx context.Context, req ctrl.Request)
logs = logs.WithValues("dynamoModel", model.Name, "namespace", model.Namespace, "baseModelName", model.Spec.BaseModelName)
logs.Info("Reconciling DynamoModel")
// Validate the DynamoModel spec (defense in depth - only when webhooks are disabled)
if !r.Config.WebhooksEnabled {
validator := webhookvalidation.NewDynamoModelValidator(model)
if _, err := validator.Validate(); err != nil {
logs.Error(err, "DynamoModel validation failed, refusing to reconcile")
// Set validation error condition
meta.SetStatusCondition(&model.Status.Conditions, metav1.Condition{
Type: "Valid",
Status: metav1.ConditionFalse,
ObservedGeneration: model.Generation,
Reason: "ValidationFailed",
Message: fmt.Sprintf("Validation failed: %v", err),
})
// Update status and don't requeue (user must fix the spec)
if statusErr := r.Status().Update(ctx, model); statusErr != nil {
logs.Error(statusErr, "Failed to update DynamoModel status with validation error")
return ctrl.Result{}, statusErr
}
// Record event for visibility
r.Recorder.Event(model, corev1.EventTypeWarning, "ValidationFailed", err.Error())
// Don't requeue - user must fix the spec
logs.Info("DynamoModel is invalid, not reconciling until spec is fixed")
return ctrl.Result{}, nil
}
// Set Valid condition to True
meta.SetStatusCondition(&model.Status.Conditions, metav1.Condition{
Type: "Valid",
Status: metav1.ConditionTrue,
ObservedGeneration: model.Generation,
Reason: "ValidationPassed",
Message: "DynamoModel spec is valid",
})
}
// Handle finalizer using common handler
finalized, err := commoncontroller.HandleFinalizer(ctx, model, r.Client, r)
if err != nil {
......
......@@ -38,6 +38,7 @@ import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/dynamo"
webhookvalidation "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook/validation"
networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
......@@ -126,6 +127,73 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
logs = logs.WithValues("dynamoComponentDeployment", dynamoComponentDeployment.Name, "namespace", dynamoComponentDeployment.Namespace)
// Setup defer to handle errors and update status
defer func() {
if err == nil {
return
}
reconcileErr := err
logs.Error(reconcileErr, "Failed to reconcile DynamoComponentDeployment.")
r.Recorder.Eventf(dynamoComponentDeployment, corev1.EventTypeWarning, "ReconcileError",
"Failed to reconcile DynamoComponentDeployment: %v", reconcileErr)
if _, statusErr := r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.DynamoGraphDeploymentConditionTypeAvailable,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Failed to reconcile DynamoComponentDeployment: %v", reconcileErr),
},
); statusErr != nil {
logs.Error(statusErr, "Failed to update DynamoComponentDeployment status after reconcile error")
}
}()
// Validate the DynamoComponentDeployment spec (defense in depth - only when webhooks are disabled)
if !r.Config.WebhooksEnabled {
validator := webhookvalidation.NewDynamoComponentDeploymentValidator(dynamoComponentDeployment)
if _, validationErr := validator.Validate(); validationErr != nil {
logs.Error(validationErr, "DynamoComponentDeployment validation failed, refusing to reconcile")
// Set validation error condition
meta.SetStatusCondition(&dynamoComponentDeployment.Status.Conditions, metav1.Condition{
Type: "Valid",
Status: metav1.ConditionFalse,
ObservedGeneration: dynamoComponentDeployment.Generation,
Reason: "ValidationFailed",
Message: fmt.Sprintf("Validation failed: %v", validationErr),
})
// Update status and don't requeue (user must fix the spec)
if statusErr := r.Status().Update(ctx, dynamoComponentDeployment); statusErr != nil {
logs.Error(statusErr, "Failed to update DynamoComponentDeployment status with validation error")
err = statusErr
return ctrl.Result{}, err
}
// Record event for visibility
r.Recorder.Event(dynamoComponentDeployment, corev1.EventTypeWarning, "ValidationFailed", validationErr.Error())
// Don't requeue - user must fix the spec
logs.Info("DynamoComponentDeployment is invalid, not reconciling until spec is fixed")
err = nil
return ctrl.Result{}, nil
}
// Set Valid condition to True and persist it
meta.SetStatusCondition(&dynamoComponentDeployment.Status.Conditions, metav1.Condition{
Type: "Valid",
Status: metav1.ConditionTrue,
ObservedGeneration: dynamoComponentDeployment.Generation,
Reason: "ValidationPassed",
Message: "DynamoComponentDeployment spec is valid",
})
if statusErr := r.Status().Update(ctx, dynamoComponentDeployment); statusErr != nil {
logs.Error(statusErr, "Failed to update DynamoComponentDeployment status with validation success")
err = statusErr
return ctrl.Result{}, err
}
}
deleted, err := commonController.HandleFinalizer(ctx, dynamoComponentDeployment, r.Client, r)
if err != nil {
logs.Error(err, "Failed to handle finalizer")
......@@ -158,25 +226,6 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
}
}
defer func() {
if err == nil {
return
}
logs.Error(err, "Failed to reconcile DynamoComponentDeployment.")
r.Recorder.Eventf(dynamoComponentDeployment, corev1.EventTypeWarning, "ReconcileError", "Failed to reconcile DynamoComponentDeployment: %v", err)
_, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.DynamoGraphDeploymentConditionTypeAvailable,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Failed to reconcile DynamoComponentDeployment: %v", err),
},
)
if err != nil {
return
}
}()
modified := false
// Create the appropriate workload resource based on deployment type
......
......@@ -49,6 +49,7 @@ import (
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/dynamo"
webhookvalidation "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook/validation"
rbacv1 "k8s.io/api/rbac/v1"
)
......@@ -150,6 +151,28 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
logger.Info("Reconciliation done")
}()
// Validate the DynamoGraphDeployment spec (defense in depth - only when webhooks are disabled)
if !r.Config.WebhooksEnabled {
validator := webhookvalidation.NewDynamoGraphDeploymentValidator(dynamoDeployment)
if _, validationErr := validator.Validate(); validationErr != nil {
logger.Error(validationErr, "DynamoGraphDeployment validation failed, refusing to reconcile")
// Set validation error state and reason (defer will update status)
state = FailedState
reason = Reason("ValidationFailed")
message = Message(fmt.Sprintf("Validation failed: %v", validationErr))
// Record event for visibility
r.Recorder.Event(dynamoDeployment, corev1.EventTypeWarning, "ValidationFailed", validationErr.Error())
// Don't requeue - user must fix the spec
logger.Info("DynamoGraphDeployment is invalid, not reconciling until spec is fixed")
// Return without error so defer updates status but doesn't requeue
return ctrl.Result{}, nil
}
}
deleted, err := commonController.HandleFinalizer(ctx, dynamoDeployment, r.Client, r)
if err != nil {
logger.Error(err, "failed to handle the finalizer")
......
......@@ -20,7 +20,6 @@ package controller
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"text/template"
......@@ -48,6 +47,7 @@ import (
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
webhookvalidation "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook/validation"
)
const (
......@@ -798,22 +798,26 @@ func isOnlineProfiling(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) boo
// validateSpec validates the DGDR spec
func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
// Validate profiler image is specified in the new location
if dgdr.Spec.ProfilingConfig.ProfilerImage == "" {
return errors.New("profilingConfig.profilerImage is required")
}
// Basic validation - check that profilingConfig.config is provided
if dgdr.Spec.ProfilingConfig.Config == nil || len(dgdr.Spec.ProfilingConfig.Config.Raw) == 0 {
return errors.New("profilingConfig.config is required and must not be empty")
}
// Use the validator for simple validation (defense in depth - only when webhooks are disabled)
if !r.Config.WebhooksEnabled {
isClusterWide := r.Config.RestrictedNamespace == ""
validator := webhookvalidation.NewDynamoGraphDeploymentRequestValidator(dgdr, isClusterWide)
warnings, err := validator.Validate()
if err != nil {
return err
}
// Validate enableGpuDiscovery is only true for cluster-wide operators
if dgdr.Spec.EnableGpuDiscovery && r.Config.RestrictedNamespace != "" {
return errors.New("enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in profilingConfig.config")
// Log warnings if any
if len(warnings) > 0 {
logger := log.FromContext(ctx)
for _, warning := range warnings {
logger.Info("Validation warning", "warning", warning)
}
}
}
// Validate ConfigMap if provided (for the DGD base config)
// This requires cluster access and cannot be done in the stateless validator
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
cm := &corev1.ConfigMap{}
err := r.Get(ctx, types.NamespacedName{
......@@ -840,28 +844,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
}
}
// Parse config to validate structure
var config map[string]interface{}
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
return fmt.Errorf("failed to parse profilingConfig.config: %w", err)
}
// Warn if deployment.model or engine.backend are specified in config (they will be overwritten by spec fields)
if engineConfig, ok := config["engine"].(map[string]interface{}); ok {
if backend, ok := engineConfig["backend"].(string); ok && backend != "" && backend != dgdr.Spec.Backend {
logger := log.FromContext(ctx)
logger.Info("Warning: profilingConfig.config.engine.backend will be overwritten by spec.backend",
"configBackend", backend, "specBackend", dgdr.Spec.Backend)
}
}
if deployment, ok := config["deployment"].(map[string]interface{}); ok {
if model, ok := deployment["model"].(string); ok && model != "" && model != dgdr.Spec.Model {
logger := log.FromContext(ctx)
logger.Info("Warning: profilingConfig.config.deployment.model will be overwritten by spec.model",
"configModel", model, "specModel", dgdr.Spec.Model)
}
}
// The profiler will validate the rest of the configuration
return nil
}
......
......@@ -79,6 +79,11 @@ type Config struct {
// DiscoveryBackend is the discovery backend to use. By default, will rely on ETCD for discovery. Can be set to "kubernetes" to use Kubernetes API for service discovery.
DiscoveryBackend string
// WebhooksEnabled indicates whether admission webhooks are enabled
// When true, controllers skip validation (webhooks handle it)
// When false, controllers perform validation (defense in depth)
WebhooksEnabled bool
}
// RBACConfig holds configuration for RBAC management
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package webhook
import (
"context"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
var webhookCommonLog = logf.Log.WithName("webhook-common")
// ExcludedNamespacesChecker defines the interface for checking namespace exclusions.
// This matches controller_common.ExcludedNamespacesInterface to allow reuse of the
// lease-based coordination mechanism.
type ExcludedNamespacesChecker interface {
// Contains reports whether the given namespace is currently excluded.
Contains(namespace string) bool
}
// webhookExcludedNamespaces holds the excluded namespaces checker (usually leaseWatcher).
// This is set by main.go through SetExcludedNamespaces and shared across all webhook
// validators. It is nil until set, and it is a plain package-level variable with no
// synchronization of its own.
var webhookExcludedNamespaces ExcludedNamespacesChecker
// SetExcludedNamespaces sets the excluded namespaces checker for all webhooks.
// This should be called from main.go before starting the webhook server.
// Passing nil clears the checker; each call overwrites the previously stored value.
func SetExcludedNamespaces(checker ExcludedNamespacesChecker) {
webhookExcludedNamespaces = checker
}
// GetExcludedNamespaces returns the current excluded namespaces checker.
// The result is nil if SetExcludedNamespaces has not been called or was last called with nil.
func GetExcludedNamespaces() ExcludedNamespacesChecker {
return webhookExcludedNamespaces
}
// LeaseAwareValidator decorates an admission.CustomValidator with namespace
// exclusion: before delegating, it asks its ExcludedNamespacesChecker whether
// the object's namespace is excluded (i.e. a namespace-restricted operator
// holds an active lease for it) and, if so, skips validation entirely.
//
// Because it is a decorator, the wrapped validator needs no knowledge of the
// coordination logic.
type LeaseAwareValidator struct {
	// validator is the wrapped validator that performs the real checks.
	validator admission.CustomValidator
	// excludedNamespaces decides which namespaces bypass validation.
	excludedNamespaces ExcludedNamespacesChecker
}
// NewLeaseAwareValidator decorates validator with namespace exclusion driven by
// excludedNamespaces.
//
// When excludedNamespaces is nil there is nothing to filter on, so validator is
// returned unchanged rather than wrapped.
func NewLeaseAwareValidator(validator admission.CustomValidator, excludedNamespaces ExcludedNamespacesChecker) admission.CustomValidator {
	if excludedNamespaces != nil {
		return &LeaseAwareValidator{
			validator:          validator,
			excludedNamespaces: excludedNamespaces,
		}
	}
	// No checker supplied: behave as a pure pass-through.
	return validator
}
// ValidateCreate implements admission.CustomValidator.
// It delegates to the wrapped validator unless obj lives in an excluded
// namespace, in which case the request is accepted without validation.
func (v *LeaseAwareValidator) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	if !v.shouldSkipValidation(obj) {
		return v.validator.ValidateCreate(ctx, obj)
	}
	return nil, nil
}
// ValidateUpdate implements admission.CustomValidator.
// The exclusion decision is based on newObj's namespace; when the namespace is
// not excluded, both objects are forwarded to the wrapped validator.
func (v *LeaseAwareValidator) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
	if !v.shouldSkipValidation(newObj) {
		return v.validator.ValidateUpdate(ctx, oldObj, newObj)
	}
	return nil, nil
}
// ValidateDelete implements admission.CustomValidator.
// It delegates to the wrapped validator unless obj lives in an excluded
// namespace, in which case the request is accepted without validation.
func (v *LeaseAwareValidator) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	if !v.shouldSkipValidation(obj) {
		return v.validator.ValidateDelete(ctx, obj)
	}
	return nil, nil
}
// shouldSkipValidation reports whether validation must be bypassed for obj
// because its namespace is listed by the excluded namespaces checker.
//
// Objects that do not implement client.Object expose no namespace; for those
// the method returns false so that validation still runs (fail-safe).
func (v *LeaseAwareValidator) shouldSkipValidation(obj runtime.Object) bool {
	target, isClientObject := obj.(client.Object)
	if !isClientObject {
		// Namespace unknown: never skip.
		return false
	}

	ns := target.GetNamespace()
	if !v.excludedNamespaces.Contains(ns) {
		return false
	}

	webhookCommonLog.Info("skipping validation - namespace has namespace-restricted operator",
		"name", target.GetName(),
		"namespace", ns,
		"kind", obj.GetObjectKind().GroupVersionKind().Kind)
	return true
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// DynamoComponentDeploymentValidator holds a DynamoComponentDeployment and
// validates it. It is independent of the admission request plumbing, so both
// webhooks and controllers can use it to apply the same rules.
type DynamoComponentDeploymentValidator struct {
	// deployment is the resource under validation (the new object on updates).
	deployment *nvidiacomv1alpha1.DynamoComponentDeployment
}
// NewDynamoComponentDeploymentValidator returns a validator bound to the given
// deployment. The pointer is stored as-is; no copy is made.
func NewDynamoComponentDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoComponentDeployment) *DynamoComponentDeploymentValidator {
	return &DynamoComponentDeploymentValidator{deployment: deployment}
}
// Validate runs the stateless checks for the DynamoComponentDeployment.
//
// At present this is only the shared-spec validation, rooted at field path
// "spec"; there are no DCD-specific rules yet. The returned warnings are
// always nil; the error is whatever the shared-spec validator reports.
func (v *DynamoComponentDeploymentValidator) Validate() (admission.Warnings, error) {
	shared := NewSharedSpecValidator(&v.deployment.Spec.DynamoComponentDeploymentSharedSpec, "spec")
	err := shared.Validate()
	if err != nil {
		return nil, err
	}

	// DCD-specific validation would be added here.
	return nil, nil
}
// ValidateUpdate runs the stateful checks that compare the validator's
// deployment against its previous version, old.
//
// The only rule is that spec.backendFramework is immutable: if it differs from
// old, a warning and an error are both returned. Otherwise the result is
// (nil, nil).
func (v *DynamoComponentDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoComponentDeployment) (admission.Warnings, error) {
	if v.deployment.Spec.BackendFramework == old.Spec.BackendFramework {
		return nil, nil
	}

	// BackendFramework changed: reject, and surface a warning alongside the error.
	warnings := admission.Warnings{"Changing spec.backendFramework may cause unexpected behavior"}
	return warnings, fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation")
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"context"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
const (
	// DynamoComponentDeploymentWebhookName names the validating webhook handler
	// for DynamoComponentDeployment; it is also used as the logger name.
	DynamoComponentDeploymentWebhookName = "dynamocomponentdeployment-validating-webhook"

	// dynamoComponentDeploymentWebhookPath is the path on the webhook server
	// under which the DynamoComponentDeployment validator is registered.
	dynamoComponentDeploymentWebhookPath = "/validate-nvidia-com-v1alpha1-dynamocomponentdeployment"
)
// DynamoComponentDeploymentHandler is the admission handler that validates
// DynamoComponentDeployment resources.
// It carries no state of its own: each request builds a
// DynamoComponentDeploymentValidator and delegates the actual checks to it.
type DynamoComponentDeploymentHandler struct{}
// NewDynamoComponentDeploymentHandler returns a ready-to-use handler for the
// DynamoComponentDeployment validating webhook.
func NewDynamoComponentDeploymentHandler() *DynamoComponentDeploymentHandler {
	return new(DynamoComponentDeploymentHandler)
}
// ValidateCreate handles a create request for a DynamoComponentDeployment.
//
// obj must be a *DynamoComponentDeployment, otherwise an error is returned.
// The request is logged and then checked with the stateless rules of
// DynamoComponentDeploymentValidator, whose warnings and error are returned.
func (h *DynamoComponentDeploymentHandler) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	logger := log.FromContext(ctx).WithName(DynamoComponentDeploymentWebhookName)

	dcd, castErr := castToDynamoComponentDeployment(obj)
	if castErr != nil {
		return nil, castErr
	}
	logger.Info("validate create", "name", dcd.Name, "namespace", dcd.Namespace)

	return NewDynamoComponentDeploymentValidator(dcd).Validate()
}
// ValidateUpdate handles an update request for a DynamoComponentDeployment.
//
// Both oldObj and newObj must be *DynamoComponentDeployment, otherwise an error
// is returned. Validation is skipped entirely when the new object carries a
// deletion timestamp, so that finalizers can still be removed from a resource
// that is being deleted.
//
// Otherwise the stateless rules run first, followed by the stateful
// (immutability) rules. Warnings from both stages are merged and returned, even
// when the stateful stage rejects the update.
func (h *DynamoComponentDeploymentHandler) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
	logger := log.FromContext(ctx).WithName(DynamoComponentDeploymentWebhookName)

	newDeployment, err := castToDynamoComponentDeployment(newObj)
	if err != nil {
		return nil, err
	}

	logger.Info("validate update", "name", newDeployment.Name, "namespace", newDeployment.Namespace)

	// Skip validation if the resource is being deleted (to allow finalizer removal).
	if !newDeployment.DeletionTimestamp.IsZero() {
		logger.Info("skipping validation for resource being deleted", "name", newDeployment.Name)
		return nil, nil
	}

	oldDeployment, err := castToDynamoComponentDeployment(oldObj)
	if err != nil {
		return nil, err
	}

	validator := NewDynamoComponentDeploymentValidator(newDeployment)

	// Stateless rules.
	warnings, err := validator.Validate()
	if err != nil {
		return warnings, err
	}

	// Stateful rules (immutability). Merge the warnings before looking at the
	// error so that warnings collected by the stateless stage are not dropped
	// when the update is rejected here.
	updateWarnings, err := validator.ValidateUpdate(oldDeployment)
	warnings = append(warnings, updateWarnings...)
	if err != nil {
		return warnings, err
	}

	return warnings, nil
}
// ValidateDelete handles a delete request for a DynamoComponentDeployment.
//
// obj must be a *DynamoComponentDeployment, otherwise an error is returned.
// Deletion has no validation rules, so after logging the request the method
// always allows it.
func (h *DynamoComponentDeploymentHandler) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	logger := log.FromContext(ctx).WithName(DynamoComponentDeploymentWebhookName)

	dcd, castErr := castToDynamoComponentDeployment(obj)
	if castErr != nil {
		return nil, castErr
	}
	logger.Info("validate delete", "name", dcd.Name, "namespace", dcd.Namespace)

	// Nothing to check on deletion.
	return nil, nil
}
// RegisterWithManager exposes the handler on the manager's webhook server at
// dynamoComponentDeploymentWebhookPath.
//
// The handler is first wrapped by NewLeaseAwareValidator using the package-wide
// excluded namespaces checker, which adds namespace exclusion for cluster-wide
// coordination. Panic recovery is enabled on the resulting webhook. The method
// currently always returns nil.
func (h *DynamoComponentDeploymentHandler) RegisterWithManager(mgr manager.Manager) error {
	// Decorate the handler with lease-aware namespace exclusion.
	leaseAware := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())

	hook := admission.WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoComponentDeployment{}, leaseAware)
	hook = hook.WithRecoverPanic(true)

	mgr.GetWebhookServer().Register(dynamoComponentDeploymentWebhookPath, hook)
	return nil
}
// castToDynamoComponentDeployment type-asserts obj to a
// *DynamoComponentDeployment. When obj has any other dynamic type, it returns
// an error that names the type actually received.
func castToDynamoComponentDeployment(obj runtime.Object) (*nvidiacomv1alpha1.DynamoComponentDeployment, error) {
	if deployment, ok := obj.(*nvidiacomv1alpha1.DynamoComponentDeployment); ok {
		return deployment, nil
	}
	return nil, fmt.Errorf("expected DynamoComponentDeployment but got %T", obj)
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment