Unverified commit 872900f1, authored by Julien Mancuso and committed by GitHub
Browse files

feat: add validation webhooks (#4416)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent b2605a8e
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// TestDynamoComponentDeploymentValidator_Validate runs the stateless Validate
// method over a table of DynamoComponentDeployment objects. Each case checks
// whether an error is returned and, for failing cases, its exact message.
func TestDynamoComponentDeploymentValidator_Validate(t *testing.T) {
	okReplicas := int32(3)
	badReplicas := int32(-1)

	// newDeployment wraps a spec in the object metadata shared by every case.
	newDeployment := func(spec nvidiacomv1alpha1.DynamoComponentDeploymentSpec) *nvidiacomv1alpha1.DynamoComponentDeployment {
		return &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-deployment",
				Namespace: "default",
			},
			Spec: spec,
		}
	}
	// fromShared builds a deployment whose spec sets only the embedded shared spec.
	fromShared := func(shared nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) *nvidiacomv1alpha1.DynamoComponentDeployment {
		return newDeployment(nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
			DynamoComponentDeploymentSharedSpec: shared,
		})
	}

	cases := []struct {
		name       string
		deployment *nvidiacomv1alpha1.DynamoComponentDeployment
		wantErr    bool
		errMsg     string
	}{
		{
			name: "valid deployment",
			deployment: newDeployment(nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					Replicas: &okReplicas,
					Autoscaling: &nvidiacomv1alpha1.Autoscaling{
						Enabled:     true,
						MinReplicas: 1,
						MaxReplicas: 10,
					},
				},
				BackendFramework: "sglang",
			}),
			wantErr: false,
		},
		{
			name: "invalid replicas",
			deployment: fromShared(nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Replicas: &badReplicas,
			}),
			wantErr: true,
			errMsg:  "spec.replicas must be non-negative",
		},
		{
			name: "invalid autoscaling",
			deployment: fromShared(nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
					Enabled:     true,
					MinReplicas: 5,
					MaxReplicas: 3,
				},
			}),
			wantErr: true,
			errMsg:  "spec.autoscaling.maxReplicas must be > minReplicas",
		},
		{
			name: "invalid ingress",
			deployment: fromShared(nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Ingress: &nvidiacomv1alpha1.IngressSpec{
					Enabled: true,
					Host:    "",
				},
			}),
			wantErr: true,
			errMsg:  "spec.ingress.host is required when ingress is enabled",
		},
		{
			name: "invalid volume mount",
			deployment: fromShared(nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				VolumeMounts: []nvidiacomv1alpha1.VolumeMount{
					{
						Name:                  "data",
						UseAsCompilationCache: false,
					},
				},
			}),
			wantErr: true,
			errMsg:  "spec.volumeMounts[0].mountPoint is required when useAsCompilationCache is false",
		},
		{
			name: "invalid shared memory",
			deployment: fromShared(nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				SharedMemory: &nvidiacomv1alpha1.SharedMemorySpec{
					Disabled: false,
					Size:     resource.Quantity{},
				},
			}),
			wantErr: true,
			errMsg:  "spec.sharedMemory.size is required when disabled is false",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			_, err := NewDynamoComponentDeploymentValidator(tc.deployment).Validate()
			if gotErr := err != nil; gotErr != tc.wantErr {
				t.Errorf("DynamoComponentDeploymentValidator.Validate() error = %v, wantErr %v", err, tc.wantErr)
				return
			}
			// Nothing further to check when no error was expected.
			if !tc.wantErr {
				return
			}
			if got := err.Error(); got != tc.errMsg {
				t.Errorf("DynamoComponentDeploymentValidator.Validate() error message = %v, want %v", got, tc.errMsg)
			}
		})
	}
}
// TestDynamoComponentDeploymentValidator_ValidateUpdate checks the stateful
// update validation: an unchanged or replica-only change passes, while a
// backendFramework change yields both an error and a warning.
func TestDynamoComponentDeploymentValidator_ValidateUpdate(t *testing.T) {
	// withSpec builds a deployment that carries only the given spec.
	withSpec := func(spec nvidiacomv1alpha1.DynamoComponentDeploymentSpec) *nvidiacomv1alpha1.DynamoComponentDeployment {
		return &nvidiacomv1alpha1.DynamoComponentDeployment{Spec: spec}
	}
	// int32Ptr returns a pointer to a fresh copy of n.
	int32Ptr := func(n int32) *int32 { return &n }

	cases := []struct {
		name            string
		oldDeployment   *nvidiacomv1alpha1.DynamoComponentDeployment
		newDeployment   *nvidiacomv1alpha1.DynamoComponentDeployment
		wantErr         bool
		wantWarnings    bool
		errMsg          string
		expectedWarnMsg string
	}{
		{
			name: "no changes",
			oldDeployment: withSpec(nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				BackendFramework: "sglang",
			}),
			newDeployment: withSpec(nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				BackendFramework: "sglang",
			}),
			wantErr: false,
		},
		{
			name: "changing backend framework",
			oldDeployment: withSpec(nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				BackendFramework: "sglang",
			}),
			newDeployment: withSpec(nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				BackendFramework: "vllm",
			}),
			wantErr:         true,
			wantWarnings:    true,
			errMsg:          "spec.backendFramework is immutable and cannot be changed after creation",
			expectedWarnMsg: "Changing spec.backendFramework may cause unexpected behavior",
		},
		{
			name: "changing replicas is allowed",
			oldDeployment: withSpec(nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					Replicas: int32Ptr(1),
				},
				BackendFramework: "sglang",
			}),
			newDeployment: withSpec(nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					Replicas: int32Ptr(3),
				},
				BackendFramework: "sglang",
			}),
			wantErr: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			warnings, err := NewDynamoComponentDeploymentValidator(tc.newDeployment).ValidateUpdate(tc.oldDeployment)
			if gotErr := err != nil; gotErr != tc.wantErr {
				t.Errorf("DynamoComponentDeploymentValidator.ValidateUpdate() error = %v, wantErr %v", err, tc.wantErr)
				return
			}
			if tc.wantErr {
				if got := err.Error(); got != tc.errMsg {
					t.Errorf("DynamoComponentDeploymentValidator.ValidateUpdate() error message = %v, want %v", got, tc.errMsg)
				}
			}
			if !tc.wantWarnings {
				return
			}
			// Only the first warning is compared against the expectation.
			if len(warnings) == 0 {
				t.Errorf("DynamoComponentDeploymentValidator.ValidateUpdate() expected warnings but got none")
			} else if warnings[0] != tc.expectedWarnMsg {
				t.Errorf("DynamoComponentDeploymentValidator.ValidateUpdate() warning = %v, want %v", warnings[0], tc.expectedWarnMsg)
			}
		})
	}
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"errors"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// DynamoGraphDeploymentValidator validates DynamoGraphDeployment resources.
// This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentValidator struct {
	// deployment is the resource under validation. Validate reads its spec,
	// and ValidateUpdate treats it as the new state compared against the old one.
	deployment *nvidiacomv1alpha1.DynamoGraphDeployment
}
// NewDynamoGraphDeploymentValidator returns a validator bound to the given
// DynamoGraphDeployment. The pointer is stored as-is; the object is not copied.
func NewDynamoGraphDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoGraphDeployment) *DynamoGraphDeploymentValidator {
	validator := new(DynamoGraphDeploymentValidator)
	validator.deployment = deployment
	return validator
}
// Validate performs stateless validation on the DynamoGraphDeployment.
//
// Checks run in this order, and the first failing check determines the result:
//  1. spec.services must contain at least one entry;
//  2. every entry in spec.pvcs must pass validatePVC;
//  3. every entry in spec.services must pass validateService.
//
// When several services are invalid, the error for the service with the
// lexicographically smallest name is returned, so the same spec always
// produces the same message.
//
// Returns warnings and error. No warnings are currently produced.
func (v *DynamoGraphDeploymentValidator) Validate() (admission.Warnings, error) {
	services := v.deployment.Spec.Services

	// Validate that at least one service is specified
	if len(services) == 0 {
		return nil, fmt.Errorf("spec.services must have at least one service")
	}

	// Validate PVCs
	if err := v.validatePVCs(); err != nil {
		return nil, err
	}

	// Validate each service. Go randomizes map iteration order, so returning on
	// the first failure encountered would make the reported error vary between
	// runs. Instead, check every service and keep the failure with the smallest
	// service name.
	var (
		failedName string
		failedErr  error
	)
	for serviceName, service := range services {
		err := v.validateService(serviceName, service)
		if err == nil {
			continue
		}
		if failedErr == nil || serviceName < failedName {
			failedName, failedErr = serviceName, err
		}
	}
	if failedErr != nil {
		return nil, failedErr
	}

	return nil, nil
}
// ValidateUpdate performs stateful validation by comparing the validator's
// DynamoGraphDeployment (the new state) with old (the previous state).
//
// The only rule enforced is that spec.backendFramework is immutable: when it
// differs between old and new, a single warning and an error are returned
// together. Otherwise both results are nil.
func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment) (admission.Warnings, error) {
	if old.Spec.BackendFramework == v.deployment.Spec.BackendFramework {
		return nil, nil
	}

	// BackendFramework changed: reject the update and attach the warning.
	return admission.Warnings{"Changing spec.backendFramework may cause unexpected behavior"},
		fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation")
}
// validateService validates one entry of spec.services by delegating to
// SharedSpecValidator. The service name is embedded in the field path
// ("spec.services[<name>]") that is handed to the shared validator.
func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) error {
	return NewSharedSpecValidator(service, fmt.Sprintf("spec.services[%s]", serviceName)).Validate()
}
// validatePVCs checks every entry of spec.pvcs in slice order and returns the
// error of the first entry that fails, or nil when all entries pass.
func (v *DynamoGraphDeploymentValidator) validatePVCs() error {
	pvcs := v.deployment.Spec.PVCs
	for index := range pvcs {
		// Point at the slice element directly instead of a per-iteration copy.
		err := v.validatePVC(index, &pvcs[index])
		if err != nil {
			return err
		}
	}
	return nil
}
// validatePVC validates the PVC at the given index of spec.pvcs.
//
// The name must always be set and non-empty. When create is set to true,
// storageClass, size and volumeAccessMode are required as well. All violations
// found are accumulated with errors.Join, so the returned error may carry
// several messages; nil is returned when the PVC is valid.
func (v *DynamoGraphDeploymentValidator) validatePVC(index int, pvc *nvidiacomv1alpha1.PVC) error {
	var problems error

	if name := pvc.Name; name == nil || *name == "" {
		problems = errors.Join(problems, fmt.Errorf("spec.pvcs[%d].name is required", index))
	}

	// The remaining fields only matter when the PVC is to be created.
	wantsCreate := pvc.Create != nil && *pvc.Create
	if !wantsCreate {
		return problems
	}

	if pvc.StorageClass == "" {
		problems = errors.Join(problems, fmt.Errorf("spec.pvcs[%d].storageClass is required when create is true", index))
	}
	if pvc.Size.IsZero() {
		problems = errors.Join(problems, fmt.Errorf("spec.pvcs[%d].size is required when create is true", index))
	}
	if pvc.VolumeAccessMode == "" {
		problems = errors.Join(problems, fmt.Errorf("spec.pvcs[%d].volumeAccessMode is required when create is true", index))
	}
	return problems
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"context"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
const (
	// DynamoGraphDeploymentWebhookName is the name of the validating webhook handler for DynamoGraphDeployment.
	// The handler's Validate* methods also use it as their logger name.
	DynamoGraphDeploymentWebhookName = "dynamographdeployment-validating-webhook"
	// dynamoGraphDeploymentWebhookPath is the path under which RegisterWithManager
	// registers the webhook on the manager's webhook server.
	dynamoGraphDeploymentWebhookPath = "/validate-nvidia-com-v1alpha1-dynamographdeployment"
)
// DynamoGraphDeploymentHandler is a handler for validating DynamoGraphDeployment resources.
// It is a thin wrapper around DynamoGraphDeploymentValidator.
// The struct has no fields; a validator is created for each request.
type DynamoGraphDeploymentHandler struct{}
// NewDynamoGraphDeploymentHandler returns a new, empty handler for the
// DynamoGraphDeployment validating webhook.
func NewDynamoGraphDeploymentHandler() *DynamoGraphDeploymentHandler {
	return new(DynamoGraphDeploymentHandler)
}
// ValidateCreate validates a DynamoGraphDeployment create request.
// It returns an error if obj is not a *DynamoGraphDeployment; otherwise it
// logs the request and returns the result of the stateless validation.
func (h *DynamoGraphDeploymentHandler) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	deployment, err := castToDynamoGraphDeployment(obj)
	if err != nil {
		return nil, err
	}

	log.FromContext(ctx).
		WithName(DynamoGraphDeploymentWebhookName).
		Info("validate create", "name", deployment.Name, "namespace", deployment.Namespace)

	return NewDynamoGraphDeploymentValidator(deployment).Validate()
}
// ValidateUpdate validates a DynamoGraphDeployment update request.
//
// Validation is skipped entirely (nil, nil) when the new object has a non-zero
// deletion timestamp, so that finalizers can still be removed from a resource
// that is being deleted. Otherwise the stateless rules run first, followed by
// the stateful (immutability) rules against the old object.
//
// It returns an error if either object is not a *DynamoGraphDeployment or if
// any rule fails. Warnings from both passes are returned together, including
// when the stateful pass fails.
func (h *DynamoGraphDeploymentHandler) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
	logger := log.FromContext(ctx).WithName(DynamoGraphDeploymentWebhookName)

	newDeployment, err := castToDynamoGraphDeployment(newObj)
	if err != nil {
		return nil, err
	}

	logger.Info("validate update", "name", newDeployment.Name, "namespace", newDeployment.Namespace)

	// Skip validation if the resource is being deleted (to allow finalizer removal)
	if !newDeployment.DeletionTimestamp.IsZero() {
		logger.Info("skipping validation for resource being deleted", "name", newDeployment.Name)
		return nil, nil
	}

	oldDeployment, err := castToDynamoGraphDeployment(oldObj)
	if err != nil {
		return nil, err
	}

	validator := NewDynamoGraphDeploymentValidator(newDeployment)

	// Validate stateless rules
	warnings, err := validator.Validate()
	if err != nil {
		return warnings, err
	}

	// Validate stateful rules (immutability). Combine the warnings before the
	// error check so that warnings from the stateless pass are not lost when
	// the update is rejected.
	updateWarnings, err := validator.ValidateUpdate(oldDeployment)
	warnings = append(warnings, updateWarnings...)
	if err != nil {
		return warnings, err
	}

	return warnings, nil
}
// ValidateDelete validates a DynamoGraphDeployment delete request.
// Apart from checking that obj is a *DynamoGraphDeployment and logging the
// request, deletion is always allowed.
func (h *DynamoGraphDeploymentHandler) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	deployment, err := castToDynamoGraphDeployment(obj)
	if err != nil {
		return nil, err
	}

	log.FromContext(ctx).
		WithName(DynamoGraphDeploymentWebhookName).
		Info("validate delete", "name", deployment.Name, "namespace", deployment.Namespace)

	// Deletion needs no further checks.
	return nil, nil
}
// RegisterWithManager registers the webhook on the manager's webhook server
// under dynamoGraphDeploymentWebhookPath.
//
// The handler is first wrapped with LeaseAwareValidator, configured with the
// excluded namespaces, and the resulting webhook has panic recovery enabled.
// The method currently always returns nil.
func (h *DynamoGraphDeploymentHandler) RegisterWithManager(mgr manager.Manager) error {
	leaseAware := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())

	hook := admission.WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeployment{}, leaseAware)
	hook = hook.WithRecoverPanic(true)

	mgr.GetWebhookServer().Register(dynamoGraphDeploymentWebhookPath, hook)
	return nil
}
// castToDynamoGraphDeployment type-asserts obj to *DynamoGraphDeployment.
// It returns an error naming the actual dynamic type when the assertion fails.
func castToDynamoGraphDeployment(obj runtime.Object) (*nvidiacomv1alpha1.DynamoGraphDeployment, error) {
	if deployment, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeployment); ok {
		return deployment, nil
	}
	return nil, fmt.Errorf("expected DynamoGraphDeployment but got %T", obj)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"strings"
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// TestDynamoGraphDeploymentValidator_Validate runs the stateless Validate
// method over a table of DynamoGraphDeployment objects covering the services
// check, per-service shared-spec checks and PVC checks. Failing cases compare
// the error message exactly, except cases with errContains set, where every
// newline-separated part of errMsg must appear in the error text.
func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
	var (
		okReplicas  = int32(3)
		badReplicas = int32(-1)
		pvcName     = "test-pvc"
		yes         = true
		no          = false
	)

	// newGraph wraps a spec in the object metadata shared by every case.
	newGraph := func(spec nvidiacomv1alpha1.DynamoGraphDeploymentSpec) *nvidiacomv1alpha1.DynamoGraphDeployment {
		return &nvidiacomv1alpha1.DynamoGraphDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-graph",
				Namespace: "default",
			},
			Spec: spec,
		}
	}
	// oneService builds a graph whose only service is svc, registered under name.
	oneService := func(name string, svc *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) *nvidiacomv1alpha1.DynamoGraphDeployment {
		return newGraph(nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
			Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				name: svc,
			},
		})
	}
	// onePVC builds a graph with the given PVC and a single empty "main" service.
	onePVC := func(pvc nvidiacomv1alpha1.PVC) *nvidiacomv1alpha1.DynamoGraphDeployment {
		return newGraph(nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
			PVCs: []nvidiacomv1alpha1.PVC{pvc},
			Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				"main": {},
			},
		})
	}

	cases := []struct {
		name        string
		deployment  *nvidiacomv1alpha1.DynamoGraphDeployment
		wantErr     bool
		errMsg      string
		errContains bool
	}{
		{
			name: "valid deployment with services",
			deployment: newGraph(nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
				BackendFramework: "sglang",
				Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					"main": {
						Replicas: &okReplicas,
					},
				},
			}),
			wantErr: false,
		},
		{
			name: "no services",
			deployment: newGraph(nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
				Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{},
			}),
			wantErr: true,
			errMsg:  "spec.services must have at least one service",
		},
		{
			name: "service with invalid replicas",
			deployment: oneService("main", &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Replicas: &badReplicas,
			}),
			wantErr: true,
			errMsg:  "spec.services[main].replicas must be non-negative",
		},
		{
			name: "service with invalid autoscaling",
			deployment: oneService("prefill", &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
					Enabled:     true,
					MinReplicas: 10,
					MaxReplicas: 5,
				},
			}),
			wantErr: true,
			errMsg:  "spec.services[prefill].autoscaling.maxReplicas must be > minReplicas",
		},
		{
			name: "service with invalid ingress",
			deployment: oneService("gateway", &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Ingress: &nvidiacomv1alpha1.IngressSpec{
					Enabled: true,
					Host:    "",
				},
			}),
			wantErr: true,
			errMsg:  "spec.services[gateway].ingress.host is required when ingress is enabled",
		},
		{
			name: "pvc with create=true and missing storageClass",
			deployment: onePVC(nvidiacomv1alpha1.PVC{
				Create:           &yes,
				Name:             &pvcName,
				StorageClass:     "",
				Size:             resource.MustParse("10Gi"),
				VolumeAccessMode: corev1.ReadWriteOnce,
			}),
			wantErr: true,
			errMsg:  "spec.pvcs[0].storageClass is required when create is true",
		},
		{
			name: "pvc with create=true and missing size",
			deployment: onePVC(nvidiacomv1alpha1.PVC{
				Create:           &yes,
				Name:             &pvcName,
				StorageClass:     "standard",
				Size:             resource.Quantity{},
				VolumeAccessMode: corev1.ReadWriteOnce,
			}),
			wantErr: true,
			errMsg:  "spec.pvcs[0].size is required when create is true",
		},
		{
			name: "pvc with create=true and missing volumeAccessMode",
			deployment: onePVC(nvidiacomv1alpha1.PVC{
				Create:           &yes,
				Name:             &pvcName,
				StorageClass:     "standard",
				Size:             resource.MustParse("10Gi"),
				VolumeAccessMode: "",
			}),
			wantErr: true,
			errMsg:  "spec.pvcs[0].volumeAccessMode is required when create is true",
		},
		{
			name: "pvc with create=false and missing fields",
			deployment: onePVC(nvidiacomv1alpha1.PVC{
				Create: &no,
				Name:   &pvcName,
			}),
			wantErr: false,
		},
		{
			name: "pvc with missing name",
			deployment: onePVC(nvidiacomv1alpha1.PVC{
				Create: &no,
			}),
			wantErr: true,
			errMsg:  "spec.pvcs[0].name is required",
		},
		{
			name: "pvc with multiple errors (name, storageClass, size, volumeAccessMode all missing)",
			deployment: onePVC(nvidiacomv1alpha1.PVC{
				Create: &yes,
			}),
			wantErr:     true,
			errMsg:      "spec.pvcs[0].name is required\nspec.pvcs[0].storageClass is required when create is true\nspec.pvcs[0].size is required when create is true\nspec.pvcs[0].volumeAccessMode is required when create is true",
			errContains: true,
		},
		{
			name: "valid pvc with create=true",
			deployment: onePVC(nvidiacomv1alpha1.PVC{
				Create:           &yes,
				Name:             &pvcName,
				StorageClass:     "standard",
				Size:             resource.MustParse("10Gi"),
				VolumeAccessMode: corev1.ReadWriteOnce,
			}),
			wantErr: false,
		},
		{
			name: "service with invalid volume mount",
			deployment: oneService("main", &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				VolumeMounts: []nvidiacomv1alpha1.VolumeMount{
					{
						Name:                  "data",
						UseAsCompilationCache: false,
					},
				},
			}),
			wantErr: true,
			errMsg:  "spec.services[main].volumeMounts[0].mountPoint is required when useAsCompilationCache is false",
		},
		{
			name: "service with invalid shared memory",
			deployment: oneService("main", &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				SharedMemory: &nvidiacomv1alpha1.SharedMemorySpec{
					Disabled: false,
					Size:     resource.Quantity{},
				},
			}),
			wantErr: true,
			errMsg:  "spec.services[main].sharedMemory.size is required when disabled is false",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			_, err := NewDynamoGraphDeploymentValidator(tc.deployment).Validate()
			if gotErr := err != nil; gotErr != tc.wantErr {
				t.Errorf("DynamoGraphDeploymentValidator.Validate() error = %v, wantErr %v", err, tc.wantErr)
				return
			}
			if !tc.wantErr {
				return
			}

			got := err.Error()
			if !tc.errContains {
				if got != tc.errMsg {
					t.Errorf("DynamoGraphDeploymentValidator.Validate() error message = %v, want %v", got, tc.errMsg)
				}
				return
			}

			// For multiple errors, every expected message must be present.
			for _, want := range strings.Split(tc.errMsg, "\n") {
				if !strings.Contains(got, want) {
					t.Errorf("DynamoGraphDeploymentValidator.Validate() error message = %v, want to contain %v", got, want)
				}
			}
		})
	}
}
// TestDynamoGraphDeploymentValidator_ValidateUpdate checks the stateful update
// validation: an unchanged spec or an added service passes, while a
// backendFramework change yields both an error and a warning.
func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) {
	// withSpec builds a graph deployment that carries only the given spec.
	withSpec := func(spec nvidiacomv1alpha1.DynamoGraphDeploymentSpec) *nvidiacomv1alpha1.DynamoGraphDeployment {
		return &nvidiacomv1alpha1.DynamoGraphDeployment{Spec: spec}
	}

	cases := []struct {
		name            string
		oldDeployment   *nvidiacomv1alpha1.DynamoGraphDeployment
		newDeployment   *nvidiacomv1alpha1.DynamoGraphDeployment
		wantErr         bool
		wantWarnings    bool
		errMsg          string
		expectedWarnMsg string
	}{
		{
			name: "no changes",
			oldDeployment: withSpec(nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
				BackendFramework: "sglang",
			}),
			newDeployment: withSpec(nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
				BackendFramework: "sglang",
			}),
			wantErr: false,
		},
		{
			name: "changing backend framework",
			oldDeployment: withSpec(nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
				BackendFramework: "sglang",
			}),
			newDeployment: withSpec(nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
				BackendFramework: "vllm",
			}),
			wantErr:         true,
			wantWarnings:    true,
			errMsg:          "spec.backendFramework is immutable and cannot be changed after creation",
			expectedWarnMsg: "Changing spec.backendFramework may cause unexpected behavior",
		},
		{
			name: "adding new service is allowed",
			oldDeployment: withSpec(nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
				BackendFramework: "sglang",
				Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					"main": {},
				},
			}),
			newDeployment: withSpec(nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
				BackendFramework: "sglang",
				Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					"main":    {},
					"prefill": {},
				},
			}),
			wantErr: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			warnings, err := NewDynamoGraphDeploymentValidator(tc.newDeployment).ValidateUpdate(tc.oldDeployment)
			if gotErr := err != nil; gotErr != tc.wantErr {
				t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() error = %v, wantErr %v", err, tc.wantErr)
				return
			}
			if tc.wantErr {
				if got := err.Error(); got != tc.errMsg {
					t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() error message = %v, want %v", got, tc.errMsg)
				}
			}
			if !tc.wantWarnings {
				return
			}
			// Only the first warning is compared against the expectation.
			if len(warnings) == 0 {
				t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() expected warnings but got none")
			} else if warnings[0] != tc.expectedWarnMsg {
				t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() warning = %v, want %v", warnings[0], tc.expectedWarnMsg)
			}
		})
	}
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"errors"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/util/yaml"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
// This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentRequestValidator struct {
	// request is the resource under validation.
	request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
	// isClusterWideOperator records whether the operator runs cluster-wide.
	// Validate rejects spec.enableGpuDiscovery=true when this is false.
	isClusterWideOperator bool
}
// NewDynamoGraphDeploymentRequestValidator returns a validator bound to the
// given DynamoGraphDeploymentRequest. isClusterWide tells the validator whether
// the operator runs cluster-wide (true) or namespace-restricted (false).
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool) *DynamoGraphDeploymentRequestValidator {
	validator := new(DynamoGraphDeploymentRequestValidator)
	validator.request = request
	validator.isClusterWideOperator = isClusterWide
	return validator
}
// Validate performs stateless validation on the DynamoGraphDeploymentRequest.
//
// Errors (accumulated with errors.Join, so several may be reported at once):
//   - spec.profilingConfig.profilerImage is empty;
//   - spec.profilingConfig.config is missing or empty;
//   - spec.enableGpuDiscovery is true but the operator is not cluster-wide;
//   - spec.profilingConfig.config cannot be parsed.
//
// Warnings are added when the parsed config sets engine.backend or
// deployment.model to a non-empty value that differs from spec.backend or
// spec.model respectively.
//
// Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) {
	var (
		warnings admission.Warnings
		problems error
	)
	spec := v.request.Spec
	profiling := spec.ProfilingConfig

	// Work out once whether any raw config bytes were supplied.
	var rawConfig []byte
	if profiling.Config != nil {
		rawConfig = profiling.Config.Raw
	}
	hasConfig := len(rawConfig) > 0

	if profiling.ProfilerImage == "" {
		problems = errors.Join(problems, errors.New("spec.profilingConfig.profilerImage is required"))
	}
	if !hasConfig {
		problems = errors.Join(problems, errors.New("spec.profilingConfig.config is required and must not be empty"))
	}
	if spec.EnableGpuDiscovery && !v.isClusterWideOperator {
		problems = errors.Join(problems, errors.New("spec.enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in spec.profilingConfig.config"))
	}

	// The structural checks below need config content to parse.
	if !hasConfig {
		return warnings, problems
	}

	var config map[string]interface{}
	if parseErr := yaml.Unmarshal(rawConfig, &config); parseErr != nil {
		return warnings, errors.Join(problems, fmt.Errorf("failed to parse spec.profilingConfig.config: %w", parseErr))
	}

	// Warn about config values that the spec fields will overwrite. A failed
	// type assertion leaves a nil map, and indexing a nil map yields nil, so
	// the inner assertions simply report !ok in that case.
	engineSection, _ := config["engine"].(map[string]interface{})
	if backend, ok := engineSection["backend"].(string); ok && backend != "" && backend != spec.Backend {
		warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)", backend, spec.Backend))
	}
	deploymentSection, _ := config["deployment"].(map[string]interface{})
	if model, ok := deploymentSection["model"].(string); ok && model != "" && model != spec.Model {
		warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)", model, spec.Model))
	}

	return warnings, problems
}
// ValidateUpdate is the stateful counterpart of Validate: it receives the
// previous DynamoGraphDeploymentRequest so the new one can be compared with it.
// It currently performs no checks and always returns no warnings and no error.
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
	// TODO: Add update validation logic for DynamoGraphDeploymentRequest
	// Placeholder for future immutability checks
	var warnings admission.Warnings
	return warnings, nil
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"context"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
const (
// DynamoGraphDeploymentRequestWebhookName is the name of the validating webhook handler for DynamoGraphDeploymentRequest.
// The handler methods also use it as the name of their logger.
DynamoGraphDeploymentRequestWebhookName = "dynamographdeploymentrequest-validating-webhook"
// dynamoGraphDeploymentRequestWebhookPath is the path under which RegisterWithManager
// registers the webhook on the manager's webhook server.
dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1alpha1-dynamographdeploymentrequest"
)
// DynamoGraphDeploymentRequestHandler is a handler for validating DynamoGraphDeploymentRequest resources.
// It is a thin wrapper around DynamoGraphDeploymentRequestValidator.
type DynamoGraphDeploymentRequestHandler struct {
// isClusterWideOperator is passed on to every DynamoGraphDeploymentRequestValidator
// the handler creates.
isClusterWideOperator bool
}
// NewDynamoGraphDeploymentRequestHandler builds the webhook handler for DynamoGraphDeploymentRequest.
// isClusterWide tells the handler whether the operator runs cluster-wide (true)
// or in namespace-restricted mode (false).
func NewDynamoGraphDeploymentRequestHandler(isClusterWide bool) *DynamoGraphDeploymentRequestHandler {
	handler := new(DynamoGraphDeploymentRequestHandler)
	handler.isClusterWideOperator = isClusterWide
	return handler
}
// ValidateCreate handles create requests for DynamoGraphDeploymentRequest.
// It returns an error when obj is not a DynamoGraphDeploymentRequest; otherwise
// it returns whatever the stateless validator reports.
func (h *DynamoGraphDeploymentRequestHandler) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	request, castErr := castToDynamoGraphDeploymentRequest(obj)
	if castErr != nil {
		return nil, castErr
	}

	log.FromContext(ctx).
		WithName(DynamoGraphDeploymentRequestWebhookName).
		Info("validate create", "name", request.Name, "namespace", request.Namespace)

	// Delegate the actual checks to the validator.
	return NewDynamoGraphDeploymentRequestValidator(request, h.isClusterWideOperator).Validate()
}
// ValidateUpdate validates a DynamoGraphDeploymentRequest update request.
//
// The new object is checked against the stateless rules first and is then
// compared with the old object. An update to an object that already has a
// deletion timestamp is accepted without validation so that finalizers can
// still be removed.
//
// Warnings from the stateless pass are always returned together with the
// warnings from the stateful pass, including when the stateful pass rejects
// the update. An error is returned when either object is not a
// DynamoGraphDeploymentRequest or when one of the passes fails.
func (h *DynamoGraphDeploymentRequestHandler) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
	logger := log.FromContext(ctx).WithName(DynamoGraphDeploymentRequestWebhookName)

	newRequest, err := castToDynamoGraphDeploymentRequest(newObj)
	if err != nil {
		return nil, err
	}

	logger.Info("validate update", "name", newRequest.Name, "namespace", newRequest.Namespace)

	// Skip validation if the resource is being deleted (to allow finalizer removal)
	if !newRequest.DeletionTimestamp.IsZero() {
		logger.Info("skipping validation for resource being deleted", "name", newRequest.Name)
		return nil, nil
	}

	oldRequest, err := castToDynamoGraphDeploymentRequest(oldObj)
	if err != nil {
		return nil, err
	}

	validator := NewDynamoGraphDeploymentRequestValidator(newRequest, h.isClusterWideOperator)

	// Stateless rules: a failure here short-circuits the stateful comparison.
	warnings, err := validator.Validate()
	if err != nil {
		return warnings, err
	}

	// Stateful rules (immutability). Merge the warnings before looking at the
	// error so the stateless warnings are not lost when the update is rejected.
	updateWarnings, err := validator.ValidateUpdate(oldRequest)
	warnings = append(warnings, updateWarnings...)
	return warnings, err
}
// ValidateDelete handles delete requests for DynamoGraphDeploymentRequest.
// Deletion is never rejected by validation rules; the only error returned is
// the one raised when obj is not a DynamoGraphDeploymentRequest.
func (h *DynamoGraphDeploymentRequestHandler) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	request, castErr := castToDynamoGraphDeploymentRequest(obj)
	if castErr != nil {
		return nil, castErr
	}

	log.FromContext(ctx).
		WithName(DynamoGraphDeploymentRequestWebhookName).
		Info("validate delete", "name", request.Name, "namespace", request.Namespace)

	// Nothing to validate on deletion.
	return nil, nil
}
// RegisterWithManager installs the validating webhook on the manager's webhook
// server under dynamoGraphDeploymentRequestWebhookPath. The handler is wrapped
// in a LeaseAwareValidator built from the excluded namespaces before it is
// registered. The method always returns nil.
func (h *DynamoGraphDeploymentRequestHandler) RegisterWithManager(mgr manager.Manager) error {
	// Lease-aware wrapper around the handler (cluster-wide coordination).
	leaseAware := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())

	hook := admission.WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}, leaseAware)
	hook = hook.WithRecoverPanic(true)

	mgr.GetWebhookServer().Register(dynamoGraphDeploymentRequestWebhookPath, hook)
	return nil
}
// castToDynamoGraphDeploymentRequest type-asserts obj to *DynamoGraphDeploymentRequest.
// When obj has any other dynamic type, an error naming that type is returned.
func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1alpha1.DynamoGraphDeploymentRequest, error) {
	if request, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeploymentRequest); ok {
		return request, nil
	}
	return nil, fmt.Errorf("expected DynamoGraphDeploymentRequest but got %T", obj)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"strings"
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}`
configWithDifferentBackend := `{"engine": {"backend": "sglang"}}`
configWithDifferentModel := `{"deployment": {"model": "different-model"}}`
invalidYAML := `{invalid yaml`
tests := []struct {
name string
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
isClusterWide bool
wantErr bool
errMsg string
wantWarnings bool
expectedWarning string
errContains bool
}{
{
name: "valid request",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: true,
wantErr: false,
},
{
name: "missing profiler image",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.profilerImage is required",
},
{
name: "missing profiling config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: nil,
},
},
},
isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.config is required and must not be empty",
},
{
name: "empty profiling config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte{},
},
},
},
},
isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.config is required and must not be empty",
},
{
name: "enableGpuDiscovery true for cluster-wide operator",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: true,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: true,
wantErr: false,
},
{
name: "enableGpuDiscovery true for namespace-restricted operator",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: true,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: false,
wantErr: true,
errMsg: "spec.enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in spec.profilingConfig.config",
},
{
name: "enableGpuDiscovery false for namespace-restricted operator",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: false,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: false,
wantErr: false,
},
{
name: "invalid config YAML",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(invalidYAML),
},
},
},
},
isClusterWide: true,
wantErr: true,
errMsg: "failed to parse spec.profilingConfig.config: error converting YAML to JSON: yaml: line 1: did not find expected ',' or '}'",
},
{
name: "warning for different backend in config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(configWithDifferentBackend),
},
},
},
},
isClusterWide: true,
wantErr: false,
wantWarnings: true,
expectedWarning: "spec.profilingConfig.config.engine.backend (sglang) will be overwritten by spec.backend (vllm)",
},
{
name: "warning for different model in config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(configWithDifferentModel),
},
},
},
},
isClusterWide: true,
wantErr: false,
wantWarnings: true,
expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)",
},
{
name: "multiple errors (missing profiler image, missing config, and enableGpuDiscovery for namespace-restricted)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: true,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "",
Config: nil,
},
},
},
isClusterWide: false,
wantErr: true,
errMsg: "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty\nspec.enableGpuDiscovery can only be set to true for cluster-wide operators",
errContains: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentRequestValidator(tt.request, tt.isClusterWide)
warnings, err := validator.Validate()
if (err != nil) != tt.wantErr {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error = %v, wantErr %v", err, tt.wantErr)
return
}
if tt.wantErr {
if tt.errContains {
// For multiple errors, check that all expected error messages are present
errStr := err.Error()
for _, expectedMsg := range strings.Split(tt.errMsg, "\n") {
if !strings.Contains(errStr, expectedMsg) {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error message = %v, want to contain %v", errStr, expectedMsg)
}
}
} else {
if err.Error() != tt.errMsg {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error message = %v, want %v", err.Error(), tt.errMsg)
}
}
}
if tt.wantWarnings && len(warnings) == 0 {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() expected warnings but got none")
}
if tt.wantWarnings && len(warnings) > 0 && warnings[0] != tt.expectedWarning {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() warning = %v, want %v", warnings[0], tt.expectedWarning)
}
})
}
}
// TestDynamoGraphDeploymentRequestValidator_ValidateUpdate checks the stateful
// validation entry point. Both cases expect the update to be accepted.
func TestDynamoGraphDeploymentRequestValidator_ValidateUpdate(t *testing.T) {
	const validConfig = `{"engine": {"backend": "vllm"}}`

	// requestFor builds a request whose only varying field is the model name.
	requestFor := func(model string) *nvidiacomv1alpha1.DynamoGraphDeploymentRequest {
		return &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
			Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
				Model:   model,
				Backend: "vllm",
				ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
					ProfilerImage: "profiler:latest",
					Config: &apiextensionsv1.JSON{
						Raw: []byte(validConfig),
					},
				},
			},
		}
	}

	tests := []struct {
		name         string
		oldRequest   *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
		newRequest   *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
		wantErr      bool
		wantWarnings bool
	}{
		{
			name:       "no changes",
			oldRequest: requestFor("llama-3-8b"),
			newRequest: requestFor("llama-3-8b"),
			wantErr:    false,
		},
		{
			name:       "changing model name is allowed",
			oldRequest: requestFor("llama-3-8b"),
			newRequest: requestFor("llama-3-70b"),
			wantErr:    false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			warnings, err := NewDynamoGraphDeploymentRequestValidator(tt.newRequest, true).ValidateUpdate(tt.oldRequest)

			if gotErr := err != nil; gotErr != tt.wantErr {
				t.Errorf("DynamoGraphDeploymentRequestValidator.ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if len(warnings) == 0 && tt.wantWarnings {
				t.Errorf("DynamoGraphDeploymentRequestValidator.ValidateUpdate() expected warnings but got none")
			}
		})
	}
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"fmt"
"strings"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// DynamoModelValidator validates DynamoModel resources.
// This validator can be used by both webhooks and controllers for consistent validation.
type DynamoModelValidator struct {
// model is the DynamoModel under validation; in ValidateUpdate it is the new object.
model *nvidiacomv1alpha1.DynamoModel
}
// NewDynamoModelValidator returns a validator bound to the given DynamoModel.
// The model pointer is stored as-is; no copy is made.
func NewDynamoModelValidator(model *nvidiacomv1alpha1.DynamoModel) *DynamoModelValidator {
	validator := new(DynamoModelValidator)
	validator.model = model
	return validator
}
// Validate runs the stateless checks on the DynamoModel and stops at the first
// violation. It never produces warnings.
//
// Rules, in order: spec.modelName and spec.baseModelName must be non-empty;
// when spec.modelType is "lora", spec.source must be set, spec.source.uri must
// be non-empty and the URI must pass validateSourceURI.
func (v *DynamoModelValidator) Validate() (admission.Warnings, error) {
	spec := &v.model.Spec

	switch {
	case spec.ModelName == "":
		return nil, fmt.Errorf("spec.modelName is required")
	case spec.BaseModelName == "":
		return nil, fmt.Errorf("spec.baseModelName is required")
	case spec.ModelType != "lora":
		// The source rules below apply to LoRA models only.
		return nil, nil
	case spec.Source == nil:
		return nil, fmt.Errorf("spec.source is required when modelType is 'lora'")
	case spec.Source.URI == "":
		return nil, fmt.Errorf("spec.source.uri must be specified when modelType is 'lora'")
	}

	// The URI is present; make sure it uses a supported scheme.
	if uriErr := v.validateSourceURI(spec.Source.URI); uriErr != nil {
		return nil, uriErr
	}
	return nil, nil
}
// ValidateUpdate compares the validator's DynamoModel with its previous version
// and rejects changes to immutable fields. spec.modelType is checked before
// spec.baseModelName, and only the first violation is reported, together with
// a single matching warning.
func (v *DynamoModelValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoModel) (admission.Warnings, error) {
	previous, current := &old.Spec, &v.model.Spec

	switch {
	case current.ModelType != previous.ModelType:
		// modelType is immutable
		return admission.Warnings{"Changing spec.modelType may cause unexpected behavior"},
			fmt.Errorf("spec.modelType is immutable and cannot be changed after creation")
	case current.BaseModelName != previous.BaseModelName:
		// baseModelName is immutable
		return admission.Warnings{"Changing spec.baseModelName will break endpoint discovery"},
			fmt.Errorf("spec.baseModelName is immutable and cannot be changed after creation")
	default:
		return nil, nil
	}
}
// validateSourceURI checks that uri is non-empty and begins with one of the
// supported schemes, "s3://" or "hf://". It returns nil for an accepted URI and
// a descriptive error otherwise.
func (v *DynamoModelValidator) validateSourceURI(uri string) error {
	if len(uri) == 0 {
		return fmt.Errorf("source URI cannot be empty")
	}

	// Accept the URI as soon as one supported scheme matches.
	for _, scheme := range []string{"s3://", "hf://"} {
		if strings.HasPrefix(uri, scheme) {
			return nil
		}
	}
	return fmt.Errorf("source URI must start with 's3://' or 'hf://', got: %s", uri)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"context"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
const (
// DynamoModelWebhookName is the name of the validating webhook handler for DynamoModel.
// The handler methods also use it as the name of their logger.
DynamoModelWebhookName = "dynamomodel-validating-webhook"
// dynamoModelWebhookPath is the path under which RegisterWithManager registers
// the webhook on the manager's webhook server.
dynamoModelWebhookPath = "/validate-nvidia-com-v1alpha1-dynamomodel"
)
// DynamoModelHandler is a handler for validating DynamoModel resources.
// It is a thin wrapper around DynamoModelValidator and holds no state of its own.
type DynamoModelHandler struct{}
// NewDynamoModelHandler builds the webhook handler for DynamoModel.
func NewDynamoModelHandler() *DynamoModelHandler {
	return new(DynamoModelHandler)
}
// ValidateCreate handles create requests for DynamoModel.
// It returns an error when obj is not a DynamoModel; otherwise it returns
// whatever the stateless validator reports.
func (h *DynamoModelHandler) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	model, castErr := castToDynamoModel(obj)
	if castErr != nil {
		return nil, castErr
	}

	log.FromContext(ctx).
		WithName(DynamoModelWebhookName).
		Info("validate create", "name", model.Name, "namespace", model.Namespace)

	// Delegate the actual checks to the validator.
	return NewDynamoModelValidator(model).Validate()
}
// ValidateUpdate validates a DynamoModel update request.
//
// The new object is checked against the stateless rules first and is then
// compared with the old object for immutable-field changes. An update to an
// object that already has a deletion timestamp is accepted without validation
// so that finalizers can still be removed.
//
// Warnings from the stateless pass are always returned together with the
// warnings from the stateful pass, including when the stateful pass rejects
// the update. An error is returned when either object is not a DynamoModel or
// when one of the passes fails.
func (h *DynamoModelHandler) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
	logger := log.FromContext(ctx).WithName(DynamoModelWebhookName)

	newModel, err := castToDynamoModel(newObj)
	if err != nil {
		return nil, err
	}

	logger.Info("validate update", "name", newModel.Name, "namespace", newModel.Namespace)

	// Skip validation if the resource is being deleted (to allow finalizer removal)
	if !newModel.DeletionTimestamp.IsZero() {
		logger.Info("skipping validation for resource being deleted", "name", newModel.Name)
		return nil, nil
	}

	oldModel, err := castToDynamoModel(oldObj)
	if err != nil {
		return nil, err
	}

	validator := NewDynamoModelValidator(newModel)

	// Stateless rules: a failure here short-circuits the stateful comparison.
	warnings, err := validator.Validate()
	if err != nil {
		return warnings, err
	}

	// Stateful rules (immutability). Merge the warnings before looking at the
	// error so the stateless warnings are not lost when the update is rejected.
	updateWarnings, err := validator.ValidateUpdate(oldModel)
	warnings = append(warnings, updateWarnings...)
	return warnings, err
}
// ValidateDelete handles delete requests for DynamoModel.
// Deletion is never rejected by validation rules; the only error returned is
// the one raised when obj is not a DynamoModel.
func (h *DynamoModelHandler) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	model, castErr := castToDynamoModel(obj)
	if castErr != nil {
		return nil, castErr
	}

	log.FromContext(ctx).
		WithName(DynamoModelWebhookName).
		Info("validate delete", "name", model.Name, "namespace", model.Namespace)

	// Nothing to validate on deletion.
	return nil, nil
}
// RegisterWithManager installs the validating webhook on the manager's webhook
// server under dynamoModelWebhookPath. The handler is wrapped in a
// LeaseAwareValidator built from the excluded namespaces before it is
// registered. The method always returns nil.
func (h *DynamoModelHandler) RegisterWithManager(mgr manager.Manager) error {
	// Lease-aware wrapper around the handler (cluster-wide coordination).
	leaseAware := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())

	hook := admission.WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoModel{}, leaseAware)
	hook = hook.WithRecoverPanic(true)

	mgr.GetWebhookServer().Register(dynamoModelWebhookPath, hook)
	return nil
}
// castToDynamoModel type-asserts obj to *DynamoModel.
// When obj has any other dynamic type, an error naming that type is returned.
func castToDynamoModel(obj runtime.Object) (*nvidiacomv1alpha1.DynamoModel, error) {
	if model, ok := obj.(*nvidiacomv1alpha1.DynamoModel); ok {
		return model, nil
	}
	return nil, fmt.Errorf("expected DynamoModel but got %T", obj)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// TestDynamoModelValidator_Validate is a table-driven test of the stateless
// DynamoModel rules: required names, the LoRA source requirement and the
// accepted source URI schemes.
func TestDynamoModelValidator_Validate(t *testing.T) {
	// newModel wraps a spec in a DynamoModel living in the "default" namespace.
	newModel := func(name string, spec nvidiacomv1alpha1.DynamoModelSpec) *nvidiacomv1alpha1.DynamoModel {
		return &nvidiacomv1alpha1.DynamoModel{
			ObjectMeta: metav1.ObjectMeta{
				Name:      name,
				Namespace: "default",
			},
			Spec: spec,
		}
	}

	// loraModel builds the LoRA fixture shared by all LoRA cases; only the source varies.
	loraModel := func(source *nvidiacomv1alpha1.ModelSource) *nvidiacomv1alpha1.DynamoModel {
		return newModel("test-lora", nvidiacomv1alpha1.DynamoModelSpec{
			ModelName:     "llama-3-8b-custom",
			BaseModelName: "llama-3-8b",
			ModelType:     "lora",
			Source:        source,
		})
	}

	// sourceAt returns a model source pointing at the given URI.
	sourceAt := func(uri string) *nvidiacomv1alpha1.ModelSource {
		return &nvidiacomv1alpha1.ModelSource{URI: uri}
	}

	tests := []struct {
		name    string
		model   *nvidiacomv1alpha1.DynamoModel
		wantErr bool
		errMsg  string
	}{
		{
			name: "valid base model",
			model: newModel("test-model", nvidiacomv1alpha1.DynamoModelSpec{
				ModelName:     "llama-3-8b",
				BaseModelName: "llama-3-8b",
				ModelType:     "base",
			}),
			wantErr: false,
		},
		{
			name:    "valid lora model with s3 source",
			model:   loraModel(sourceAt("s3://my-bucket/lora-adapter")),
			wantErr: false,
		},
		{
			name:    "valid lora model with hf source",
			model:   loraModel(sourceAt("hf://organization/model-name")),
			wantErr: false,
		},
		{
			name: "missing modelName",
			model: newModel("test-model", nvidiacomv1alpha1.DynamoModelSpec{
				ModelName:     "",
				BaseModelName: "llama-3-8b",
				ModelType:     "base",
			}),
			wantErr: true,
			errMsg:  "spec.modelName is required",
		},
		{
			name: "missing baseModelName",
			model: newModel("test-model", nvidiacomv1alpha1.DynamoModelSpec{
				ModelName:     "llama-3-8b",
				BaseModelName: "",
				ModelType:     "base",
			}),
			wantErr: true,
			errMsg:  "spec.baseModelName is required",
		},
		{
			name:    "lora without source",
			model:   loraModel(nil),
			wantErr: true,
			errMsg:  "spec.source is required when modelType is 'lora'",
		},
		{
			name:    "lora with empty URI",
			model:   loraModel(sourceAt("")),
			wantErr: true,
			errMsg:  "spec.source.uri must be specified when modelType is 'lora'",
		},
		{
			name:    "lora with invalid URI scheme",
			model:   loraModel(sourceAt("http://example.com/model")),
			wantErr: true,
			errMsg:  "source URI must start with 's3://' or 'hf://', got: http://example.com/model",
		},
		{
			name:    "lora with file:// URI scheme",
			model:   loraModel(sourceAt("file:///local/path")),
			wantErr: true,
			errMsg:  "source URI must start with 's3://' or 'hf://', got: file:///local/path",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			_, err := NewDynamoModelValidator(tt.model).Validate()

			if gotErr := err != nil; gotErr != tt.wantErr {
				t.Errorf("DynamoModelValidator.Validate() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if !tt.wantErr {
				return
			}
			// The error text must match exactly.
			if got := err.Error(); got != tt.errMsg {
				t.Errorf("DynamoModelValidator.Validate() error message = %v, want %v", got, tt.errMsg)
			}
		})
	}
}
// TestDynamoModelValidator_ValidateUpdate runs a table of old/new DynamoModel
// pairs through DynamoModelValidator.ValidateUpdate. For each pair it checks
// whether an error is returned, the exact error text, and, when warnings are
// expected, that the first warning matches the expected message.
func TestDynamoModelValidator_ValidateUpdate(t *testing.T) {
	// Local aliases keep the fixture literals below compact.
	type (
		dynamoModel = nvidiacomv1alpha1.DynamoModel
		modelSpec   = nvidiacomv1alpha1.DynamoModelSpec
		modelSource = nvidiacomv1alpha1.ModelSource
	)

	type updateCase struct {
		name            string
		oldModel        *dynamoModel
		newModel        *dynamoModel
		wantErr         bool
		wantWarnings    bool
		errMsg          string
		expectedWarnMsg string
	}

	cases := []updateCase{
		{
			name: "no changes",
			oldModel: &dynamoModel{Spec: modelSpec{
				ModelName:     "llama-3-8b",
				BaseModelName: "llama-3-8b",
				ModelType:     "base",
			}},
			newModel: &dynamoModel{Spec: modelSpec{
				ModelName:     "llama-3-8b",
				BaseModelName: "llama-3-8b",
				ModelType:     "base",
			}},
		},
		{
			name: "changing modelType",
			oldModel: &dynamoModel{Spec: modelSpec{
				ModelName:     "llama-3-8b",
				BaseModelName: "llama-3-8b",
				ModelType:     "base",
			}},
			newModel: &dynamoModel{Spec: modelSpec{
				ModelName:     "llama-3-8b",
				BaseModelName: "llama-3-8b",
				ModelType:     "lora",
				Source:        &modelSource{URI: "s3://bucket/adapter"},
			}},
			wantErr:         true,
			wantWarnings:    true,
			errMsg:          "spec.modelType is immutable and cannot be changed after creation",
			expectedWarnMsg: "Changing spec.modelType may cause unexpected behavior",
		},
		{
			name: "changing baseModelName",
			oldModel: &dynamoModel{Spec: modelSpec{
				ModelName:     "llama-3-8b",
				BaseModelName: "llama-3-8b",
				ModelType:     "base",
			}},
			newModel: &dynamoModel{Spec: modelSpec{
				ModelName:     "llama-3-8b",
				BaseModelName: "llama-3-70b",
				ModelType:     "base",
			}},
			wantErr:         true,
			wantWarnings:    true,
			errMsg:          "spec.baseModelName is immutable and cannot be changed after creation",
			expectedWarnMsg: "Changing spec.baseModelName will break endpoint discovery",
		},
		{
			name: "changing modelName is allowed",
			oldModel: &dynamoModel{Spec: modelSpec{
				ModelName:     "llama-3-8b",
				BaseModelName: "llama-3-8b",
				ModelType:     "base",
			}},
			newModel: &dynamoModel{Spec: modelSpec{
				ModelName:     "llama-3-8b-renamed",
				BaseModelName: "llama-3-8b",
				ModelType:     "base",
			}},
		},
		{
			name: "updating source URI for lora is allowed",
			oldModel: &dynamoModel{Spec: modelSpec{
				ModelName:     "llama-3-8b-custom",
				BaseModelName: "llama-3-8b",
				ModelType:     "lora",
				Source:        &modelSource{URI: "s3://bucket/adapter-v1"},
			}},
			newModel: &dynamoModel{Spec: modelSpec{
				ModelName:     "llama-3-8b-custom",
				BaseModelName: "llama-3-8b",
				ModelType:     "lora",
				Source:        &modelSource{URI: "s3://bucket/adapter-v2"},
			}},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// The validator wraps the NEW object; the OLD one is passed to ValidateUpdate.
			validator := NewDynamoModelValidator(tc.newModel)
			warnings, err := validator.ValidateUpdate(tc.oldModel)

			gotErr := err != nil
			if gotErr != tc.wantErr {
				t.Fatalf("DynamoModelValidator.ValidateUpdate() error = %v, wantErr %v", err, tc.wantErr)
			}
			if tc.wantErr {
				if got := err.Error(); got != tc.errMsg {
					t.Errorf("DynamoModelValidator.ValidateUpdate() error message = %v, want %v", got, tc.errMsg)
				}
			}

			// Warnings are only inspected for cases that expect them.
			if !tc.wantWarnings {
				return
			}
			if len(warnings) == 0 {
				t.Errorf("DynamoModelValidator.ValidateUpdate() expected warnings but got none")
				return
			}
			if warnings[0] != tc.expectedWarnMsg {
				t.Errorf("DynamoModelValidator.ValidateUpdate() warning = %v, want %v", warnings[0], tc.expectedWarnMsg)
			}
		})
	}
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
)
// SharedSpecValidator checks the fields of a DynamoComponentDeploymentSharedSpec.
// Both DynamoComponentDeploymentValidator and DynamoGraphDeploymentValidator rely
// on it so that shared spec fields are validated with the same rules.
type SharedSpecValidator struct {
	// spec is the shared spec being validated.
	spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
	// fieldPath is the prefix used in error messages,
	// e.g. "spec" for DCD or "spec.services[foo]" for DGD.
	fieldPath string
}
// NewSharedSpecValidator returns a SharedSpecValidator for the given spec.
// fieldPath is prepended to every error message to give it context
// (e.g. "spec" or "spec.services[main]").
func NewSharedSpecValidator(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec, fieldPath string) *SharedSpecValidator {
	validator := new(SharedSpecValidator)
	validator.spec = spec
	validator.fieldPath = fieldPath
	return validator
}
// Validate checks the shared spec fields in a fixed order (replicas,
// autoscaling, ingress, volume mounts, shared memory) and returns the first
// error encountered, or nil when every check passes.
func (v *SharedSpecValidator) Validate() error {
	spec := v.spec

	// Replicas is optional; when set it must not be negative.
	if replicas := spec.Replicas; replicas != nil && *replicas < 0 {
		return fmt.Errorf("%s.replicas must be non-negative", v.fieldPath)
	}

	// Remaining checks, in evaluation order. A step only runs when its
	// precondition holds, so the helpers can dereference their sub-spec.
	steps := []struct {
		applies bool
		check   func() error
	}{
		{spec.Autoscaling != nil, v.validateAutoscaling},
		{spec.Ingress != nil && spec.Ingress.Enabled, v.validateIngress},
		{true, v.validateVolumeMounts},
		{spec.SharedMemory != nil, v.validateSharedMemory},
	}
	for _, step := range steps {
		if !step.applies {
			continue
		}
		if err := step.check(); err != nil {
			return err
		}
	}
	return nil
}
// validateAutoscaling checks the autoscaling bounds. Nothing is validated when
// autoscaling is disabled; otherwise minReplicas must be at least 1 and
// maxReplicas must be strictly greater than minReplicas.
// The caller must ensure v.spec.Autoscaling is non-nil.
func (v *SharedSpecValidator) validateAutoscaling() error {
	cfg := v.spec.Autoscaling
	switch {
	case !cfg.Enabled:
		// Disabled autoscaling: the bounds are not inspected.
		return nil
	case cfg.MinReplicas < 1:
		return fmt.Errorf("%s.autoscaling.minReplicas must be >= 1", v.fieldPath)
	case cfg.MaxReplicas <= cfg.MinReplicas:
		return fmt.Errorf("%s.autoscaling.maxReplicas must be > minReplicas", v.fieldPath)
	}
	return nil
}
// validateIngress requires a non-empty ingress host. Validate only calls it
// when ingress is present and enabled, so v.spec.Ingress is non-nil here.
func (v *SharedSpecValidator) validateIngress() error {
	if host := v.spec.Ingress.Host; host != "" {
		return nil
	}
	return fmt.Errorf("%s.ingress.host is required when ingress is enabled", v.fieldPath)
}
// validateVolumeMounts validates every entry of spec.VolumeMounts in order and
// returns the first error found, or nil when all entries are valid (including
// when the list is empty).
func (v *SharedSpecValidator) validateVolumeMounts() error {
	mounts := v.spec.VolumeMounts
	for i := range mounts {
		// Point at the slice element itself rather than at the range variable:
		// this avoids copying the struct on every iteration and the implicit
		// loop-variable aliasing that linters flag.
		if err := v.validateVolumeMount(i, &mounts[i]); err != nil {
			return err
		}
	}
	return nil
}
// validateVolumeMount checks one volume mount. A mount is valid when it is used
// as the compilation cache or when it has a mountPoint; otherwise an error
// naming the entry by index is returned.
func (v *SharedSpecValidator) validateVolumeMount(index int, volumeMount *nvidiacomv1alpha1.VolumeMount) error {
	if volumeMount.UseAsCompilationCache || volumeMount.MountPoint != "" {
		return nil
	}
	return fmt.Errorf("%s.volumeMounts[%d].mountPoint is required when useAsCompilationCache is false", v.fieldPath, index)
}
// validateSharedMemory requires a non-zero size unless shared memory is
// explicitly disabled. The caller must ensure v.spec.SharedMemory is non-nil.
func (v *SharedSpecValidator) validateSharedMemory() error {
	shm := v.spec.SharedMemory
	// The size is only inspected when shared memory is not disabled.
	if shm.Disabled || !shm.Size.IsZero() {
		return nil
	}
	return fmt.Errorf("%s.sharedMemory.size is required when disabled is false", v.fieldPath)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/api/resource"
)
// TestSharedSpecValidator_Validate runs a table of shared specs through
// SharedSpecValidator.Validate, covering replicas, autoscaling, ingress, volume
// mounts, shared memory and a custom field path. Each case asserts whether an
// error is returned and, if so, its exact message.
func TestSharedSpecValidator_Validate(t *testing.T) {
	// Local aliases keep the fixture literals below compact.
	type (
		sharedSpec   = nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
		autoscaling  = nvidiacomv1alpha1.Autoscaling
		ingressSpec  = nvidiacomv1alpha1.IngressSpec
		volumeMount  = nvidiacomv1alpha1.VolumeMount
		sharedMemory = nvidiacomv1alpha1.SharedMemorySpec
	)

	negativeReplicas, validReplicas := int32(-1), int32(3)

	type validateCase struct {
		name      string
		spec      *sharedSpec
		fieldPath string
		wantErr   bool
		errMsg    string
	}

	cases := []validateCase{
		{
			name: "valid spec with all fields",
			spec: &sharedSpec{
				Replicas:    &validReplicas,
				Autoscaling: &autoscaling{Enabled: true, MinReplicas: 1, MaxReplicas: 10},
				Ingress:     &ingressSpec{Enabled: true, Host: "example.com"},
				VolumeMounts: []volumeMount{
					{Name: "cache", MountPoint: "/cache"},
					{Name: "compilation", UseAsCompilationCache: true},
				},
				SharedMemory: &sharedMemory{Disabled: false, Size: resource.MustParse("1Gi")},
			},
			fieldPath: "spec",
		},
		{
			name:      "negative replicas",
			spec:      &sharedSpec{Replicas: &negativeReplicas},
			fieldPath: "spec",
			wantErr:   true,
			errMsg:    "spec.replicas must be non-negative",
		},
		{
			name: "autoscaling minReplicas too low",
			spec: &sharedSpec{
				Autoscaling: &autoscaling{Enabled: true, MinReplicas: 0, MaxReplicas: 10},
			},
			fieldPath: "spec",
			wantErr:   true,
			errMsg:    "spec.autoscaling.minReplicas must be >= 1",
		},
		{
			name: "autoscaling maxReplicas less than minReplicas",
			spec: &sharedSpec{
				Autoscaling: &autoscaling{Enabled: true, MinReplicas: 5, MaxReplicas: 3},
			},
			fieldPath: "spec",
			wantErr:   true,
			errMsg:    "spec.autoscaling.maxReplicas must be > minReplicas",
		},
		{
			name: "autoscaling disabled - no validation",
			spec: &sharedSpec{
				Autoscaling: &autoscaling{Enabled: false, MinReplicas: 0, MaxReplicas: 0},
			},
			fieldPath: "spec",
		},
		{
			name: "ingress enabled without host",
			spec: &sharedSpec{
				Ingress: &ingressSpec{Enabled: true, Host: ""},
			},
			fieldPath: "spec",
			wantErr:   true,
			errMsg:    "spec.ingress.host is required when ingress is enabled",
		},
		{
			name: "ingress disabled - no validation",
			spec: &sharedSpec{
				Ingress: &ingressSpec{Enabled: false, Host: ""},
			},
			fieldPath: "spec",
		},
		{
			name: "volume mount without mountPoint and not compilation cache",
			spec: &sharedSpec{
				VolumeMounts: []volumeMount{
					{Name: "data", MountPoint: "", UseAsCompilationCache: false},
				},
			},
			fieldPath: "spec",
			wantErr:   true,
			errMsg:    "spec.volumeMounts[0].mountPoint is required when useAsCompilationCache is false",
		},
		{
			name: "volume mount with mountPoint",
			spec: &sharedSpec{
				VolumeMounts: []volumeMount{
					{Name: "data", MountPoint: "/data"},
				},
			},
			fieldPath: "spec",
		},
		{
			name: "volume mount as compilation cache without mountPoint",
			spec: &sharedSpec{
				VolumeMounts: []volumeMount{
					{Name: "cache", UseAsCompilationCache: true},
				},
			},
			fieldPath: "spec",
		},
		{
			name: "shared memory enabled without size",
			spec: &sharedSpec{
				SharedMemory: &sharedMemory{Disabled: false, Size: resource.Quantity{}},
			},
			fieldPath: "spec",
			wantErr:   true,
			errMsg:    "spec.sharedMemory.size is required when disabled is false",
		},
		{
			name: "shared memory enabled with size",
			spec: &sharedSpec{
				SharedMemory: &sharedMemory{Disabled: false, Size: resource.MustParse("2Gi")},
			},
			fieldPath: "spec",
		},
		{
			name: "shared memory disabled without size",
			spec: &sharedSpec{
				SharedMemory: &sharedMemory{Disabled: true, Size: resource.Quantity{}},
			},
			fieldPath: "spec",
		},
		{
			name:      "custom field path for service validation",
			spec:      &sharedSpec{Replicas: &negativeReplicas},
			fieldPath: "spec.services[main]",
			wantErr:   true,
			errMsg:    "spec.services[main].replicas must be non-negative",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			err := NewSharedSpecValidator(tc.spec, tc.fieldPath).Validate()

			gotErr := err != nil
			if gotErr != tc.wantErr {
				t.Fatalf("SharedSpecValidator.Validate() error = %v, wantErr %v", err, tc.wantErr)
			}
			if !tc.wantErr {
				return
			}
			if got := err.Error(); got != tc.errMsg {
				t.Errorf("SharedSpecValidator.Validate() error message = %v, want %v", got, tc.errMsg)
			}
		})
	}
}
...@@ -7,5 +7,6 @@ Deployment Guide ...@@ -7,5 +7,6 @@ Deployment Guide
Kubernetes Quickstart <../kubernetes/README> Kubernetes Quickstart <../kubernetes/README>
Detailed Installation Guide <../kubernetes/installation_guide> Detailed Installation Guide <../kubernetes/installation_guide>
Dynamo Operator <../kubernetes/dynamo_operator> Dynamo Operator <../kubernetes/dynamo_operator>
Webhooks <../kubernetes/webhooks>
Minikube Setup <../kubernetes/deployment/minikube> Minikube Setup <../kubernetes/deployment/minikube>
Managing Models with DynamoModel <../kubernetes/deployment/dynamomodel-guide> Managing Models with DynamoModel <../kubernetes/deployment/dynamomodel-guide>
...@@ -115,6 +115,21 @@ For a user-focused guide on deploying and managing models with DynamoModel, see: ...@@ -115,6 +115,21 @@ For a user-focused guide on deploying and managing models with DynamoModel, see:
**📖 [Managing Models with DynamoModel Guide](./deployment/dynamomodel-guide.md)** **📖 [Managing Models with DynamoModel Guide](./deployment/dynamomodel-guide.md)**
## Webhooks
The Dynamo Operator uses **Kubernetes admission webhooks** for real-time validation of custom resources before they are persisted to the cluster. Webhooks are **enabled by default** and ensure that invalid configurations are rejected immediately at the API server level.
**Key Features:**
- ✅ Shared certificate infrastructure across all webhook types
- ✅ Automatic certificate generation (for testing/development)
- ✅ cert-manager integration (for production)
- ✅ Multi-operator support with lease-based coordination
- ✅ Immutability enforcement for critical fields
For complete documentation on webhooks, certificate management, and troubleshooting, see:
**📖 [Webhooks Guide](./webhooks.md)**
## Installation ## Installation
### Quick Install with Helm ### Quick Install with Helm
......
# Webhooks
This document describes the webhook functionality in the Dynamo Operator, including validation webhooks, certificate management, and troubleshooting.
## Table of Contents
- [Overview](#overview)
- [Architecture](#architecture)
- [Configuration](#configuration)
- [Enabling/Disabling Webhooks](#enablingdisabling-webhooks)
- [Certificate Management Options](#certificate-management-options)
- [Advanced Configuration](#advanced-configuration)
- [Certificate Management](#certificate-management)
- [Automatic Certificates (Default)](#automatic-certificates-default)
- [cert-manager Integration](#cert-manager-integration)
- [External Certificates](#external-certificates)
- [Multi-Operator Deployments](#multi-operator-deployments)
- [Troubleshooting](#troubleshooting)
---
## Overview
The Dynamo Operator uses **Kubernetes admission webhooks** to provide real-time validation and mutation of custom resources. Currently, the operator implements **validation webhooks** that ensure invalid configurations are rejected immediately at the API server level, providing faster feedback to users compared to controller-based validation.
All webhook types (validating, mutating, conversion, etc.) share the same **webhook server** and **TLS certificate infrastructure**, making certificate management consistent across all webhook operations.
### Key Features
- **Enabled by default** - Zero-touch validation out of the box
- **Shared certificate infrastructure** - All webhook types use the same TLS certificates
- **Automatic certificate generation** - No manual certificate management required
- **Defense in depth** - Controllers validate when webhooks are disabled
- **cert-manager integration** - Optional integration for automated certificate lifecycle
- **Multi-operator support** - Lease-based coordination for cluster-wide and namespace-restricted deployments
- **Immutability enforcement** - Critical fields protected via CEL validation rules
### Current Webhook Types
- **Validating Webhooks**: Validate custom resource specifications before persistence
- `DynamoComponentDeployment` validation
- `DynamoGraphDeployment` validation
- `DynamoModel` validation
**Note:** Future releases may add mutating webhooks (for defaults/transformations) and conversion webhooks (for CRD version migrations). All will use the same certificate infrastructure described in this document.
---
## Architecture
```
┌─────────────────────────────────────────────────────────────────┐
│ API Server │
│ 1. User submits CR (kubectl apply) │
│ 2. API server calls ValidatingWebhookConfiguration │
└────────────────────────┬────────────────────────────────────────┘
│ HTTPS (TLS required)
┌─────────────────────────────────────────────────────────────────┐
│ Webhook Server (in Operator Pod) │
│ 3. Validates CR against business rules │
│ 4. Returns admit/deny decision + warnings │
└─────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────┐
│ API Server │
│ 5. If admitted: Persist CR to etcd │
│ 6. If denied: Return error to user │
└─────────────────────────────────────────────────────────────────┘
```
### Validation Flow
1. **Webhook validation** (if enabled): Validates at API server level
2. **CEL validation**: Kubernetes-native immutability checks (always active)
3. **Controller validation** (if webhooks disabled): Defense-in-depth validation during reconciliation
---
## Configuration
### Enabling/Disabling Webhooks
Webhooks are **enabled by default**. To disable them:
```yaml
# Platform-level values.yaml
dynamo-operator:
webhook:
enabled: false
```
**When to disable webhooks:**
- During development/testing when rapid iteration is needed
- In environments where admission webhooks are not supported
- When troubleshooting validation issues
**Note:** When webhooks are disabled, controllers perform validation during reconciliation (defense in depth).
---
### Certificate Management Options
The operator supports three certificate management modes:
| Mode | Description | Use Case |
|------|-------------|----------|
| **Automatic (Default)** | Helm hooks generate self-signed certificates | Testing and development environments |
| **cert-manager** | Integrate with cert-manager for automated lifecycle | Production deployments with cert-manager |
| **External** | Bring your own certificates | Production deployments with custom PKI |
---
### Advanced Configuration
#### Complete Configuration Reference
```yaml
dynamo-operator:
webhook:
# Enable/disable validation webhooks
enabled: true
# Certificate management
certManager:
enabled: false
issuerRef:
kind: Issuer
name: selfsigned-issuer
# Certificate secret configuration
certificateSecret:
name: webhook-server-cert
external: false
# Certificate validity period (automatic generation only)
certificateValidity: 3650 # 10 years
# Certificate generator image (automatic generation only)
certGenerator:
image:
repository: bitnami/kubectl
tag: latest
# Webhook behavior configuration
failurePolicy: Fail # Fail (reject on error) or Ignore (allow on error)
timeoutSeconds: 10 # Webhook timeout
# Namespace filtering (advanced)
namespaceSelector: {} # Kubernetes label selector for namespaces
```
#### Failure Policy
```yaml
# Fail: Reject resources if webhook is unavailable (recommended for production)
webhook:
failurePolicy: Fail
# Ignore: Allow resources if webhook is unavailable (use with caution)
webhook:
failurePolicy: Ignore
```
**Recommendation:** Use `Fail` in production to ensure validation is always enforced. Only use `Ignore` if you need high availability and can tolerate occasional invalid resources.
#### Namespace Filtering
Control which namespaces are validated (applies to **cluster-wide operator** only):
```yaml
# Only validate resources in namespaces with specific labels
webhook:
namespaceSelector:
matchLabels:
dynamo-validation: enabled
# Or exclude specific namespaces
webhook:
namespaceSelector:
matchExpressions:
- key: dynamo-validation
operator: NotIn
values: ["disabled"]
```
**Note:** For **namespace-restricted operators**, the namespace selector is automatically set to validate only the operator's namespace. This configuration is ignored in namespace-restricted mode.
---
## Certificate Management
### Automatic Certificates (Default)
**Zero configuration required!** Certificates are automatically generated during `helm install` and `helm upgrade`.
#### How It Works
1. **Pre-install/pre-upgrade hook**: Generates self-signed TLS certificates
- Root CA (valid 10 years)
- Server certificate (valid 10 years)
- Stores in Secret: `<release>-webhook-server-cert`
2. **Post-install/post-upgrade hook**: Injects CA bundle into `ValidatingWebhookConfiguration`
- Reads `ca.crt` from Secret
- Patches `ValidatingWebhookConfiguration` with base64-encoded CA bundle
3. **Operator pod**: Mounts certificate secret and serves webhook on port 9443
#### Certificate Validity
- **Root CA**: 10 years
- **Server Certificate**: 10 years (same as Root CA)
- **Automatic renewal**: Certificates are checked on every `helm upgrade` and regenerated only when they are missing, expire within 30 days, or have incorrect SANs
#### Smart Certificate Generation
The certificate generation hook is intelligent:
- **Checks existing certificates** before generating new ones
- **Skips generation** if valid certificates exist (valid for 30+ days with correct SANs)
- **Regenerates** only when needed (missing, expiring soon, or incorrect SANs)
This means:
- Fast `helm upgrade` operations (no unnecessary cert generation)
- Safe to run `helm upgrade` frequently
- Certificates persist across reinstalls (stored in Secret)
#### Manual Certificate Rotation
If you need to rotate certificates manually:
```bash
# Delete the certificate secret
kubectl delete secret <release>-webhook-server-cert -n <namespace>
# Upgrade the release to regenerate certificates
helm upgrade <release> dynamo-platform -n <namespace>
```
---
### cert-manager Integration
For clusters with cert-manager installed, you can enable automated certificate lifecycle management.
#### Prerequisites
1. **cert-manager installed** (v1.0+)
2. **CA issuer configured** (e.g., `selfsigned-issuer`)
#### Configuration
```yaml
dynamo-operator:
webhook:
certManager:
enabled: true
issuerRef:
kind: Issuer # Or ClusterIssuer
name: selfsigned-issuer # Your issuer name
```
#### How It Works
1. **Helm creates Certificate resource**: Requests TLS certificate from cert-manager
2. **cert-manager generates certificate**: Based on configured issuer
3. **cert-manager stores in Secret**: `<release>-webhook-server-cert`
4. **cert-manager ca-injector**: Automatically injects CA bundle into `ValidatingWebhookConfiguration`
5. **Operator pod**: Mounts certificate secret and serves webhook
#### Benefits Over Automatic Mode
- **Automated rotation**: cert-manager renews certificates before expiration
- **Custom validity periods**: Configure certificate lifetime
- **CA rotation support**: ca-injector handles CA updates automatically
- **Integration with existing PKI**: Use your organization's certificate infrastructure
#### Certificate Rotation
With cert-manager, certificate rotation is **fully automated**:
1. **Leaf certificate rotation** (default: every year)
- cert-manager auto-renews before expiration
- controller-runtime auto-reloads new certificate
- **No pod restart required**
- **No caBundle update required** (same Root CA)
2. **Root CA rotation** (every 10 years)
- cert-manager rotates Root CA
- ca-injector auto-updates caBundle in `ValidatingWebhookConfiguration`
- **No manual intervention required**
#### Example: Self-Signed Issuer
```yaml
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
name: selfsigned-issuer
namespace: dynamo-system
spec:
selfSigned: {}
---
# Enable in platform values.yaml
dynamo-operator:
webhook:
certManager:
enabled: true
issuerRef:
kind: Issuer
name: selfsigned-issuer
```
---
### External Certificates
Bring your own certificates for custom PKI requirements.
#### Steps
1. **Create certificate secret manually**:
```bash
kubectl create secret tls <release>-webhook-server-cert \
--cert=tls.crt \
--key=tls.key \
-n <namespace>
# Also add ca.crt to the secret
kubectl patch secret <release>-webhook-server-cert -n <namespace> \
--type='json' \
-p='[{"op": "add", "path": "/data/ca.crt", "value": "'$(base64 -w0 < ca.crt)'"}]'
```
2. **Configure operator to use external secret**:
```yaml
dynamo-operator:
webhook:
certificateSecret:
external: true
caBundle: <base64-encoded-ca-cert> # Must manually specify
```
3. **Deploy operator**:
```bash
helm install dynamo-platform . -n <namespace> -f values.yaml
```
#### Certificate Requirements
- **Secret name**: Must match `webhook.certificateSecret.name` (default: `webhook-server-cert`)
- **Secret keys**: `tls.crt`, `tls.key`, `ca.crt`
- **Certificate SAN**: Must include `<service-name>.<namespace>.svc`
- Example: `dynamo-platform-dynamo-operator-webhook-service.dynamo-system.svc`
---
## Multi-Operator Deployments
The operator supports running both **cluster-wide** and **namespace-restricted** instances simultaneously using a **lease-based coordination mechanism**.
### Scenario
```
Cluster:
├─ Operator A (cluster-wide, namespace: platform-system)
│ └─ Validates all namespaces EXCEPT team-a
└─ Operator B (namespace-restricted, namespace: team-a)
└─ Validates only team-a namespace
```
### How It Works
1. **Namespace-restricted operator** creates a Lease in its namespace
2. **Cluster-wide operator** watches for Leases named `dynamo-operator-ns-lock`
3. **Cluster-wide operator** skips validation for namespaces with active Leases
4. **Namespace-restricted operator** validates resources in its namespace
### Lease Configuration
The lease mechanism is **automatically configured** based on deployment mode:
```yaml
# Cluster-wide operator (default)
namespaceRestriction:
enabled: false
# → Watches for leases in all namespaces
# → Skips validation for namespaces with active leases
# Namespace-restricted operator
namespaceRestriction:
enabled: true
namespace: team-a
# → Creates lease in team-a namespace
# → Does NOT check for leases (no cluster permissions)
```
### Deployment Example
```bash
# 1. Deploy cluster-wide operator
helm install platform-operator dynamo-platform \
-n platform-system \
--set namespaceRestriction.enabled=false
# 2. Deploy namespace-restricted operator for team-a
helm install team-a-operator dynamo-platform \
-n team-a \
--set namespaceRestriction.enabled=true \
--set namespaceRestriction.namespace=team-a
```
### ValidatingWebhookConfiguration Naming
The webhook configuration name reflects the deployment mode:
- **Cluster-wide**: `<release>-validating`
- **Namespace-restricted**: `<release>-validating-<namespace>`
Example:
```bash
# Cluster-wide
platform-operator-validating
# Namespace-restricted (team-a)
team-a-operator-validating-team-a
```
This allows multiple webhook configurations to coexist without conflicts.
### Lease Health
If the namespace-restricted operator is deleted or becomes unhealthy:
- Lease expires after `leaseDuration + gracePeriod` (default: ~30 seconds)
- Cluster-wide operator automatically resumes validation for that namespace
---
## Troubleshooting
### Webhook Not Called
**Symptoms:**
- Invalid resources are accepted
- No validation errors in logs
**Checks:**
1. **Verify webhook is enabled**:
```bash
kubectl get validatingwebhookconfiguration | grep dynamo
```
2. **Check webhook configuration**:
```bash
kubectl get validatingwebhookconfiguration <name> -o yaml
# Verify:
# - caBundle is present and non-empty
# - clientConfig.service points to correct service
# - webhooks[].namespaceSelector matches your namespace
```
3. **Verify webhook service exists**:
```bash
kubectl get service -n <namespace> | grep webhook
```
4. **Check operator logs for webhook startup**:
```bash
kubectl logs -n <namespace> deployment/<release>-dynamo-operator | grep webhook
# Should see: "Webhooks are enabled - webhooks will validate, controllers will skip validation"
# Should see: "Starting webhook server"
```
---
### Connection Refused Errors
**Symptoms:**
```
Error from server (InternalError): Internal error occurred: failed calling webhook:
Post "https://...webhook-service...:443/validate-...": dial tcp ...:443: connect: connection refused
```
**Checks:**
1. **Verify operator pod is running**:
```bash
kubectl get pods -n <namespace> -l app.kubernetes.io/name=dynamo-operator
```
2. **Check webhook server is listening**:
```bash
# Port-forward to pod
kubectl port-forward -n <namespace> pod/<operator-pod> 9443:9443
# In another terminal, test connection
curl -k https://localhost:9443/validate-nvidia-com-v1alpha1-dynamocomponentdeployment
# Should NOT get "connection refused"
```
3. **Verify webhook port in deployment**:
```bash
kubectl get deployment -n <namespace> <release>-dynamo-operator -o yaml | grep -A5 "containerPort: 9443"
```
4. **Check for webhook initialization errors**:
```bash
kubectl logs -n <namespace> deployment/<release>-dynamo-operator | grep -i error
```
---
### Certificate Errors
**Symptoms:**
```
Error from server (InternalError): Internal error occurred: failed calling webhook:
x509: certificate signed by unknown authority
```
**Checks:**
1. **Verify caBundle is present**:
```bash
kubectl get validatingwebhookconfiguration <name> -o jsonpath='{.webhooks[0].clientConfig.caBundle}' | base64 -d
# Should output a valid PEM certificate
```
2. **Verify certificate secret exists**:
```bash
kubectl get secret -n <namespace> <release>-webhook-server-cert
```
3. **Check certificate validity**:
```bash
kubectl get secret -n <namespace> <release>-webhook-server-cert -o jsonpath='{.data.tls\.crt}' | base64 -d | openssl x509 -noout -text
# Check:
# - Not expired
# - SAN includes: <service-name>.<namespace>.svc
```
4. **Check CA injection job logs**:
```bash
kubectl logs -n <namespace> job/<release>-webhook-ca-inject-<revision>
```
---
### Helm Hook Job Failures
**Symptoms:**
- `helm install` or `helm upgrade` hangs or fails
- Certificate generation errors
**Checks:**
1. **List hook jobs**:
```bash
kubectl get jobs -n <namespace> | grep webhook
```
2. **Check job logs**:
```bash
# Certificate generation
kubectl logs -n <namespace> job/<release>-webhook-cert-gen-<revision>
# CA injection
kubectl logs -n <namespace> job/<release>-webhook-ca-inject-<revision>
```
3. **Check RBAC permissions**:
```bash
# Verify ServiceAccount exists
kubectl get sa -n <namespace> <release>-webhook-ca-inject
# Verify ClusterRole and ClusterRoleBinding exist
kubectl get clusterrole <release>-webhook-ca-inject
kubectl get clusterrolebinding <release>-webhook-ca-inject
```
4. **Manual cleanup**:
```bash
# Delete failed jobs
kubectl delete job -n <namespace> <release>-webhook-cert-gen-<revision>
kubectl delete job -n <namespace> <release>-webhook-ca-inject-<revision>
# Retry helm upgrade
helm upgrade <release> dynamo-platform -n <namespace>
```
---
### Validation Errors Not Clear
**Symptoms:**
- Webhook rejects resource but error message is unclear
**Solution:**
Check operator logs for detailed validation errors:
```bash
kubectl logs -n <namespace> deployment/<release>-dynamo-operator | grep "validate create\|validate update"
```
Webhook logs include:
- Resource name and namespace
- Validation errors with context
- Warnings for immutable field changes
---
### Stuck Deleting Resources
**Symptoms:**
- Resource stuck in "Terminating" state
- Webhook blocks finalizer removal
**Solution:**
The webhook automatically skips validation for resources being deleted. If a resource is still stuck in the "Terminating" state:
1. **Check if webhook is blocking**:
```bash
kubectl describe <resource-type> <name> -n <namespace>
# Look for events mentioning webhook errors
```
2. **Temporarily disable webhook**:
```bash
# Option 1: Delete ValidatingWebhookConfiguration
kubectl delete validatingwebhookconfiguration <name>
# Option 2: Set failurePolicy to Ignore
kubectl patch validatingwebhookconfiguration <name> \
--type='json' \
-p='[{"op": "replace", "path": "/webhooks/0/failurePolicy", "value": "Ignore"}]'
```
3. **Delete resource again**:
```bash
kubectl delete <resource-type> <name> -n <namespace>
```
4. **Re-enable webhook**:
```bash
helm upgrade <release> dynamo-platform -n <namespace>
```
---
## Best Practices
### Production Deployments
1. **Keep webhooks enabled** (default) for real-time validation
2. **Use `failurePolicy: Fail`** (default) to ensure validation is enforced
3. **Monitor webhook latency** - Validation adds ~10-50ms per resource operation
4. **Use cert-manager** for automated certificate lifecycle in large deployments
5. **Test webhook configuration** in staging before production
### Development Deployments
1. **Disable webhooks** for rapid iteration if needed
2. **Use `failurePolicy: Ignore`** if webhook availability is problematic
3. **Keep automatic certificates** (simpler than cert-manager for dev)
### Multi-Tenant Deployments
1. **Deploy one cluster-wide operator** for platform-wide validation
2. **Deploy namespace-restricted operators** for tenant-specific namespaces
3. **Monitor lease health** to ensure coordination works correctly
4. **Use unique release names** per namespace to avoid naming conflicts
---
## Additional Resources
- [Kubernetes Admission Webhooks](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/)
- [cert-manager Documentation](https://cert-manager.io/docs/)
- [Kubebuilder Webhook Tutorial](https://book.kubebuilder.io/cronjob-tutorial/webhook-implementation.html)
- [CEL Validation Rules](https://kubernetes.io/docs/reference/using-api/cel/)
---
## Support
For issues or questions:
- Check the [Troubleshooting](#troubleshooting) section
- Review operator logs: `kubectl logs -n <namespace> deployment/<release>-dynamo-operator`
- Open an issue on GitHub
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment