Unverified commit 872900f1, authored by Julien Mancuso, committed by GitHub
Browse files

feat: add validation webhooks (#4416)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent b2605a8e
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// TestDynamoComponentDeploymentValidator_Validate runs the stateless validator
// against a table of DynamoComponentDeployment fixtures. One fixture is valid;
// each of the others breaks a single rule and must be rejected with exactly the
// error message recorded in the table.
func TestDynamoComponentDeploymentValidator_Validate(t *testing.T) {
	type sharedSpec = nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec

	replicasOK := int32(3)
	replicasNegative := int32(-1)

	// newDeployment wraps a shared spec in a deployment carrying the metadata
	// that every fixture has in common.
	newDeployment := func(shared sharedSpec) *nvidiacomv1alpha1.DynamoComponentDeployment {
		return &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-deployment",
				Namespace: "default",
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: shared,
			},
		}
	}

	// The valid fixture is the only one that sets a backend framework.
	validDeployment := newDeployment(sharedSpec{
		Replicas: &replicasOK,
		Autoscaling: &nvidiacomv1alpha1.Autoscaling{
			Enabled:     true,
			MinReplicas: 1,
			MaxReplicas: 10,
		},
	})
	validDeployment.Spec.BackendFramework = "sglang"

	cases := []struct {
		name       string
		deployment *nvidiacomv1alpha1.DynamoComponentDeployment
		wantErr    bool
		errMsg     string
	}{
		{
			name:       "valid deployment",
			deployment: validDeployment,
			wantErr:    false,
		},
		{
			name:       "invalid replicas",
			deployment: newDeployment(sharedSpec{Replicas: &replicasNegative}),
			wantErr:    true,
			errMsg:     "spec.replicas must be non-negative",
		},
		{
			name: "invalid autoscaling",
			deployment: newDeployment(sharedSpec{
				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
					Enabled:     true,
					MinReplicas: 5,
					MaxReplicas: 3,
				},
			}),
			wantErr: true,
			errMsg:  "spec.autoscaling.maxReplicas must be > minReplicas",
		},
		{
			name: "invalid ingress",
			deployment: newDeployment(sharedSpec{
				Ingress: &nvidiacomv1alpha1.IngressSpec{
					Enabled: true,
					Host:    "",
				},
			}),
			wantErr: true,
			errMsg:  "spec.ingress.host is required when ingress is enabled",
		},
		{
			name: "invalid volume mount",
			deployment: newDeployment(sharedSpec{
				VolumeMounts: []nvidiacomv1alpha1.VolumeMount{
					{
						Name:                  "data",
						UseAsCompilationCache: false,
					},
				},
			}),
			wantErr: true,
			errMsg:  "spec.volumeMounts[0].mountPoint is required when useAsCompilationCache is false",
		},
		{
			name: "invalid shared memory",
			deployment: newDeployment(sharedSpec{
				SharedMemory: &nvidiacomv1alpha1.SharedMemorySpec{
					Disabled: false,
					Size:     resource.Quantity{},
				},
			}),
			wantErr: true,
			errMsg:  "spec.sharedMemory.size is required when disabled is false",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			_, err := NewDynamoComponentDeploymentValidator(tc.deployment).Validate()

			switch gotErr := err != nil; {
			case gotErr != tc.wantErr:
				t.Errorf("DynamoComponentDeploymentValidator.Validate() error = %v, wantErr %v", err, tc.wantErr)
			case gotErr && err.Error() != tc.errMsg:
				// An error was expected and returned; its text must match exactly.
				t.Errorf("DynamoComponentDeploymentValidator.Validate() error message = %v, want %v", err.Error(), tc.errMsg)
			}
		})
	}
}
// TestDynamoComponentDeploymentValidator_ValidateUpdate checks the update-time
// rules of the DynamoComponentDeployment validator: an unchanged spec and a
// replica change are accepted, while a changed backend framework must be
// rejected with the recorded error and accompanied by the recorded warning.
func TestDynamoComponentDeploymentValidator_ValidateUpdate(t *testing.T) {
	int32Ptr := func(n int32) *int32 { return &n }

	// sglang builds a fixture using the "sglang" backend and the given
	// (possibly nil) replica count.
	sglang := func(replicas *int32) *nvidiacomv1alpha1.DynamoComponentDeployment {
		d := &nvidiacomv1alpha1.DynamoComponentDeployment{}
		d.Spec.DynamoComponentDeploymentSharedSpec.Replicas = replicas
		d.Spec.BackendFramework = "sglang"
		return d
	}

	vllm := sglang(nil)
	vllm.Spec.BackendFramework = "vllm"

	cases := []struct {
		name            string
		oldDeployment   *nvidiacomv1alpha1.DynamoComponentDeployment
		newDeployment   *nvidiacomv1alpha1.DynamoComponentDeployment
		wantErr         bool
		wantWarnings    bool
		errMsg          string
		expectedWarnMsg string
	}{
		{
			name:          "no changes",
			oldDeployment: sglang(nil),
			newDeployment: sglang(nil),
			wantErr:       false,
		},
		{
			name:            "changing backend framework",
			oldDeployment:   sglang(nil),
			newDeployment:   vllm,
			wantErr:         true,
			wantWarnings:    true,
			errMsg:          "spec.backendFramework is immutable and cannot be changed after creation",
			expectedWarnMsg: "Changing spec.backendFramework may cause unexpected behavior",
		},
		{
			name:          "changing replicas is allowed",
			oldDeployment: sglang(int32Ptr(1)),
			newDeployment: sglang(int32Ptr(3)),
			wantErr:       false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			warnings, err := NewDynamoComponentDeploymentValidator(tc.newDeployment).ValidateUpdate(tc.oldDeployment)

			if gotErr := err != nil; gotErr != tc.wantErr {
				t.Errorf("DynamoComponentDeploymentValidator.ValidateUpdate() error = %v, wantErr %v", err, tc.wantErr)
				return
			}
			if tc.wantErr && err.Error() != tc.errMsg {
				t.Errorf("DynamoComponentDeploymentValidator.ValidateUpdate() error message = %v, want %v", err.Error(), tc.errMsg)
			}

			// Warnings are only inspected for cases that expect them.
			if !tc.wantWarnings {
				return
			}
			if len(warnings) == 0 {
				t.Errorf("DynamoComponentDeploymentValidator.ValidateUpdate() expected warnings but got none")
				return
			}
			if warnings[0] != tc.expectedWarnMsg {
				t.Errorf("DynamoComponentDeploymentValidator.ValidateUpdate() warning = %v, want %v", warnings[0], tc.expectedWarnMsg)
			}
		})
	}
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"errors"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// DynamoGraphDeploymentValidator validates DynamoGraphDeployment resources.
// This validator can be used by both webhooks and controllers for consistent validation.
//
// Validate applies the stateless rules to the wrapped resource; ValidateUpdate
// compares it against a previous version of the same resource.
type DynamoGraphDeploymentValidator struct {
	// deployment is the resource being validated. The methods dereference it
	// without a nil check.
	deployment *nvidiacomv1alpha1.DynamoGraphDeployment
}
// NewDynamoGraphDeploymentValidator returns a validator bound to the given
// DynamoGraphDeployment. The pointer is stored as-is; the resource is not copied.
func NewDynamoGraphDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoGraphDeployment) *DynamoGraphDeploymentValidator {
	validator := new(DynamoGraphDeploymentValidator)
	validator.deployment = deployment
	return validator
}
// Validate performs stateless validation on the DynamoGraphDeployment.
//
// The checks run in a fixed order:
//  1. spec.services must contain at least one entry;
//  2. every PVC must pass validatePVCs;
//  3. every service must pass validateService.
//
// spec.services is a map, and Go randomizes map iteration order. To keep the
// reported error stable across calls, every service is validated and, when
// more than one is invalid, the error of the service whose name sorts first is
// returned.
//
// Returns warnings and error. This method never produces warnings.
func (v *DynamoGraphDeploymentValidator) Validate() (admission.Warnings, error) {
	services := v.deployment.Spec.Services

	// Validate that at least one service is specified
	if len(services) == 0 {
		return nil, fmt.Errorf("spec.services must have at least one service")
	}

	// Validate PVCs
	if err := v.validatePVCs(); err != nil {
		return nil, err
	}

	// Validate each service, remembering only the failure with the smallest
	// service name so the result does not depend on iteration order.
	var (
		failedService string
		failure       error
	)
	for serviceName, service := range services {
		err := v.validateService(serviceName, service)
		if err == nil {
			continue
		}
		if failure == nil || serviceName < failedService {
			failedService, failure = serviceName, err
		}
	}
	return nil, failure
}
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeployment.
//
// The only rule enforced is that spec.backendFramework is immutable: when the
// new value differs from the old one, a single warning and an error are
// returned together. Otherwise both results are nil.
func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment) (admission.Warnings, error) {
	current, previous := v.deployment.Spec.BackendFramework, old.Spec.BackendFramework
	if current == previous {
		return nil, nil
	}

	return admission.Warnings{"Changing spec.backendFramework may cause unexpected behavior"},
		fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation")
}
// validateService validates one entry of spec.services by delegating to a
// SharedSpecValidator. The validator is given the field path
// "spec.services[<serviceName>]".
func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) error {
	return NewSharedSpecValidator(service, fmt.Sprintf("spec.services[%s]", serviceName)).Validate()
}
// validatePVCs validates every entry of spec.pvcs in slice order and stops at
// the first entry that fails, returning that entry's error.
func (v *DynamoGraphDeploymentValidator) validatePVCs() error {
	pvcs := v.deployment.Spec.PVCs
	for idx := range pvcs {
		// Index into the slice rather than copying each element.
		if pvcErr := v.validatePVC(idx, &pvcs[idx]); pvcErr != nil {
			return pvcErr
		}
	}
	return nil
}
// validatePVC validates a single PVC configuration.
//
// A name is always required. When create is set to true, storageClass, size
// and volumeAccessMode are required as well. All violations found are
// accumulated with errors.Join, so one call can report several problems;
// nil is returned when the PVC is valid. index is the position of the PVC in
// spec.pvcs and is used only to build the error messages.
func (v *DynamoGraphDeploymentValidator) validatePVC(index int, pvc *nvidiacomv1alpha1.PVC) error {
	var problems error
	// report appends one violation; format must contain a single %d for index.
	report := func(format string) {
		problems = errors.Join(problems, fmt.Errorf(format, index))
	}

	if pvc.Name == nil || *pvc.Name == "" {
		report("spec.pvcs[%d].name is required")
	}

	// The remaining fields only matter when the PVC is to be created.
	if pvc.Create == nil || !*pvc.Create {
		return problems
	}

	if pvc.StorageClass == "" {
		report("spec.pvcs[%d].storageClass is required when create is true")
	}
	if pvc.Size.IsZero() {
		report("spec.pvcs[%d].size is required when create is true")
	}
	if pvc.VolumeAccessMode == "" {
		report("spec.pvcs[%d].volumeAccessMode is required when create is true")
	}
	return problems
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"context"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
const (
	// DynamoGraphDeploymentWebhookName is the name of the validating webhook handler for DynamoGraphDeployment.
	// The handler methods in this file also use it as their logger name.
	DynamoGraphDeploymentWebhookName = "dynamographdeployment-validating-webhook"
	// dynamoGraphDeploymentWebhookPath is the path under which RegisterWithManager
	// registers the handler on the manager's webhook server.
	dynamoGraphDeploymentWebhookPath = "/validate-nvidia-com-v1alpha1-dynamographdeployment"
)
// DynamoGraphDeploymentHandler is a handler for validating DynamoGraphDeployment resources.
// It is a thin wrapper around DynamoGraphDeploymentValidator.
// The type has no fields; each request builds its own validator.
type DynamoGraphDeploymentHandler struct{}
// NewDynamoGraphDeploymentHandler returns a new, field-less handler for the
// DynamoGraphDeployment validating webhook.
func NewDynamoGraphDeploymentHandler() *DynamoGraphDeploymentHandler {
	return new(DynamoGraphDeploymentHandler)
}
// ValidateCreate validates a DynamoGraphDeployment create request.
//
// obj must be a *DynamoGraphDeployment; any other type is rejected with an
// error. The request is logged and then checked with the stateless rules of
// DynamoGraphDeploymentValidator, whose warnings and error are returned as-is.
func (h *DynamoGraphDeploymentHandler) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	deployment, castErr := castToDynamoGraphDeployment(obj)
	if castErr != nil {
		return nil, castErr
	}

	log.FromContext(ctx).
		WithName(DynamoGraphDeploymentWebhookName).
		Info("validate create", "name", deployment.Name, "namespace", deployment.Namespace)

	return NewDynamoGraphDeploymentValidator(deployment).Validate()
}
// ValidateUpdate validates a DynamoGraphDeployment update request.
//
// Both objects must be *DynamoGraphDeployment. Validation is skipped entirely
// when the new object carries a deletion timestamp, so that finalizers can be
// removed from a resource that is being deleted. Otherwise the stateless rules
// run first on the new object, followed by the stateful (immutability) rules
// against the old object.
//
// Warnings from both passes are returned together, including when the
// stateful pass fails, so that no warning from the stateless pass is lost.
func (h *DynamoGraphDeploymentHandler) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
	logger := log.FromContext(ctx).WithName(DynamoGraphDeploymentWebhookName)

	newDeployment, err := castToDynamoGraphDeployment(newObj)
	if err != nil {
		return nil, err
	}

	logger.Info("validate update", "name", newDeployment.Name, "namespace", newDeployment.Namespace)

	// Skip validation if the resource is being deleted (to allow finalizer removal)
	if !newDeployment.DeletionTimestamp.IsZero() {
		logger.Info("skipping validation for resource being deleted", "name", newDeployment.Name)
		return nil, nil
	}

	oldDeployment, err := castToDynamoGraphDeployment(oldObj)
	if err != nil {
		return nil, err
	}

	validator := NewDynamoGraphDeploymentValidator(newDeployment)

	// Validate stateless rules
	warnings, err := validator.Validate()
	if err != nil {
		return warnings, err
	}

	// Validate stateful rules (immutability) and merge the warnings of both
	// passes regardless of the outcome.
	updateWarnings, err := validator.ValidateUpdate(oldDeployment)
	return append(warnings, updateWarnings...), err
}
// ValidateDelete validates a DynamoGraphDeployment delete request.
//
// Apart from checking that obj is a *DynamoGraphDeployment and logging the
// request, deletion is always allowed: no rules are applied.
func (h *DynamoGraphDeploymentHandler) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	deployment, castErr := castToDynamoGraphDeployment(obj)
	if castErr != nil {
		return nil, castErr
	}

	log.FromContext(ctx).
		WithName(DynamoGraphDeploymentWebhookName).
		Info("validate delete", "name", deployment.Name, "namespace", deployment.Namespace)

	return nil, nil
}
// RegisterWithManager registers the webhook with the manager.
//
// The handler is wrapped in a LeaseAwareValidator configured with the excluded
// namespaces returned by internalwebhook.GetExcludedNamespaces, turned into an
// admission webhook with panic recovery enabled, and registered on the
// manager's webhook server at dynamoGraphDeploymentWebhookPath.
// The returned error is always nil.
func (h *DynamoGraphDeploymentHandler) RegisterWithManager(mgr manager.Manager) error {
	excluded := internalwebhook.GetExcludedNamespaces()
	leaseAware := internalwebhook.NewLeaseAwareValidator(h, excluded)

	hook := admission.WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeployment{}, leaseAware)
	mgr.GetWebhookServer().Register(dynamoGraphDeploymentWebhookPath, hook.WithRecoverPanic(true))
	return nil
}
// castToDynamoGraphDeployment type-asserts obj to *DynamoGraphDeployment.
// It returns an error naming the dynamic type of obj when the assertion fails.
func castToDynamoGraphDeployment(obj runtime.Object) (*nvidiacomv1alpha1.DynamoGraphDeployment, error) {
	if deployment, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeployment); ok {
		return deployment, nil
	}
	return nil, fmt.Errorf("expected DynamoGraphDeployment but got %T", obj)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"strings"
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// TestDynamoGraphDeploymentValidator_Validate runs the stateless validator
// against a table of DynamoGraphDeployment fixtures covering the service
// rules (replicas, autoscaling, ingress, volume mounts, shared memory) and the
// PVC rules. Most cases compare the error text exactly; a case with
// errContains set instead requires every newline-separated fragment of errMsg
// to appear in the error text.
func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
	type (
		graphSpec  = nvidiacomv1alpha1.DynamoGraphDeploymentSpec
		sharedSpec = nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
		services   = map[string]*sharedSpec
	)

	replicasOK := int32(3)
	replicasNegative := int32(-1)
	pvcName := "test-pvc"
	createTrue, createFalse := true, false

	// graph wraps a spec in a deployment carrying the metadata shared by all fixtures.
	graph := func(spec graphSpec) *nvidiacomv1alpha1.DynamoGraphDeployment {
		return &nvidiacomv1alpha1.DynamoGraphDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-graph",
				Namespace: "default",
			},
			Spec: spec,
		}
	}
	// withService builds a graph whose only content is one named service.
	withService := func(name string, svc sharedSpec) *nvidiacomv1alpha1.DynamoGraphDeployment {
		return graph(graphSpec{Services: services{name: &svc}})
	}
	// withPVC builds a graph with one PVC and an empty "main" service.
	withPVC := func(pvc nvidiacomv1alpha1.PVC) *nvidiacomv1alpha1.DynamoGraphDeployment {
		return graph(graphSpec{
			PVCs:     []nvidiacomv1alpha1.PVC{pvc},
			Services: services{"main": {}},
		})
	}

	cases := []struct {
		name        string
		deployment  *nvidiacomv1alpha1.DynamoGraphDeployment
		wantErr     bool
		errMsg      string
		errContains bool
	}{
		{
			name: "valid deployment with services",
			deployment: graph(graphSpec{
				BackendFramework: "sglang",
				Services: services{
					"main": {Replicas: &replicasOK},
				},
			}),
			wantErr: false,
		},
		{
			name:       "no services",
			deployment: graph(graphSpec{Services: services{}}),
			wantErr:    true,
			errMsg:     "spec.services must have at least one service",
		},
		{
			name:       "service with invalid replicas",
			deployment: withService("main", sharedSpec{Replicas: &replicasNegative}),
			wantErr:    true,
			errMsg:     "spec.services[main].replicas must be non-negative",
		},
		{
			name: "service with invalid autoscaling",
			deployment: withService("prefill", sharedSpec{
				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
					Enabled:     true,
					MinReplicas: 10,
					MaxReplicas: 5,
				},
			}),
			wantErr: true,
			errMsg:  "spec.services[prefill].autoscaling.maxReplicas must be > minReplicas",
		},
		{
			name: "service with invalid ingress",
			deployment: withService("gateway", sharedSpec{
				Ingress: &nvidiacomv1alpha1.IngressSpec{
					Enabled: true,
					Host:    "",
				},
			}),
			wantErr: true,
			errMsg:  "spec.services[gateway].ingress.host is required when ingress is enabled",
		},
		{
			name: "pvc with create=true and missing storageClass",
			deployment: withPVC(nvidiacomv1alpha1.PVC{
				Create:           &createTrue,
				Name:             &pvcName,
				StorageClass:     "",
				Size:             resource.MustParse("10Gi"),
				VolumeAccessMode: corev1.ReadWriteOnce,
			}),
			wantErr: true,
			errMsg:  "spec.pvcs[0].storageClass is required when create is true",
		},
		{
			name: "pvc with create=true and missing size",
			deployment: withPVC(nvidiacomv1alpha1.PVC{
				Create:           &createTrue,
				Name:             &pvcName,
				StorageClass:     "standard",
				Size:             resource.Quantity{},
				VolumeAccessMode: corev1.ReadWriteOnce,
			}),
			wantErr: true,
			errMsg:  "spec.pvcs[0].size is required when create is true",
		},
		{
			name: "pvc with create=true and missing volumeAccessMode",
			deployment: withPVC(nvidiacomv1alpha1.PVC{
				Create:           &createTrue,
				Name:             &pvcName,
				StorageClass:     "standard",
				Size:             resource.MustParse("10Gi"),
				VolumeAccessMode: "",
			}),
			wantErr: true,
			errMsg:  "spec.pvcs[0].volumeAccessMode is required when create is true",
		},
		{
			name: "pvc with create=false and missing fields",
			deployment: withPVC(nvidiacomv1alpha1.PVC{
				Create: &createFalse,
				Name:   &pvcName,
			}),
			wantErr: false,
		},
		{
			name: "pvc with missing name",
			deployment: withPVC(nvidiacomv1alpha1.PVC{
				Create: &createFalse,
			}),
			wantErr: true,
			errMsg:  "spec.pvcs[0].name is required",
		},
		{
			name: "pvc with multiple errors (name, storageClass, size, volumeAccessMode all missing)",
			deployment: withPVC(nvidiacomv1alpha1.PVC{
				Create: &createTrue,
			}),
			wantErr:     true,
			errMsg:      "spec.pvcs[0].name is required\nspec.pvcs[0].storageClass is required when create is true\nspec.pvcs[0].size is required when create is true\nspec.pvcs[0].volumeAccessMode is required when create is true",
			errContains: true,
		},
		{
			name: "valid pvc with create=true",
			deployment: withPVC(nvidiacomv1alpha1.PVC{
				Create:           &createTrue,
				Name:             &pvcName,
				StorageClass:     "standard",
				Size:             resource.MustParse("10Gi"),
				VolumeAccessMode: corev1.ReadWriteOnce,
			}),
			wantErr: false,
		},
		{
			name: "service with invalid volume mount",
			deployment: withService("main", sharedSpec{
				VolumeMounts: []nvidiacomv1alpha1.VolumeMount{
					{
						Name:                  "data",
						UseAsCompilationCache: false,
					},
				},
			}),
			wantErr: true,
			errMsg:  "spec.services[main].volumeMounts[0].mountPoint is required when useAsCompilationCache is false",
		},
		{
			name: "service with invalid shared memory",
			deployment: withService("main", sharedSpec{
				SharedMemory: &nvidiacomv1alpha1.SharedMemorySpec{
					Disabled: false,
					Size:     resource.Quantity{},
				},
			}),
			wantErr: true,
			errMsg:  "spec.services[main].sharedMemory.size is required when disabled is false",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			_, err := NewDynamoGraphDeploymentValidator(tc.deployment).Validate()

			if gotErr := err != nil; gotErr != tc.wantErr {
				t.Errorf("DynamoGraphDeploymentValidator.Validate() error = %v, wantErr %v", err, tc.wantErr)
				return
			}
			if !tc.wantErr {
				return
			}

			got := err.Error()
			if !tc.errContains {
				if got != tc.errMsg {
					t.Errorf("DynamoGraphDeploymentValidator.Validate() error message = %v, want %v", got, tc.errMsg)
				}
				return
			}

			// For multiple errors, every expected fragment must be present.
			for _, fragment := range strings.Split(tc.errMsg, "\n") {
				if !strings.Contains(got, fragment) {
					t.Errorf("DynamoGraphDeploymentValidator.Validate() error message = %v, want to contain %v", got, fragment)
				}
			}
		})
	}
}
// TestDynamoGraphDeploymentValidator_ValidateUpdate checks the update-time
// rules of the DynamoGraphDeployment validator: an unchanged spec and an added
// service are accepted, while a changed backend framework is rejected with the
// expected error and warning.
//
// Cases that do not expect warnings assert that none are returned, so a
// spurious warning on an accepted update fails the test.
func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) {
	tests := []struct {
		name            string
		oldDeployment   *nvidiacomv1alpha1.DynamoGraphDeployment
		newDeployment   *nvidiacomv1alpha1.DynamoGraphDeployment
		wantErr         bool
		wantWarnings    bool
		errMsg          string
		expectedWarnMsg string
	}{
		{
			name: "no changes",
			oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
					BackendFramework: "sglang",
				},
			},
			newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
					BackendFramework: "sglang",
				},
			},
			wantErr: false,
		},
		{
			name: "changing backend framework",
			oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
					BackendFramework: "sglang",
				},
			},
			newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
					BackendFramework: "vllm",
				},
			},
			wantErr:         true,
			wantWarnings:    true,
			errMsg:          "spec.backendFramework is immutable and cannot be changed after creation",
			expectedWarnMsg: "Changing spec.backendFramework may cause unexpected behavior",
		},
		{
			name: "adding new service is allowed",
			oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
					BackendFramework: "sglang",
					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
						"main": {},
					},
				},
			},
			newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
					BackendFramework: "sglang",
					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
						"main":    {},
						"prefill": {},
					},
				},
			},
			wantErr: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			validator := NewDynamoGraphDeploymentValidator(tt.newDeployment)
			warnings, err := validator.ValidateUpdate(tt.oldDeployment)

			if (err != nil) != tt.wantErr {
				t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if tt.wantErr && err.Error() != tt.errMsg {
				t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() error message = %v, want %v", err.Error(), tt.errMsg)
			}

			// The absence of warnings is part of the contract for cases that
			// do not expect any.
			if !tt.wantWarnings {
				if len(warnings) != 0 {
					t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() unexpected warnings = %v", warnings)
				}
				return
			}
			if len(warnings) == 0 {
				t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() expected warnings but got none")
				return
			}
			if warnings[0] != tt.expectedWarnMsg {
				t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() warning = %v, want %v", warnings[0], tt.expectedWarnMsg)
			}
		})
	}
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"errors"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/util/yaml"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
// This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentRequestValidator struct {
	// request is the resource being validated. Validate dereferences it
	// without a nil check.
	request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
	// isClusterWideOperator records whether the operator runs cluster-wide.
	// When false, Validate rejects requests with spec.enableGpuDiscovery set.
	isClusterWideOperator bool
}
// NewDynamoGraphDeploymentRequestValidator returns a validator bound to the
// given DynamoGraphDeploymentRequest.
// The isClusterWide parameter indicates whether the operator is running in
// cluster-wide or namespace-restricted mode; it is stored on the validator
// and consulted by Validate.
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool) *DynamoGraphDeploymentRequestValidator {
	validator := new(DynamoGraphDeploymentRequestValidator)
	validator.request = request
	validator.isClusterWideOperator = isClusterWide
	return validator
}
// Validate performs stateless validation on the DynamoGraphDeploymentRequest.
//
// Errors are accumulated with errors.Join, in this order:
//   - spec.profilingConfig.profilerImage must be set;
//   - spec.profilingConfig.config must be present and non-empty;
//   - spec.enableGpuDiscovery may only be true for cluster-wide operators;
//   - a present config must parse as YAML.
//
// When the config parses, a warning is added for each of engine.backend and
// deployment.model that is set in the config to a non-empty value different
// from spec.backend / spec.model respectively.
//
// Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) {
	var (
		warnings admission.Warnings
		problems error
	)

	profiling := v.request.Spec.ProfilingConfig
	hasConfig := profiling.Config != nil && len(profiling.Config.Raw) > 0

	if profiling.ProfilerImage == "" {
		problems = errors.Join(problems, errors.New("spec.profilingConfig.profilerImage is required"))
	}
	if !hasConfig {
		problems = errors.Join(problems, errors.New("spec.profilingConfig.config is required and must not be empty"))
	}
	if v.request.Spec.EnableGpuDiscovery && !v.isClusterWideOperator {
		problems = errors.Join(problems, errors.New("spec.enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in spec.profilingConfig.config"))
	}

	// Without a config there is nothing left to inspect.
	if !hasConfig {
		return warnings, problems
	}

	var config map[string]interface{}
	if parseErr := yaml.Unmarshal(profiling.Config.Raw, &config); parseErr != nil {
		return warnings, errors.Join(problems, fmt.Errorf("failed to parse spec.profilingConfig.config: %w", parseErr))
	}

	// configString returns config[section][key] when it is a string nested in
	// a map, and "" in every other case.
	configString := func(section, key string) string {
		group, ok := config[section].(map[string]interface{})
		if !ok {
			return ""
		}
		value, _ := group[key].(string)
		return value
	}

	// Values in the config that differ from the spec fields get overwritten,
	// so tell the user about it.
	if backend := configString("engine", "backend"); backend != "" && backend != v.request.Spec.Backend {
		warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)", backend, v.request.Spec.Backend))
	}
	if model := configString("deployment", "model"); model != "" && model != v.request.Spec.Model {
		warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)", model, v.request.Spec.Model))
	}

	return warnings, problems
}
// ValidateUpdate is the hook for stateful validation that compares the old
// DynamoGraphDeploymentRequest with the new one held by the validator.
// It currently performs no checks and always returns no warnings and a nil error.
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
	// TODO: Add update validation logic for DynamoGraphDeploymentRequest
	// Placeholder for future immutability checks
	var (
		warnings admission.Warnings
		err      error
	)
	return warnings, err
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"context"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
const (
// DynamoGraphDeploymentRequestWebhookName is the name of the validating webhook handler for DynamoGraphDeploymentRequest.
// The handler methods in this file use it as the name of their logger.
DynamoGraphDeploymentRequestWebhookName = "dynamographdeploymentrequest-validating-webhook"
// dynamoGraphDeploymentRequestWebhookPath is the path at which RegisterWithManager
// registers the webhook on the manager's webhook server.
dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1alpha1-dynamographdeploymentrequest"
)
// DynamoGraphDeploymentRequestHandler is a handler for validating DynamoGraphDeploymentRequest resources.
// It is a thin wrapper around DynamoGraphDeploymentRequestValidator.
type DynamoGraphDeploymentRequestHandler struct {
// isClusterWideOperator is passed to every DynamoGraphDeploymentRequestValidator
// that the handler creates.
isClusterWideOperator bool
}
// NewDynamoGraphDeploymentRequestHandler builds the webhook handler for DynamoGraphDeploymentRequest.
// isClusterWide records whether the operator runs in cluster-wide (true) or
// namespace-restricted (false) mode.
func NewDynamoGraphDeploymentRequestHandler(isClusterWide bool) *DynamoGraphDeploymentRequestHandler {
	handler := new(DynamoGraphDeploymentRequestHandler)
	handler.isClusterWideOperator = isClusterWide
	return handler
}
// ValidateCreate handles a DynamoGraphDeploymentRequest create request.
// It returns an error if obj is not a *DynamoGraphDeploymentRequest; otherwise
// it returns the result of the stateless validator.
func (h *DynamoGraphDeploymentRequestHandler) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	request, castErr := castToDynamoGraphDeploymentRequest(obj)
	if castErr != nil {
		return nil, castErr
	}

	log.FromContext(ctx).
		WithName(DynamoGraphDeploymentRequestWebhookName).
		Info("validate create", "name", request.Name, "namespace", request.Namespace)

	// Delegate to the validator.
	return NewDynamoGraphDeploymentRequestValidator(request, h.isClusterWideOperator).Validate()
}
// ValidateUpdate validates a DynamoGraphDeploymentRequest update request.
//
// Validation is skipped when the new object has a deletion timestamp, so that
// finalizers can still be removed from a resource that is being deleted.
// Otherwise the stateless rules run first; if they fail, their warnings and
// error are returned immediately. The stateful (old vs. new) rules run next,
// and the warnings from both phases are returned together, including when the
// stateful phase fails.
//
// Returns an error if either object is not a *DynamoGraphDeploymentRequest.
func (h *DynamoGraphDeploymentRequestHandler) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
	logger := log.FromContext(ctx).WithName(DynamoGraphDeploymentRequestWebhookName)

	newRequest, err := castToDynamoGraphDeploymentRequest(newObj)
	if err != nil {
		return nil, err
	}
	logger.Info("validate update", "name", newRequest.Name, "namespace", newRequest.Namespace)

	// Skip validation if the resource is being deleted (to allow finalizer removal)
	if !newRequest.DeletionTimestamp.IsZero() {
		logger.Info("skipping validation for resource being deleted", "name", newRequest.Name)
		return nil, nil
	}

	oldRequest, err := castToDynamoGraphDeploymentRequest(oldObj)
	if err != nil {
		return nil, err
	}

	validator := NewDynamoGraphDeploymentRequestValidator(newRequest, h.isClusterWideOperator)

	// Validate stateless rules
	warnings, err := validator.Validate()
	if err != nil {
		return warnings, err
	}

	// Validate stateful rules (immutability). Merge the warnings before
	// returning so the stateless warnings are not lost when this phase fails.
	updateWarnings, err := validator.ValidateUpdate(oldRequest)
	warnings = append(warnings, updateWarnings...)
	return warnings, err
}
// ValidateDelete handles a DynamoGraphDeploymentRequest delete request.
// Apart from checking that obj is a *DynamoGraphDeploymentRequest and logging
// the call, no validation is performed.
func (h *DynamoGraphDeploymentRequestHandler) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	request, castErr := castToDynamoGraphDeploymentRequest(obj)
	if castErr != nil {
		return nil, castErr
	}

	log.FromContext(ctx).
		WithName(DynamoGraphDeploymentRequestWebhookName).
		Info("validate delete", "name", request.Name, "namespace", request.Namespace)

	// Deletion needs no further checks.
	return nil, nil
}
// RegisterWithManager registers the webhook on the manager's webhook server at
// dynamoGraphDeploymentRequestWebhookPath.
// The handler is wrapped with LeaseAwareValidator, configured with the excluded
// namespaces, before registration. The returned error is always nil.
func (h *DynamoGraphDeploymentRequestHandler) RegisterWithManager(mgr manager.Manager) error {
	// Lease-aware wrapper around this handler for cluster-wide coordination.
	leaseAware := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())

	hook := admission.WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}, leaseAware)
	hook = hook.WithRecoverPanic(true)

	mgr.GetWebhookServer().Register(dynamoGraphDeploymentRequestWebhookPath, hook)
	return nil
}
// castToDynamoGraphDeploymentRequest type-asserts obj to *DynamoGraphDeploymentRequest.
// It returns an error naming the actual dynamic type when the assertion fails.
func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1alpha1.DynamoGraphDeploymentRequest, error) {
	if request, isRequest := obj.(*nvidiacomv1alpha1.DynamoGraphDeploymentRequest); isRequest {
		return request, nil
	}
	return nil, fmt.Errorf("expected DynamoGraphDeploymentRequest but got %T", obj)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"strings"
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}`
configWithDifferentBackend := `{"engine": {"backend": "sglang"}}`
configWithDifferentModel := `{"deployment": {"model": "different-model"}}`
invalidYAML := `{invalid yaml`
tests := []struct {
name string
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
isClusterWide bool
wantErr bool
errMsg string
wantWarnings bool
expectedWarning string
errContains bool
}{
{
name: "valid request",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: true,
wantErr: false,
},
{
name: "missing profiler image",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.profilerImage is required",
},
{
name: "missing profiling config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: nil,
},
},
},
isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.config is required and must not be empty",
},
{
name: "empty profiling config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte{},
},
},
},
},
isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.config is required and must not be empty",
},
{
name: "enableGpuDiscovery true for cluster-wide operator",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: true,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: true,
wantErr: false,
},
{
name: "enableGpuDiscovery true for namespace-restricted operator",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: true,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: false,
wantErr: true,
errMsg: "spec.enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in spec.profilingConfig.config",
},
{
name: "enableGpuDiscovery false for namespace-restricted operator",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: false,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
},
},
},
},
isClusterWide: false,
wantErr: false,
},
{
name: "invalid config YAML",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(invalidYAML),
},
},
},
},
isClusterWide: true,
wantErr: true,
errMsg: "failed to parse spec.profilingConfig.config: error converting YAML to JSON: yaml: line 1: did not find expected ',' or '}'",
},
{
name: "warning for different backend in config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(configWithDifferentBackend),
},
},
},
},
isClusterWide: true,
wantErr: false,
wantWarnings: true,
expectedWarning: "spec.profilingConfig.config.engine.backend (sglang) will be overwritten by spec.backend (vllm)",
},
{
name: "warning for different model in config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(configWithDifferentModel),
},
},
},
},
isClusterWide: true,
wantErr: false,
wantWarnings: true,
expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)",
},
{
name: "multiple errors (missing profiler image, missing config, and enableGpuDiscovery for namespace-restricted)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
EnableGpuDiscovery: true,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "",
Config: nil,
},
},
},
isClusterWide: false,
wantErr: true,
errMsg: "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty\nspec.enableGpuDiscovery can only be set to true for cluster-wide operators",
errContains: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentRequestValidator(tt.request, tt.isClusterWide)
warnings, err := validator.Validate()
if (err != nil) != tt.wantErr {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error = %v, wantErr %v", err, tt.wantErr)
return
}
if tt.wantErr {
if tt.errContains {
// For multiple errors, check that all expected error messages are present
errStr := err.Error()
for _, expectedMsg := range strings.Split(tt.errMsg, "\n") {
if !strings.Contains(errStr, expectedMsg) {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error message = %v, want to contain %v", errStr, expectedMsg)
}
}
} else {
if err.Error() != tt.errMsg {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error message = %v, want %v", err.Error(), tt.errMsg)
}
}
}
if tt.wantWarnings && len(warnings) == 0 {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() expected warnings but got none")
}
if tt.wantWarnings && len(warnings) > 0 && warnings[0] != tt.expectedWarning {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() warning = %v, want %v", warnings[0], tt.expectedWarning)
}
})
}
}
// TestDynamoGraphDeploymentRequestValidator_ValidateUpdate checks that the
// stateful update validation accepts an unchanged request and a request whose
// model name changed.
func TestDynamoGraphDeploymentRequestValidator_ValidateUpdate(t *testing.T) {
	validConfig := `{"engine": {"backend": "vllm"}}`

	// requestWithModel builds a request that differs between cases only in spec.model.
	requestWithModel := func(model string) *nvidiacomv1alpha1.DynamoGraphDeploymentRequest {
		return &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
			Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
				Model:   model,
				Backend: "vllm",
				ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
					ProfilerImage: "profiler:latest",
					Config: &apiextensionsv1.JSON{
						Raw: []byte(validConfig),
					},
				},
			},
		}
	}

	tests := []struct {
		name         string
		oldRequest   *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
		newRequest   *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
		wantErr      bool
		wantWarnings bool
	}{
		{
			name:       "no changes",
			oldRequest: requestWithModel("llama-3-8b"),
			newRequest: requestWithModel("llama-3-8b"),
			wantErr:    false,
		},
		{
			name:       "changing model name is allowed",
			oldRequest: requestWithModel("llama-3-8b"),
			newRequest: requestWithModel("llama-3-70b"),
			wantErr:    false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			warnings, err := NewDynamoGraphDeploymentRequestValidator(tt.newRequest, true).ValidateUpdate(tt.oldRequest)

			if gotErr := err != nil; gotErr != tt.wantErr {
				t.Errorf("DynamoGraphDeploymentRequestValidator.ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if tt.wantWarnings && len(warnings) == 0 {
				t.Errorf("DynamoGraphDeploymentRequestValidator.ValidateUpdate() expected warnings but got none")
			}
		})
	}
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"fmt"
"strings"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// DynamoModelValidator validates DynamoModel resources.
// This validator can be used by both webhooks and controllers for consistent validation.
type DynamoModelValidator struct {
// model is the DynamoModel whose spec the validation methods inspect.
model *nvidiacomv1alpha1.DynamoModel
}
// NewDynamoModelValidator returns a validator bound to the given DynamoModel.
// The model pointer is stored as given; the model is not copied.
func NewDynamoModelValidator(model *nvidiacomv1alpha1.DynamoModel) *DynamoModelValidator {
	validator := new(DynamoModelValidator)
	validator.model = model
	return validator
}
// Validate runs the stateless checks on the DynamoModel and stops at the first
// failure:
//   - spec.modelName and spec.baseModelName must be non-empty;
//   - when spec.modelType is "lora", spec.source must be set, its URI must be
//     non-empty, and the URI must pass validateSourceURI.
//
// It never produces warnings; the returned error is nil when every check passes.
func (v *DynamoModelValidator) Validate() (admission.Warnings, error) {
	spec := &v.model.Spec

	switch {
	case spec.ModelName == "":
		return nil, fmt.Errorf("spec.modelName is required")
	case spec.BaseModelName == "":
		return nil, fmt.Errorf("spec.baseModelName is required")
	case spec.ModelType != "lora":
		// Only LoRA models carry source requirements.
		return nil, nil
	case spec.Source == nil:
		return nil, fmt.Errorf("spec.source is required when modelType is 'lora'")
	case spec.Source.URI == "":
		return nil, fmt.Errorf("spec.source.uri must be specified when modelType is 'lora'")
	}

	// The LoRA source is present; what remains is the URI format.
	return nil, v.validateSourceURI(spec.Source.URI)
}
// ValidateUpdate compares the validator's DynamoModel with its previous
// version and rejects changes to the immutable fields spec.modelType and
// spec.baseModelName. Only the first violation is reported, and it is returned
// together with a single explanatory warning.
// Returns nil warnings and a nil error when neither field changed.
func (v *DynamoModelValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoModel) (admission.Warnings, error) {
	current, previous := &v.model.Spec, &old.Spec

	switch {
	case current.ModelType != previous.ModelType:
		// modelType is immutable
		return admission.Warnings{"Changing spec.modelType may cause unexpected behavior"},
			fmt.Errorf("spec.modelType is immutable and cannot be changed after creation")
	case current.BaseModelName != previous.BaseModelName:
		// baseModelName is immutable
		return admission.Warnings{"Changing spec.baseModelName will break endpoint discovery"},
			fmt.Errorf("spec.baseModelName is immutable and cannot be changed after creation")
	}

	return nil, nil
}
// validateSourceURI validates the model source URI format.
//
// The URI must be non-empty, must start with one of the supported schemes
// ("s3://" or "hf://"), and must contain something after the scheme. A bare
// scheme such as "s3://" is rejected because it names no location.
// Returns nil when the URI is acceptable.
func (v *DynamoModelValidator) validateSourceURI(uri string) error {
	if uri == "" {
		return fmt.Errorf("source URI cannot be empty")
	}

	// Check for supported schemes
	for _, scheme := range []string{"s3://", "hf://"} {
		if !strings.HasPrefix(uri, scheme) {
			continue
		}
		// A URI that is only the scheme points at nothing.
		if len(uri) == len(scheme) {
			return fmt.Errorf("source URI must include a location after %q, got: %s", scheme, uri)
		}
		return nil
	}

	return fmt.Errorf("source URI must start with 's3://' or 'hf://', got: %s", uri)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"context"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
const (
// DynamoModelWebhookName is the name of the validating webhook handler for DynamoModel.
// The handler methods in this file use it as the name of their logger.
DynamoModelWebhookName = "dynamomodel-validating-webhook"
// dynamoModelWebhookPath is the path at which RegisterWithManager registers
// the webhook on the manager's webhook server.
dynamoModelWebhookPath = "/validate-nvidia-com-v1alpha1-dynamomodel"
)
// DynamoModelHandler is a handler for validating DynamoModel resources.
// It is a thin wrapper around DynamoModelValidator.
// The type has no fields; each request builds its own validator.
type DynamoModelHandler struct{}
// NewDynamoModelHandler returns a new webhook handler for DynamoModel.
func NewDynamoModelHandler() *DynamoModelHandler {
	return new(DynamoModelHandler)
}
// ValidateCreate handles a DynamoModel create request.
// It returns an error if obj is not a *DynamoModel; otherwise it returns the
// result of the stateless validator.
func (h *DynamoModelHandler) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	model, castErr := castToDynamoModel(obj)
	if castErr != nil {
		return nil, castErr
	}

	log.FromContext(ctx).
		WithName(DynamoModelWebhookName).
		Info("validate create", "name", model.Name, "namespace", model.Namespace)

	// Delegate to the validator.
	return NewDynamoModelValidator(model).Validate()
}
// ValidateUpdate validates a DynamoModel update request.
//
// Validation is skipped when the new object has a deletion timestamp, so that
// finalizers can still be removed from a resource that is being deleted.
// Otherwise the stateless rules run first; if they fail, their warnings and
// error are returned immediately. The stateful (immutability) rules run next,
// and the warnings from both phases are returned together, including when the
// stateful phase fails.
//
// Returns an error if either object is not a *DynamoModel.
func (h *DynamoModelHandler) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
	logger := log.FromContext(ctx).WithName(DynamoModelWebhookName)

	newModel, err := castToDynamoModel(newObj)
	if err != nil {
		return nil, err
	}
	logger.Info("validate update", "name", newModel.Name, "namespace", newModel.Namespace)

	// Skip validation if the resource is being deleted (to allow finalizer removal)
	if !newModel.DeletionTimestamp.IsZero() {
		logger.Info("skipping validation for resource being deleted", "name", newModel.Name)
		return nil, nil
	}

	oldModel, err := castToDynamoModel(oldObj)
	if err != nil {
		return nil, err
	}

	validator := NewDynamoModelValidator(newModel)

	// Validate stateless rules
	warnings, err := validator.Validate()
	if err != nil {
		return warnings, err
	}

	// Validate stateful rules (immutability). Merge the warnings before
	// returning so the stateless warnings are not lost when this phase fails.
	updateWarnings, err := validator.ValidateUpdate(oldModel)
	warnings = append(warnings, updateWarnings...)
	return warnings, err
}
// ValidateDelete handles a DynamoModel delete request.
// Apart from checking that obj is a *DynamoModel and logging the call, no
// validation is performed.
func (h *DynamoModelHandler) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	model, castErr := castToDynamoModel(obj)
	if castErr != nil {
		return nil, castErr
	}

	log.FromContext(ctx).
		WithName(DynamoModelWebhookName).
		Info("validate delete", "name", model.Name, "namespace", model.Namespace)

	// Deletion needs no further checks.
	return nil, nil
}
// RegisterWithManager registers the webhook on the manager's webhook server at
// dynamoModelWebhookPath.
// The handler is wrapped with LeaseAwareValidator, configured with the excluded
// namespaces, before registration. The returned error is always nil.
func (h *DynamoModelHandler) RegisterWithManager(mgr manager.Manager) error {
	// Lease-aware wrapper around this handler for cluster-wide coordination.
	leaseAware := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())

	hook := admission.WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoModel{}, leaseAware)
	hook = hook.WithRecoverPanic(true)

	mgr.GetWebhookServer().Register(dynamoModelWebhookPath, hook)
	return nil
}
// castToDynamoModel type-asserts obj to *DynamoModel.
// It returns an error naming the actual dynamic type when the assertion fails.
func castToDynamoModel(obj runtime.Object) (*nvidiacomv1alpha1.DynamoModel, error) {
	if model, isModel := obj.(*nvidiacomv1alpha1.DynamoModel); isModel {
		return model, nil
	}
	return nil, fmt.Errorf("expected DynamoModel but got %T", obj)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// TestDynamoModelValidator_Validate exercises the stateless validation rules
// of DynamoModelValidator: required names, and the source requirements that
// apply to LoRA models. Error messages are compared exactly.
func TestDynamoModelValidator_Validate(t *testing.T) {
	// newModel wraps a spec in a DynamoModel living in the "default" namespace.
	newModel := func(name string, spec nvidiacomv1alpha1.DynamoModelSpec) *nvidiacomv1alpha1.DynamoModel {
		return &nvidiacomv1alpha1.DynamoModel{
			ObjectMeta: metav1.ObjectMeta{
				Name:      name,
				Namespace: "default",
			},
			Spec: spec,
		}
	}
	// loraSpec is the LoRA spec shared by the LoRA cases; only the source varies.
	loraSpec := func(source *nvidiacomv1alpha1.ModelSource) nvidiacomv1alpha1.DynamoModelSpec {
		return nvidiacomv1alpha1.DynamoModelSpec{
			ModelName:     "llama-3-8b-custom",
			BaseModelName: "llama-3-8b",
			ModelType:     "lora",
			Source:        source,
		}
	}
	// sourceAt builds a model source pointing at the given URI.
	sourceAt := func(uri string) *nvidiacomv1alpha1.ModelSource {
		return &nvidiacomv1alpha1.ModelSource{URI: uri}
	}

	tests := []struct {
		name    string
		model   *nvidiacomv1alpha1.DynamoModel
		wantErr bool
		errMsg  string
	}{
		{
			name: "valid base model",
			model: newModel("test-model", nvidiacomv1alpha1.DynamoModelSpec{
				ModelName:     "llama-3-8b",
				BaseModelName: "llama-3-8b",
				ModelType:     "base",
			}),
			wantErr: false,
		},
		{
			name:    "valid lora model with s3 source",
			model:   newModel("test-lora", loraSpec(sourceAt("s3://my-bucket/lora-adapter"))),
			wantErr: false,
		},
		{
			name:    "valid lora model with hf source",
			model:   newModel("test-lora", loraSpec(sourceAt("hf://organization/model-name"))),
			wantErr: false,
		},
		{
			name: "missing modelName",
			model: newModel("test-model", nvidiacomv1alpha1.DynamoModelSpec{
				ModelName:     "",
				BaseModelName: "llama-3-8b",
				ModelType:     "base",
			}),
			wantErr: true,
			errMsg:  "spec.modelName is required",
		},
		{
			name: "missing baseModelName",
			model: newModel("test-model", nvidiacomv1alpha1.DynamoModelSpec{
				ModelName:     "llama-3-8b",
				BaseModelName: "",
				ModelType:     "base",
			}),
			wantErr: true,
			errMsg:  "spec.baseModelName is required",
		},
		{
			name:    "lora without source",
			model:   newModel("test-lora", loraSpec(nil)),
			wantErr: true,
			errMsg:  "spec.source is required when modelType is 'lora'",
		},
		{
			name:    "lora with empty URI",
			model:   newModel("test-lora", loraSpec(sourceAt(""))),
			wantErr: true,
			errMsg:  "spec.source.uri must be specified when modelType is 'lora'",
		},
		{
			name:    "lora with invalid URI scheme",
			model:   newModel("test-lora", loraSpec(sourceAt("http://example.com/model"))),
			wantErr: true,
			errMsg:  "source URI must start with 's3://' or 'hf://', got: http://example.com/model",
		},
		{
			name:    "lora with file:// URI scheme",
			model:   newModel("test-lora", loraSpec(sourceAt("file:///local/path"))),
			wantErr: true,
			errMsg:  "source URI must start with 's3://' or 'hf://', got: file:///local/path",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			_, err := NewDynamoModelValidator(tt.model).Validate()

			if gotErr := err != nil; gotErr != tt.wantErr {
				t.Errorf("DynamoModelValidator.Validate() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if !tt.wantErr {
				return
			}
			if got := err.Error(); got != tt.errMsg {
				t.Errorf("DynamoModelValidator.Validate() error message = %v, want %v", got, tt.errMsg)
			}
		})
	}
}
// TestDynamoModelValidator_ValidateUpdate is a table-driven test of
// DynamoModelValidator.ValidateUpdate.
//
// Each case builds a validator around the updated model, calls ValidateUpdate
// with the previous model, and checks:
//   - whether an error was returned, and its exact message when one is expected;
//   - when warnings are expected, that at least one was returned and that the
//     first one matches the expected text.
//
// The table expects changes to spec.modelType and spec.baseModelName to be
// rejected (with a warning), while changes to spec.modelName and to the LoRA
// source URI are expected to be accepted.
func TestDynamoModelValidator_ValidateUpdate(t *testing.T) {
	type updateCase struct {
		name            string
		before          *nvidiacomv1alpha1.DynamoModel // object as it existed prior to the update
		after           *nvidiacomv1alpha1.DynamoModel // object submitted by the update
		wantErr         bool
		wantWarnings    bool
		errMsg          string
		expectedWarnMsg string
	}

	cases := []updateCase{
		{
			name: "no changes",
			before: &nvidiacomv1alpha1.DynamoModel{
				Spec: nvidiacomv1alpha1.DynamoModelSpec{
					ModelName:     "llama-3-8b",
					BaseModelName: "llama-3-8b",
					ModelType:     "base",
				},
			},
			after: &nvidiacomv1alpha1.DynamoModel{
				Spec: nvidiacomv1alpha1.DynamoModelSpec{
					ModelName:     "llama-3-8b",
					BaseModelName: "llama-3-8b",
					ModelType:     "base",
				},
			},
			wantErr: false,
		},
		{
			name: "changing modelType",
			before: &nvidiacomv1alpha1.DynamoModel{
				Spec: nvidiacomv1alpha1.DynamoModelSpec{
					ModelName:     "llama-3-8b",
					BaseModelName: "llama-3-8b",
					ModelType:     "base",
				},
			},
			after: &nvidiacomv1alpha1.DynamoModel{
				Spec: nvidiacomv1alpha1.DynamoModelSpec{
					ModelName:     "llama-3-8b",
					BaseModelName: "llama-3-8b",
					ModelType:     "lora",
					Source: &nvidiacomv1alpha1.ModelSource{
						URI: "s3://bucket/adapter",
					},
				},
			},
			wantErr:         true,
			wantWarnings:    true,
			errMsg:          "spec.modelType is immutable and cannot be changed after creation",
			expectedWarnMsg: "Changing spec.modelType may cause unexpected behavior",
		},
		{
			name: "changing baseModelName",
			before: &nvidiacomv1alpha1.DynamoModel{
				Spec: nvidiacomv1alpha1.DynamoModelSpec{
					ModelName:     "llama-3-8b",
					BaseModelName: "llama-3-8b",
					ModelType:     "base",
				},
			},
			after: &nvidiacomv1alpha1.DynamoModel{
				Spec: nvidiacomv1alpha1.DynamoModelSpec{
					ModelName:     "llama-3-8b",
					BaseModelName: "llama-3-70b",
					ModelType:     "base",
				},
			},
			wantErr:         true,
			wantWarnings:    true,
			errMsg:          "spec.baseModelName is immutable and cannot be changed after creation",
			expectedWarnMsg: "Changing spec.baseModelName will break endpoint discovery",
		},
		{
			name: "changing modelName is allowed",
			before: &nvidiacomv1alpha1.DynamoModel{
				Spec: nvidiacomv1alpha1.DynamoModelSpec{
					ModelName:     "llama-3-8b",
					BaseModelName: "llama-3-8b",
					ModelType:     "base",
				},
			},
			after: &nvidiacomv1alpha1.DynamoModel{
				Spec: nvidiacomv1alpha1.DynamoModelSpec{
					ModelName:     "llama-3-8b-renamed",
					BaseModelName: "llama-3-8b",
					ModelType:     "base",
				},
			},
			wantErr: false,
		},
		{
			name: "updating source URI for lora is allowed",
			before: &nvidiacomv1alpha1.DynamoModel{
				Spec: nvidiacomv1alpha1.DynamoModelSpec{
					ModelName:     "llama-3-8b-custom",
					BaseModelName: "llama-3-8b",
					ModelType:     "lora",
					Source: &nvidiacomv1alpha1.ModelSource{
						URI: "s3://bucket/adapter-v1",
					},
				},
			},
			after: &nvidiacomv1alpha1.DynamoModel{
				Spec: nvidiacomv1alpha1.DynamoModelSpec{
					ModelName:     "llama-3-8b-custom",
					BaseModelName: "llama-3-8b",
					ModelType:     "lora",
					Source: &nvidiacomv1alpha1.ModelSource{
						URI: "s3://bucket/adapter-v2",
					},
				},
			},
			wantErr: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			warnings, err := NewDynamoModelValidator(tc.after).ValidateUpdate(tc.before)

			// An unexpected error outcome makes the remaining checks meaningless.
			failed := err != nil
			if failed != tc.wantErr {
				t.Errorf("DynamoModelValidator.ValidateUpdate() error = %v, wantErr %v", err, tc.wantErr)
				return
			}
			if failed {
				if got := err.Error(); got != tc.errMsg {
					t.Errorf("DynamoModelValidator.ValidateUpdate() error message = %v, want %v", got, tc.errMsg)
				}
			}

			// Warnings are only inspected for cases that expect them.
			if !tc.wantWarnings {
				return
			}
			if len(warnings) == 0 {
				t.Errorf("DynamoModelValidator.ValidateUpdate() expected warnings but got none")
				return
			}
			if first := warnings[0]; first != tc.expectedWarnMsg {
				t.Errorf("DynamoModelValidator.ValidateUpdate() warning = %v, want %v", first, tc.expectedWarnMsg)
			}
		})
	}
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
)
// SharedSpecValidator checks the fields of a DynamoComponentDeploymentSharedSpec.
// It exists so that DynamoComponentDeploymentValidator and
// DynamoGraphDeploymentValidator can apply the same rules to the shared part
// of their specs.
type SharedSpecValidator struct {
	// spec is the shared spec under validation.
	spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
	// fieldPath is prepended to field names in error messages,
	// e.g., "spec" for DCD, "spec.services[foo]" for DGD.
	fieldPath string
}
// NewSharedSpecValidator returns a SharedSpecValidator bound to spec.
// fieldPath is the prefix used in error messages to locate the offending
// field (e.g., "spec" or "spec.services[main]").
func NewSharedSpecValidator(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec, fieldPath string) *SharedSpecValidator {
	validator := new(SharedSpecValidator)
	validator.spec = spec
	validator.fieldPath = fieldPath
	return validator
}
// Validate runs every shared-spec check and returns the first failure, or nil
// when the spec passes all of them.
//
// Checks run in this order: replicas, autoscaling (only when set), ingress
// (only when set and enabled), volume mounts (always), shared memory (only
// when set).
func (v *SharedSpecValidator) Validate() error {
	spec := v.spec

	// Replicas is optional, but a provided value may not be negative.
	if replicas := spec.Replicas; replicas != nil && *replicas < 0 {
		return fmt.Errorf("%s.replicas must be non-negative", v.fieldPath)
	}

	// Remaining checks, in execution order, each paired with the condition
	// under which it applies.
	checks := []struct {
		applies bool
		run     func() error
	}{
		{applies: spec.Autoscaling != nil, run: v.validateAutoscaling},
		{applies: spec.Ingress != nil && spec.Ingress.Enabled, run: v.validateIngress},
		{applies: true, run: v.validateVolumeMounts},
		{applies: spec.SharedMemory != nil, run: v.validateSharedMemory},
	}
	for _, check := range checks {
		if !check.applies {
			continue
		}
		if err := check.run(); err != nil {
			return err
		}
	}
	return nil
}
// validateAutoscaling checks the replica bounds of spec.Autoscaling.
// Nothing is checked while autoscaling is disabled. When enabled, minReplicas
// must be at least 1 and maxReplicas must be strictly greater than
// minReplicas. The caller must ensure spec.Autoscaling is non-nil.
func (v *SharedSpecValidator) validateAutoscaling() error {
	cfg := v.spec.Autoscaling
	switch {
	case !cfg.Enabled:
		// Bounds are irrelevant when autoscaling is turned off.
		return nil
	case cfg.MinReplicas < 1:
		return fmt.Errorf("%s.autoscaling.minReplicas must be >= 1", v.fieldPath)
	case cfg.MaxReplicas <= cfg.MinReplicas:
		return fmt.Errorf("%s.autoscaling.maxReplicas must be > minReplicas", v.fieldPath)
	default:
		return nil
	}
}
// validateIngress requires spec.Ingress.Host to be non-empty.
// It dereferences spec.Ingress unconditionally, so the caller must ensure the
// ingress section is present before calling it.
func (v *SharedSpecValidator) validateIngress() error {
	if v.spec.Ingress.Host != "" {
		return nil
	}
	return fmt.Errorf("%s.ingress.host is required when ingress is enabled", v.fieldPath)
}
// validateVolumeMounts checks every entry of spec.VolumeMounts in slice order
// and returns the first error reported by validateVolumeMount. It returns nil
// when all entries pass, including when there are no volume mounts.
func (v *SharedSpecValidator) validateVolumeMounts() error {
	mounts := v.spec.VolumeMounts
	for i := range mounts {
		// Point at the slice element itself instead of taking the address of
		// a range value variable: this avoids copying each element and avoids
		// aliasing the loop variable (shared across iterations before Go 1.22).
		if err := v.validateVolumeMount(i, &mounts[i]); err != nil {
			return err
		}
	}
	return nil
}
// validateVolumeMount checks one volume mount entry. index is the entry's
// position in spec.volumeMounts and is only used to build the error message.
// An entry is accepted when it is used as a compilation cache or when it
// declares a mountPoint; otherwise an error is returned.
func (v *SharedSpecValidator) validateVolumeMount(index int, volumeMount *nvidiacomv1alpha1.VolumeMount) error {
	if volumeMount.UseAsCompilationCache || volumeMount.MountPoint != "" {
		return nil
	}
	return fmt.Errorf("%s.volumeMounts[%d].mountPoint is required when useAsCompilationCache is false", v.fieldPath, index)
}
// validateSharedMemory checks spec.SharedMemory: unless shared memory is
// explicitly disabled, a non-zero size must be provided. The caller must
// ensure spec.SharedMemory is non-nil.
func (v *SharedSpecValidator) validateSharedMemory() error {
	shm := v.spec.SharedMemory
	// Size only matters while shared memory is in use.
	if shm.Disabled || !shm.Size.IsZero() {
		return nil
	}
	return fmt.Errorf("%s.sharedMemory.size is required when disabled is false", v.fieldPath)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package validation
import (
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/api/resource"
)
// TestSharedSpecValidator_Validate is a table-driven test of
// SharedSpecValidator.Validate.
//
// Each case builds a validator from a shared spec and a field path, runs
// Validate, and checks whether an error was returned and, when one is
// expected, that its message matches exactly. The table covers replicas,
// autoscaling bounds, ingress host, volume mounts, shared memory size, and
// the use of a custom field path in the error message.
func TestSharedSpecValidator_Validate(t *testing.T) {
	// Replica counts are referenced by pointer in the spec.
	minusOne := int32(-1)
	three := int32(3)

	type validateCase struct {
		name    string
		input   *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
		path    string // field path handed to the validator
		wantErr bool
		errMsg  string
	}

	cases := []validateCase{
		{
			name: "valid spec with all fields",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Replicas: &three,
				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
					Enabled:     true,
					MinReplicas: 1,
					MaxReplicas: 10,
				},
				Ingress: &nvidiacomv1alpha1.IngressSpec{
					Enabled: true,
					Host:    "example.com",
				},
				VolumeMounts: []nvidiacomv1alpha1.VolumeMount{
					{
						Name:       "cache",
						MountPoint: "/cache",
					},
					{
						Name:                  "compilation",
						UseAsCompilationCache: true,
					},
				},
				SharedMemory: &nvidiacomv1alpha1.SharedMemorySpec{
					Disabled: false,
					Size:     resource.MustParse("1Gi"),
				},
			},
			path:    "spec",
			wantErr: false,
		},
		{
			name: "negative replicas",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Replicas: &minusOne,
			},
			path:    "spec",
			wantErr: true,
			errMsg:  "spec.replicas must be non-negative",
		},
		{
			name: "autoscaling minReplicas too low",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
					Enabled:     true,
					MinReplicas: 0,
					MaxReplicas: 10,
				},
			},
			path:    "spec",
			wantErr: true,
			errMsg:  "spec.autoscaling.minReplicas must be >= 1",
		},
		{
			name: "autoscaling maxReplicas less than minReplicas",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
					Enabled:     true,
					MinReplicas: 5,
					MaxReplicas: 3,
				},
			},
			path:    "spec",
			wantErr: true,
			errMsg:  "spec.autoscaling.maxReplicas must be > minReplicas",
		},
		{
			name: "autoscaling disabled - no validation",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
					Enabled:     false,
					MinReplicas: 0,
					MaxReplicas: 0,
				},
			},
			path:    "spec",
			wantErr: false,
		},
		{
			name: "ingress enabled without host",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Ingress: &nvidiacomv1alpha1.IngressSpec{
					Enabled: true,
					Host:    "",
				},
			},
			path:    "spec",
			wantErr: true,
			errMsg:  "spec.ingress.host is required when ingress is enabled",
		},
		{
			name: "ingress disabled - no validation",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Ingress: &nvidiacomv1alpha1.IngressSpec{
					Enabled: false,
					Host:    "",
				},
			},
			path:    "spec",
			wantErr: false,
		},
		{
			name: "volume mount without mountPoint and not compilation cache",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				VolumeMounts: []nvidiacomv1alpha1.VolumeMount{
					{
						Name:                  "data",
						MountPoint:            "",
						UseAsCompilationCache: false,
					},
				},
			},
			path:    "spec",
			wantErr: true,
			errMsg:  "spec.volumeMounts[0].mountPoint is required when useAsCompilationCache is false",
		},
		{
			name: "volume mount with mountPoint",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				VolumeMounts: []nvidiacomv1alpha1.VolumeMount{
					{
						Name:       "data",
						MountPoint: "/data",
					},
				},
			},
			path:    "spec",
			wantErr: false,
		},
		{
			name: "volume mount as compilation cache without mountPoint",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				VolumeMounts: []nvidiacomv1alpha1.VolumeMount{
					{
						Name:                  "cache",
						UseAsCompilationCache: true,
					},
				},
			},
			path:    "spec",
			wantErr: false,
		},
		{
			name: "shared memory enabled without size",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				SharedMemory: &nvidiacomv1alpha1.SharedMemorySpec{
					Disabled: false,
					Size:     resource.Quantity{},
				},
			},
			path:    "spec",
			wantErr: true,
			errMsg:  "spec.sharedMemory.size is required when disabled is false",
		},
		{
			name: "shared memory enabled with size",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				SharedMemory: &nvidiacomv1alpha1.SharedMemorySpec{
					Disabled: false,
					Size:     resource.MustParse("2Gi"),
				},
			},
			path:    "spec",
			wantErr: false,
		},
		{
			name: "shared memory disabled without size",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				SharedMemory: &nvidiacomv1alpha1.SharedMemorySpec{
					Disabled: true,
					Size:     resource.Quantity{},
				},
			},
			path:    "spec",
			wantErr: false,
		},
		{
			name: "custom field path for service validation",
			input: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				Replicas: &minusOne,
			},
			path:    "spec.services[main]",
			wantErr: true,
			errMsg:  "spec.services[main].replicas must be non-negative",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			err := NewSharedSpecValidator(tc.input, tc.path).Validate()

			// Stop early when the error outcome itself is wrong; the message
			// comparison below would be meaningless.
			failed := err != nil
			if failed != tc.wantErr {
				t.Errorf("SharedSpecValidator.Validate() error = %v, wantErr %v", err, tc.wantErr)
				return
			}
			if !failed {
				return
			}
			if got := err.Error(); got != tc.errMsg {
				t.Errorf("SharedSpecValidator.Validate() error message = %v, want %v", got, tc.errMsg)
			}
		})
	}
}
...@@ -7,5 +7,6 @@ Deployment Guide ...@@ -7,5 +7,6 @@ Deployment Guide
Kubernetes Quickstart <../kubernetes/README> Kubernetes Quickstart <../kubernetes/README>
Detailed Installation Guide <../kubernetes/installation_guide> Detailed Installation Guide <../kubernetes/installation_guide>
Dynamo Operator <../kubernetes/dynamo_operator> Dynamo Operator <../kubernetes/dynamo_operator>
Webhooks <../kubernetes/webhooks>
Minikube Setup <../kubernetes/deployment/minikube> Minikube Setup <../kubernetes/deployment/minikube>
Managing Models with DynamoModel <../kubernetes/deployment/dynamomodel-guide> Managing Models with DynamoModel <../kubernetes/deployment/dynamomodel-guide>
...@@ -115,6 +115,21 @@ For a user-focused guide on deploying and managing models with DynamoModel, see: ...@@ -115,6 +115,21 @@ For a user-focused guide on deploying and managing models with DynamoModel, see:
**📖 [Managing Models with DynamoModel Guide](./deployment/dynamomodel-guide.md)** **📖 [Managing Models with DynamoModel Guide](./deployment/dynamomodel-guide.md)**
## Webhooks
The Dynamo Operator uses **Kubernetes admission webhooks** for real-time validation of custom resources before they are persisted to the cluster. Webhooks are **enabled by default** and ensure that invalid configurations are rejected immediately at the API server level.
**Key Features:**
- ✅ Shared certificate infrastructure across all webhook types
- ✅ Automatic certificate generation (for testing/development)
- ✅ cert-manager integration (for production)
- ✅ Multi-operator support with lease-based coordination
- ✅ Immutability enforcement for critical fields
For complete documentation on webhooks, certificate management, and troubleshooting, see:
**📖 [Webhooks Guide](./webhooks.md)**
## Installation ## Installation
### Quick Install with Helm ### Quick Install with Helm
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment