/* * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package validation import ( "context" "errors" "fmt" "strconv" "strings" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" controllercommon "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common" "github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo/epp" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" ) // SharedSpecValidator validates DynamoComponentDeploymentSharedSpec fields. // This validator is used by both DynamoComponentDeploymentValidator and DynamoGraphDeploymentValidator // to provide consistent validation logic for shared spec fields. type SharedSpecValidator struct { spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec fieldPath string // e.g., "spec" for DCD, "spec.services[foo]" for DGD calculatedNamespace string // The namespace that will be used: {k8s_namespace}-{dgd_name} mgr ctrl.Manager // Optional: for API group detection via discovery client } // NewSharedSpecValidator creates a new validator for DynamoComponentDeploymentSharedSpec. // fieldPath is used to provide context in error messages (e.g., "spec" or "spec.services[main]"). // calculatedNamespace is the namespace the operator will use: // - If GlobalDynamoNamespace is true: "dynamo" (global constant) // - Otherwise: {k8s_namespace}-{dgd_name} func NewSharedSpecValidator(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec, fieldPath string, calculatedNamespace string) *SharedSpecValidator { return &SharedSpecValidator{ spec: spec, fieldPath: fieldPath, calculatedNamespace: calculatedNamespace, mgr: nil, } } // NewSharedSpecValidatorWithManager creates a validator with a manager for API group detection. // This allows the validator to check for API group availability (e.g., inference.networking.k8s.io) when validating EPP components. func NewSharedSpecValidatorWithManager(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec, fieldPath string, calculatedNamespace string, mgr ctrl.Manager) *SharedSpecValidator { return &SharedSpecValidator{ spec: spec, fieldPath: fieldPath, calculatedNamespace: calculatedNamespace, mgr: mgr, } } // Validate performs validation on the shared spec fields. // Context is required for any operations that may need to query the cluster (e.g., CRD checks). // Returns warnings (e.g., deprecation notices) and error if validation fails. func (v *SharedSpecValidator) Validate(ctx context.Context) (admission.Warnings, error) { // Collect warnings (e.g., deprecation notices) var warnings admission.Warnings // Warn about deprecated dynamoNamespace field if v.spec.DynamoNamespace != nil && *v.spec.DynamoNamespace != "" { warnings = append(warnings, fmt.Sprintf( "%s.dynamoNamespace is deprecated and ignored. Value '%s' will be replaced with '%s'. "+ "Remove this field from your configuration", v.fieldPath, *v.spec.DynamoNamespace, v.calculatedNamespace)) } // Validate replicas if specified if v.spec.Replicas != nil && *v.spec.Replicas < 0 { return nil, fmt.Errorf("%s.replicas must be non-negative", v.fieldPath) } // Validate ingress configuration if enabled if v.spec.Ingress != nil && v.spec.Ingress.Enabled { if err := v.validateIngress(); err != nil { return nil, err } } // Validate volume mounts if err := v.validateVolumeMounts(); err != nil { return nil, err } // Validate shared memory if v.spec.SharedMemory != nil { if err := v.validateSharedMemory(); err != nil { return nil, err } } // Check for deprecated autoscaling field //nolint:staticcheck // SA1019: Intentionally checking deprecated field to warn users if v.spec.Autoscaling != nil { warnings = append(warnings, fmt.Sprintf( "%s.autoscaling is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter "+ "with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md", v.fieldPath)) } // Validate frontend sidecar container name conflicts if err := v.validateFrontendSidecar(); err != nil { return nil, err } // Validate service-level annotations if err := v.validateServiceAnnotations(); err != nil { return nil, err } // Validate EPP-specific constraints if err := v.validateEPPConfig(ctx); err != nil { return nil, err } // Validate GPU memory service configuration (intra-pod GMS) if err := v.validateGPUMemoryService(); err != nil { return nil, err } // Validate GMS failover constraints if err := v.validateFailover(); err != nil { return nil, err } return warnings, nil } // validateIngress validates the ingress configuration. func (v *SharedSpecValidator) validateIngress() error { if v.spec.Ingress.Host == "" { return fmt.Errorf("%s.ingress.host is required when ingress is enabled", v.fieldPath) } return nil } // validateVolumeMounts validates the volume mount configurations. func (v *SharedSpecValidator) validateVolumeMounts() error { for i, volumeMount := range v.spec.VolumeMounts { if err := v.validateVolumeMount(i, &volumeMount); err != nil { return err } } return nil } // validateVolumeMount validates a single volume mount configuration. func (v *SharedSpecValidator) validateVolumeMount(index int, volumeMount *nvidiacomv1alpha1.VolumeMount) error { // If useAsCompilationCache is false, mountPoint is required if !volumeMount.UseAsCompilationCache && volumeMount.MountPoint == "" { return fmt.Errorf("%s.volumeMounts[%d].mountPoint is required when useAsCompilationCache is false", v.fieldPath, index) } return nil } // validateSharedMemory validates the shared memory configuration. func (v *SharedSpecValidator) validateSharedMemory() error { // If disabled is false (i.e., shared memory is enabled), size is required if !v.spec.SharedMemory.Disabled && v.spec.SharedMemory.Size.IsZero() { return fmt.Errorf("%s.sharedMemory.size is required when disabled is false", v.fieldPath) } return nil } // validateEPPConfig validates EPP-specific configuration constraints. func (v *SharedSpecValidator) validateEPPConfig(ctx context.Context) error { // Only validate if this is an EPP component if v.spec.ComponentType != consts.ComponentTypeEPP { return nil } // Check if InferencePool API group is available in the cluster (if manager is provided) if v.mgr != nil { if err := v.checkInferencePoolAPIAvailability(ctx); err != nil { return fmt.Errorf("%s: cannot deploy EPP component: %w", v.fieldPath, err) } } // EPP must be single-node (cannot be multinode) if v.spec.IsMultinode() { return fmt.Errorf("%s: EPP component cannot be multinode (multinode field must be nil or nodeCount must be 1)", v.fieldPath) } // EPP should have exactly 1 replica (optional constraint - can be relaxed if needed) if v.spec.Replicas != nil && *v.spec.Replicas != 1 { return fmt.Errorf("%s: EPP component must have exactly 1 replica (found %d replicas)", v.fieldPath, *v.spec.Replicas) } // EPP components MUST have EPPConfig if v.spec.EPPConfig == nil { return fmt.Errorf("%s.eppConfig is required for EPP components", v.fieldPath) } // Either ConfigMapRef or Config must be specified (no default) if v.spec.EPPConfig.ConfigMapRef == nil && v.spec.EPPConfig.Config == nil { return fmt.Errorf("%s.eppConfig: either configMapRef or config must be specified (no default configuration provided)", v.fieldPath) } // ConfigMapRef and Config are mutually exclusive if v.spec.EPPConfig.ConfigMapRef != nil && v.spec.EPPConfig.Config != nil { return fmt.Errorf("%s.eppConfig: configMapRef and config are mutually exclusive, only one can be specified", v.fieldPath) } // If ConfigMapRef is provided, validate it if v.spec.EPPConfig.ConfigMapRef != nil { if v.spec.EPPConfig.ConfigMapRef.Name == "" { return fmt.Errorf("%s.eppConfig.configMapRef.name is required", v.fieldPath) } } return nil } // checkInferencePoolAPIAvailability checks if the inference.networking.k8s.io API group is available in the cluster. // Returns an error if the API group is not available, which prevents EPP deployment. // This reuses the controller_common.DetectInferencePoolAvailability function. func (v *SharedSpecValidator) checkInferencePoolAPIAvailability(ctx context.Context) error { if v.mgr == nil { // No manager provided, skip the check (e.g., in controller without webhooks) return nil } if !controllercommon.DetectInferencePoolAvailability(ctx, v.mgr) { return fmt.Errorf( "InferencePool API group (%s) is not available in the cluster. "+ "EPP requires the Gateway API Inference Extension to be installed. "+ "Please install the Gateway API Inference Extension before deploying EPP components", epp.InferencePoolGroup) } return nil } // validateFrontendSidecar checks that extraPodSpec.containers does not already // contain a container whose name collides with the auto-generated frontend sidecar. func (v *SharedSpecValidator) validateFrontendSidecar() error { if v.spec.FrontendSidecar == nil { return nil } if v.spec.ExtraPodSpec == nil || v.spec.ExtraPodSpec.PodSpec == nil { return nil } for _, c := range v.spec.ExtraPodSpec.PodSpec.Containers { if c.Name == consts.FrontendSidecarContainerName { return fmt.Errorf( "%s: cannot inject frontend sidecar: a container named %q already exists in extraPodSpec.containers", v.fieldPath, consts.FrontendSidecarContainerName) } } return nil } // parseGPUCount extracts the GPU count from a Resources block, preferring // Limits then Requests. Returns (0, nil) when no GPU is requested, or an // error if the value is non-numeric. func parseGPUCount(r *nvidiacomv1alpha1.Resources) (int, error) { gpuStr := "" switch { case r != nil && r.Limits != nil && r.Limits.GPU != "": gpuStr = r.Limits.GPU case r != nil && r.Requests != nil && r.Requests.GPU != "": gpuStr = r.Requests.GPU } if gpuStr == "" { return 0, nil } n, err := strconv.Atoi(gpuStr) if err != nil { return 0, fmt.Errorf("invalid value %q: %w", gpuStr, err) } return n, nil } // validateFailover validates GMS failover configuration constraints. // // The layout (intra-pod sidecar vs. inter-pod weight-server pod) is declared // by gpuMemoryService.mode. failover is an independent toggle: when enabled, // failover.mode MUST match gpuMemoryService.mode so the two knobs describe a // consistent topology. It is also valid to configure gpuMemoryService without // failover (no shadows; a single engine + GMS server pair) — see // validateGPUMemoryService below. func (v *SharedSpecValidator) validateFailover() error { if v.spec.Failover == nil || !v.spec.Failover.Enabled { // When failover.enabled is false the sub-fields (mode, numShadows) // are dormant configuration and the render path ignores them // (GetNumShadows returns 0). We deliberately do not validate them // here so users can stage a failover config before flipping // enabled=true — matching the K8s convention that fields on a // disabled feature are not constrained. return nil } var errs []error // For intra-pod mode: require gpuMemoryService.enabled and validate mode matching. if v.spec.Failover.Mode == nvidiacomv1alpha1.GMSModeIntraPod { if v.spec.GPUMemoryService == nil || !v.spec.GPUMemoryService.Enabled { errs = append(errs, fmt.Errorf( "%s.failover: intraPod failover requires gpuMemoryService.enabled to be true", v.fieldPath)) } else if v.spec.GPUMemoryService.Mode != "" && v.spec.GPUMemoryService.Mode != nvidiacomv1alpha1.GMSModeIntraPod { errs = append(errs, fmt.Errorf( "%s.failover: failover.mode %q must match gpuMemoryService.mode %q", v.fieldPath, v.spec.Failover.Mode, v.spec.GPUMemoryService.Mode)) } // intraPod is a fixed 1 primary + 1 shadow sidecar layout; numShadows // is meaningless here and any value other than the implicit 1 is // almost certainly a configuration error (user probably wanted // mode=interPod). if v.spec.Failover.NumShadows != 0 && v.spec.Failover.NumShadows != 1 { errs = append(errs, fmt.Errorf( "%s.failover.numShadows=%d is invalid for mode=%q: intraPod uses a fixed 1 primary + 1 shadow sidecar; "+ "use failover.mode=%q to configure numShadows", v.fieldPath, v.spec.Failover.NumShadows, nvidiacomv1alpha1.GMSModeIntraPod, nvidiacomv1alpha1.GMSModeInterPod)) } } // For inter-pod mode: require the inter-pod GMS layout (gpuMemoryService // with mode=interPod) so failover hot-spares are added on top of an // already-declared weight-server pod layout. if v.spec.Failover.Mode == nvidiacomv1alpha1.GMSModeInterPod { if v.spec.GPUMemoryService == nil || !v.spec.GPUMemoryService.Enabled { errs = append(errs, fmt.Errorf( "%s.failover: interPod failover requires gpuMemoryService.enabled=true and gpuMemoryService.mode=%q", v.fieldPath, nvidiacomv1alpha1.GMSModeInterPod)) } else if v.spec.GPUMemoryService.Mode != nvidiacomv1alpha1.GMSModeInterPod { // An unset gpuMemoryService.mode defaults to the intra-pod sidecar // layout, which is incompatible with inter-pod failover; the user // must set gpuMemoryService.mode=interPod explicitly. detected := string(v.spec.GPUMemoryService.Mode) if detected == "" { detected = "" } errs = append(errs, fmt.Errorf( "%s.failover: interPod failover requires gpuMemoryService.mode=%q (got %q)", v.fieldPath, nvidiacomv1alpha1.GMSModeInterPod, detected)) } if v.spec.Failover.NumShadows < 1 { errs = append(errs, fmt.Errorf("%s.failover.numShadows must be >= 1", v.fieldPath)) } gpuCount, err := parseGPUCount(v.spec.Resources) if err != nil { errs = append(errs, fmt.Errorf("%s.resources.limits.gpu: %w", v.fieldPath, err)) } else if gpuCount < 1 { errs = append(errs, fmt.Errorf("%s: GMS failover requires at least 1 GPU in resources.limits.gpu", v.fieldPath)) } switch v.spec.ComponentType { case consts.ComponentTypeEPP, consts.ComponentTypeFrontend, consts.ComponentTypePlanner: errs = append(errs, fmt.Errorf("%s: GMS failover is not supported for componentType %q", v.fieldPath, v.spec.ComponentType)) } } return errors.Join(errs...) } // validateGPUMemoryService validates gpuMemoryService constraints. // // gpuMemoryService declares the GMS layout (intra-pod sidecar vs. inter-pod // dedicated weight-server pod) and may be enabled independently of failover: // the intra-pod layout gives the engine a GMS sidecar in the same pod, and // the inter-pod layout gives it a dedicated weight-server pod paired with one // engine pod. Failover adds shadow engine pods on top of the declared layout // (see validateFailover); it is not the sole way to request the inter-pod // layout. func (v *SharedSpecValidator) validateGPUMemoryService() error { if v.spec.GPUMemoryService == nil || !v.spec.GPUMemoryService.Enabled { return nil } isWorker := v.spec.ComponentType == consts.ComponentTypeWorker || v.spec.ComponentType == consts.ComponentTypePrefill || v.spec.ComponentType == consts.ComponentTypeDecode if !isWorker { return fmt.Errorf( "%s.gpuMemoryService: GPU memory service is only supported for worker components (componentType must be worker, prefill, or decode)", v.fieldPath) } gpuCount, err := parseGPUCount(v.spec.Resources) if err != nil || gpuCount < 1 { return fmt.Errorf( "%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1", v.fieldPath) } return nil } // validateServiceAnnotations validates known annotations on the service-level spec. func (v *SharedSpecValidator) validateServiceAnnotations() error { if v.spec.Annotations == nil { return nil } if value, exists := v.spec.Annotations[consts.KubeAnnotationVLLMDistributedExecutorBackend]; exists { switch strings.ToLower(value) { case "mp", "ray": // valid default: return fmt.Errorf("%s.annotations[%s] has invalid value %q: must be \"mp\" or \"ray\"", v.fieldPath, consts.KubeAnnotationVLLMDistributedExecutorBackend, value) } } return nil }