shared.go 9.6 KB
Newer Older
1
/*
2
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package validation

import (
21
	"context"
22
	"fmt"
23
	"strings"
24

25
	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
26
27
28
29
	"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
	controllercommon "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
	"github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo/epp"
	ctrl "sigs.k8s.io/controller-runtime"
30
	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
31
32
33
34
35
36
)

// SharedSpecValidator validates DynamoComponentDeploymentSharedSpec fields.
// This validator is used by both DynamoComponentDeploymentValidator and DynamoGraphDeploymentValidator
// to provide consistent validation logic for shared spec fields.
type SharedSpecValidator struct {
37
	spec                *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
38
39
40
	fieldPath           string       // e.g., "spec" for DCD, "spec.services[foo]" for DGD
	calculatedNamespace string       // The namespace that will be used: {k8s_namespace}-{dgd_name}
	mgr                 ctrl.Manager // Optional: for API group detection via discovery client
41
42
43
44
}

// NewSharedSpecValidator creates a new validator for DynamoComponentDeploymentSharedSpec.
// fieldPath is used to provide context in error messages (e.g., "spec" or "spec.services[main]").
45
46
47
48
// calculatedNamespace is the namespace the operator will use:
//   - If GlobalDynamoNamespace is true: "dynamo" (global constant)
//   - Otherwise: {k8s_namespace}-{dgd_name}
func NewSharedSpecValidator(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec, fieldPath string, calculatedNamespace string) *SharedSpecValidator {
49
	return &SharedSpecValidator{
50
51
52
		spec:                spec,
		fieldPath:           fieldPath,
		calculatedNamespace: calculatedNamespace,
53
54
55
56
57
58
59
60
61
62
63
64
		mgr:                 nil,
	}
}

// NewSharedSpecValidatorWithManager creates a validator with a manager for API group detection.
// This allows the validator to check for API group availability (e.g., inference.networking.k8s.io) when validating EPP components.
func NewSharedSpecValidatorWithManager(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec, fieldPath string, calculatedNamespace string, mgr ctrl.Manager) *SharedSpecValidator {
	return &SharedSpecValidator{
		spec:                spec,
		fieldPath:           fieldPath,
		calculatedNamespace: calculatedNamespace,
		mgr:                 mgr,
65
66
67
68
	}
}

// Validate performs validation on the shared spec fields.
69
// Context is required for any operations that may need to query the cluster (e.g., CRD checks).
70
// Returns warnings (e.g., deprecation notices) and error if validation fails.
71
func (v *SharedSpecValidator) Validate(ctx context.Context) (admission.Warnings, error) {
72
73
74
75
76
77
78
79
80
81
82
	// Collect warnings (e.g., deprecation notices)
	var warnings admission.Warnings

	// Warn about deprecated dynamoNamespace field
	if v.spec.DynamoNamespace != nil && *v.spec.DynamoNamespace != "" {
		warnings = append(warnings, fmt.Sprintf(
			"%s.dynamoNamespace is deprecated and ignored. Value '%s' will be replaced with '%s'. "+
				"Remove this field from your configuration",
			v.fieldPath, *v.spec.DynamoNamespace, v.calculatedNamespace))
	}

83
84
	// Validate replicas if specified
	if v.spec.Replicas != nil && *v.spec.Replicas < 0 {
85
		return nil, fmt.Errorf("%s.replicas must be non-negative", v.fieldPath)
86
87
88
89
90
	}

	// Validate ingress configuration if enabled
	if v.spec.Ingress != nil && v.spec.Ingress.Enabled {
		if err := v.validateIngress(); err != nil {
91
			return nil, err
92
93
94
95
96
		}
	}

	// Validate volume mounts
	if err := v.validateVolumeMounts(); err != nil {
97
		return nil, err
98
99
100
101
102
	}

	// Validate shared memory
	if v.spec.SharedMemory != nil {
		if err := v.validateSharedMemory(); err != nil {
103
			return nil, err
104
105
106
		}
	}

107
108
109
110
111
	// Check for deprecated autoscaling field
	//nolint:staticcheck // SA1019: Intentionally checking deprecated field to warn users
	if v.spec.Autoscaling != nil {
		warnings = append(warnings, fmt.Sprintf(
			"%s.autoscaling is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter "+
112
				"with HPA, KEDA, or Planner for autoscaling instead. See docs/pages/kubernetes/autoscaling.md",
113
			v.fieldPath))
114
115
	}

116
117
118
119
120
	// Validate service-level annotations
	if err := v.validateServiceAnnotations(); err != nil {
		return nil, err
	}

121
122
123
124
125
	// Validate EPP-specific constraints
	if err := v.validateEPPConfig(ctx); err != nil {
		return nil, err
	}

126
	return warnings, nil
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
}

// validateIngress validates the ingress configuration.
func (v *SharedSpecValidator) validateIngress() error {
	if v.spec.Ingress.Host == "" {
		return fmt.Errorf("%s.ingress.host is required when ingress is enabled", v.fieldPath)
	}
	return nil
}

// validateVolumeMounts validates the volume mount configurations.
func (v *SharedSpecValidator) validateVolumeMounts() error {
	for i, volumeMount := range v.spec.VolumeMounts {
		if err := v.validateVolumeMount(i, &volumeMount); err != nil {
			return err
		}
	}
	return nil
}

// validateVolumeMount validates a single volume mount configuration.
func (v *SharedSpecValidator) validateVolumeMount(index int, volumeMount *nvidiacomv1alpha1.VolumeMount) error {
	// If useAsCompilationCache is false, mountPoint is required
	if !volumeMount.UseAsCompilationCache && volumeMount.MountPoint == "" {
		return fmt.Errorf("%s.volumeMounts[%d].mountPoint is required when useAsCompilationCache is false", v.fieldPath, index)
	}
	return nil
}

// validateSharedMemory validates the shared memory configuration.
func (v *SharedSpecValidator) validateSharedMemory() error {
	// If disabled is false (i.e., shared memory is enabled), size is required
	if !v.spec.SharedMemory.Disabled && v.spec.SharedMemory.Size.IsZero() {
		return fmt.Errorf("%s.sharedMemory.size is required when disabled is false", v.fieldPath)
	}
	return nil
}
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232

// validateEPPConfig validates EPP-specific configuration constraints.
func (v *SharedSpecValidator) validateEPPConfig(ctx context.Context) error {
	// Only validate if this is an EPP component
	if v.spec.ComponentType != consts.ComponentTypeEPP {
		return nil
	}

	// Check if InferencePool API group is available in the cluster (if manager is provided)
	if v.mgr != nil {
		if err := v.checkInferencePoolAPIAvailability(ctx); err != nil {
			return fmt.Errorf("%s: cannot deploy EPP component: %w", v.fieldPath, err)
		}
	}

	// EPP must be single-node (cannot be multinode)
	if v.spec.IsMultinode() {
		return fmt.Errorf("%s: EPP component cannot be multinode (multinode field must be nil or nodeCount must be 1)", v.fieldPath)
	}

	// EPP should have exactly 1 replica (optional constraint - can be relaxed if needed)
	if v.spec.Replicas != nil && *v.spec.Replicas != 1 {
		return fmt.Errorf("%s: EPP component must have exactly 1 replica (found %d replicas)", v.fieldPath, *v.spec.Replicas)
	}

	// EPP components MUST have EPPConfig
	if v.spec.EPPConfig == nil {
		return fmt.Errorf("%s.eppConfig is required for EPP components", v.fieldPath)
	}

	// Either ConfigMapRef or Config must be specified (no default)
	if v.spec.EPPConfig.ConfigMapRef == nil && v.spec.EPPConfig.Config == nil {
		return fmt.Errorf("%s.eppConfig: either configMapRef or config must be specified (no default configuration provided)", v.fieldPath)
	}

	// ConfigMapRef and Config are mutually exclusive
	if v.spec.EPPConfig.ConfigMapRef != nil && v.spec.EPPConfig.Config != nil {
		return fmt.Errorf("%s.eppConfig: configMapRef and config are mutually exclusive, only one can be specified", v.fieldPath)
	}

	// If ConfigMapRef is provided, validate it
	if v.spec.EPPConfig.ConfigMapRef != nil {
		if v.spec.EPPConfig.ConfigMapRef.Name == "" {
			return fmt.Errorf("%s.eppConfig.configMapRef.name is required", v.fieldPath)
		}
	}

	return nil
}

// checkInferencePoolAPIAvailability checks if the inference.networking.k8s.io API group is available in the cluster.
// Returns an error if the API group is not available, which prevents EPP deployment.
// This reuses the controller_common.DetectInferencePoolAvailability function.
func (v *SharedSpecValidator) checkInferencePoolAPIAvailability(ctx context.Context) error {
	if v.mgr == nil {
		// No manager provided, skip the check (e.g., in controller without webhooks)
		return nil
	}

	if !controllercommon.DetectInferencePoolAvailability(ctx, v.mgr) {
		return fmt.Errorf(
			"InferencePool API group (%s) is not available in the cluster. "+
				"EPP requires the Gateway API Inference Extension to be installed. "+
				"Please install the Gateway API Inference Extension before deploying EPP components",
			epp.InferencePoolGroup)
	}

	return nil
}
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249

// validateServiceAnnotations validates known annotations on the service-level spec.
func (v *SharedSpecValidator) validateServiceAnnotations() error {
	if v.spec.Annotations == nil {
		return nil
	}
	if value, exists := v.spec.Annotations[consts.KubeAnnotationVLLMDistributedExecutorBackend]; exists {
		switch strings.ToLower(value) {
		case "mp", "ray":
			// valid
		default:
			return fmt.Errorf("%s.annotations[%s] has invalid value %q: must be \"mp\" or \"ray\"",
				v.fieldPath, consts.KubeAnnotationVLLMDistributedExecutorBackend, value)
		}
	}
	return nil
}