"examples/vscode:/vscode.git/clone" did not exist on "e22bb0376fec83e690c8715d2987378433ca2d5c"
shared.go 12 KB
Newer Older
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package validation

import (
21
	"context"
22
	"fmt"
23
	"strconv"
24
	"strings"
25

26
	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
27
28
29
30
	"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
	controllercommon "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
	"github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo/epp"
	ctrl "sigs.k8s.io/controller-runtime"
31
	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
32
33
34
35
36
37
)

// SharedSpecValidator validates DynamoComponentDeploymentSharedSpec fields.
// This validator is used by both DynamoComponentDeploymentValidator and DynamoGraphDeploymentValidator
// to provide consistent validation logic for shared spec fields.
type SharedSpecValidator struct {
38
	spec                *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
39
40
41
	fieldPath           string       // e.g., "spec" for DCD, "spec.services[foo]" for DGD
	calculatedNamespace string       // The namespace that will be used: {k8s_namespace}-{dgd_name}
	mgr                 ctrl.Manager // Optional: for API group detection via discovery client
42
43
44
45
}

// NewSharedSpecValidator creates a new validator for DynamoComponentDeploymentSharedSpec.
// fieldPath is used to provide context in error messages (e.g., "spec" or "spec.services[main]").
46
47
48
49
// calculatedNamespace is the namespace the operator will use:
//   - If GlobalDynamoNamespace is true: "dynamo" (global constant)
//   - Otherwise: {k8s_namespace}-{dgd_name}
func NewSharedSpecValidator(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec, fieldPath string, calculatedNamespace string) *SharedSpecValidator {
50
	return &SharedSpecValidator{
51
52
53
		spec:                spec,
		fieldPath:           fieldPath,
		calculatedNamespace: calculatedNamespace,
54
55
56
57
58
59
60
61
62
63
64
65
		mgr:                 nil,
	}
}

// NewSharedSpecValidatorWithManager creates a validator with a manager for API group detection.
// This allows the validator to check for API group availability (e.g., inference.networking.k8s.io) when validating EPP components.
func NewSharedSpecValidatorWithManager(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec, fieldPath string, calculatedNamespace string, mgr ctrl.Manager) *SharedSpecValidator {
	return &SharedSpecValidator{
		spec:                spec,
		fieldPath:           fieldPath,
		calculatedNamespace: calculatedNamespace,
		mgr:                 mgr,
66
67
68
69
	}
}

// Validate performs validation on the shared spec fields.
70
// Context is required for any operations that may need to query the cluster (e.g., CRD checks).
71
// Returns warnings (e.g., deprecation notices) and error if validation fails.
72
func (v *SharedSpecValidator) Validate(ctx context.Context) (admission.Warnings, error) {
73
74
75
76
77
78
79
80
81
82
83
	// Collect warnings (e.g., deprecation notices)
	var warnings admission.Warnings

	// Warn about deprecated dynamoNamespace field
	if v.spec.DynamoNamespace != nil && *v.spec.DynamoNamespace != "" {
		warnings = append(warnings, fmt.Sprintf(
			"%s.dynamoNamespace is deprecated and ignored. Value '%s' will be replaced with '%s'. "+
				"Remove this field from your configuration",
			v.fieldPath, *v.spec.DynamoNamespace, v.calculatedNamespace))
	}

84
85
	// Validate replicas if specified
	if v.spec.Replicas != nil && *v.spec.Replicas < 0 {
86
		return nil, fmt.Errorf("%s.replicas must be non-negative", v.fieldPath)
87
88
89
90
91
	}

	// Validate ingress configuration if enabled
	if v.spec.Ingress != nil && v.spec.Ingress.Enabled {
		if err := v.validateIngress(); err != nil {
92
			return nil, err
93
94
95
96
97
		}
	}

	// Validate volume mounts
	if err := v.validateVolumeMounts(); err != nil {
98
		return nil, err
99
100
101
102
103
	}

	// Validate shared memory
	if v.spec.SharedMemory != nil {
		if err := v.validateSharedMemory(); err != nil {
104
			return nil, err
105
106
107
		}
	}

108
109
110
111
112
	// Check for deprecated autoscaling field
	//nolint:staticcheck // SA1019: Intentionally checking deprecated field to warn users
	if v.spec.Autoscaling != nil {
		warnings = append(warnings, fmt.Sprintf(
			"%s.autoscaling is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter "+
113
				"with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md",
114
			v.fieldPath))
115
116
	}

117
118
119
120
121
	// Validate frontend sidecar container name conflicts
	if err := v.validateFrontendSidecar(); err != nil {
		return nil, err
	}

122
123
124
125
126
	// Validate service-level annotations
	if err := v.validateServiceAnnotations(); err != nil {
		return nil, err
	}

127
128
129
130
131
	// Validate EPP-specific constraints
	if err := v.validateEPPConfig(ctx); err != nil {
		return nil, err
	}

132
133
134
135
136
	// Validate GPU memory service configuration
	if err := v.validateGPUMemoryService(); err != nil {
		return nil, err
	}

137
	return warnings, nil
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
}

// validateIngress checks the ingress settings. It dereferences v.spec.Ingress
// without a nil check, so the caller must ensure Ingress is non-nil.
// An empty host is rejected; any other host is accepted.
func (v *SharedSpecValidator) validateIngress() error {
	if host := v.spec.Ingress.Host; host != "" {
		return nil
	}
	return fmt.Errorf("%s.ingress.host is required when ingress is enabled", v.fieldPath)
}

// validateVolumeMounts runs validateVolumeMount over every entry of
// spec.volumeMounts in order and returns the first error encountered,
// or nil when all entries pass (including when the list is empty).
func (v *SharedSpecValidator) validateVolumeMounts() error {
	mounts := v.spec.VolumeMounts
	for idx := range mounts {
		// Point at the slice element directly instead of a per-iteration copy.
		err := v.validateVolumeMount(idx, &mounts[idx])
		if err != nil {
			return err
		}
	}
	return nil
}

// validateVolumeMount checks one volume mount entry. index is the entry's
// position in spec.volumeMounts and is only used to build the error message.
// A mount is accepted when it is used as the compilation cache or when it
// declares a mountPoint; otherwise an error is returned.
func (v *SharedSpecValidator) validateVolumeMount(index int, volumeMount *nvidiacomv1alpha1.VolumeMount) error {
	if volumeMount.UseAsCompilationCache || volumeMount.MountPoint != "" {
		return nil
	}
	return fmt.Errorf("%s.volumeMounts[%d].mountPoint is required when useAsCompilationCache is false", v.fieldPath, index)
}

// validateSharedMemory checks the shared memory settings. It dereferences
// v.spec.SharedMemory without a nil check, so the caller must ensure it is
// non-nil. When shared memory is not disabled, a non-zero size is required.
func (v *SharedSpecValidator) validateSharedMemory() error {
	shm := v.spec.SharedMemory
	// Size is only inspected when shared memory is enabled.
	if shm.Disabled || !shm.Size.IsZero() {
		return nil
	}
	return fmt.Errorf("%s.sharedMemory.size is required when disabled is false", v.fieldPath)
}
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243

// validateEPPConfig enforces the constraints that apply to EPP components.
// It is a no-op for any other component type. Checks run in this order and
// the first failure is returned:
//  1. InferencePool API availability (only when a manager was supplied)
//  2. the component must not be multinode
//  3. replicas, when set, must equal 1
//  4. eppConfig must be present
//  5. exactly one of eppConfig.configMapRef / eppConfig.config must be set
//  6. eppConfig.configMapRef.name must be non-empty when configMapRef is used
func (v *SharedSpecValidator) validateEPPConfig(ctx context.Context) error {
	if v.spec.ComponentType != consts.ComponentTypeEPP {
		return nil
	}

	// The cluster lookup needs a manager; without one the check is skipped.
	if v.mgr != nil {
		if err := v.checkInferencePoolAPIAvailability(ctx); err != nil {
			return fmt.Errorf("%s: cannot deploy EPP component: %w", v.fieldPath, err)
		}
	}

	if v.spec.IsMultinode() {
		return fmt.Errorf("%s: EPP component cannot be multinode (multinode field must be nil or nodeCount must be 1)", v.fieldPath)
	}

	// An unset replica count is accepted; an explicit one must be exactly 1.
	if replicas := v.spec.Replicas; replicas != nil && *replicas != 1 {
		return fmt.Errorf("%s: EPP component must have exactly 1 replica (found %d replicas)", v.fieldPath, *replicas)
	}

	cfg := v.spec.EPPConfig
	if cfg == nil {
		return fmt.Errorf("%s.eppConfig is required for EPP components", v.fieldPath)
	}

	hasRef := cfg.ConfigMapRef != nil
	hasInline := cfg.Config != nil
	switch {
	case !hasRef && !hasInline:
		// There is no default configuration to fall back to.
		return fmt.Errorf("%s.eppConfig: either configMapRef or config must be specified (no default configuration provided)", v.fieldPath)
	case hasRef && hasInline:
		return fmt.Errorf("%s.eppConfig: configMapRef and config are mutually exclusive, only one can be specified", v.fieldPath)
	case hasRef && cfg.ConfigMapRef.Name == "":
		return fmt.Errorf("%s.eppConfig.configMapRef.name is required", v.fieldPath)
	}

	return nil
}

// checkInferencePoolAPIAvailability reports whether the InferencePool API group
// (epp.InferencePoolGroup) can be used in the cluster, delegating detection to
// controllercommon.DetectInferencePoolAvailability.
// It returns nil when no manager was supplied (the check is skipped) or when
// the API group is detected, and a descriptive error otherwise.
func (v *SharedSpecValidator) checkInferencePoolAPIAvailability(ctx context.Context) error {
	// Short-circuit keeps the detection call from running without a manager.
	if v.mgr == nil || controllercommon.DetectInferencePoolAvailability(ctx, v.mgr) {
		return nil
	}

	return fmt.Errorf(
		"InferencePool API group (%s) is not available in the cluster. "+
			"EPP requires the Gateway API Inference Extension to be installed. "+
			"Please install the Gateway API Inference Extension before deploying EPP components",
		epp.InferencePoolGroup)
}
244

245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
// validateFrontendSidecar rejects specs whose extraPodSpec.containers already
// holds a container named consts.FrontendSidecarContainerName.
// Nothing is checked when no frontend sidecar is requested or when no extra
// pod spec is provided.
func (v *SharedSpecValidator) validateFrontendSidecar() error {
	if v.spec.FrontendSidecar == nil || v.spec.ExtraPodSpec == nil || v.spec.ExtraPodSpec.PodSpec == nil {
		return nil
	}

	containers := v.spec.ExtraPodSpec.PodSpec.Containers
	for i := range containers {
		if containers[i].Name != consts.FrontendSidecarContainerName {
			continue
		}
		return fmt.Errorf(
			"%s: cannot inject frontend sidecar: a container named %q already exists in extraPodSpec.containers",
			v.fieldPath, consts.FrontendSidecarContainerName)
	}
	return nil
}

264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
// validateGPUMemoryService checks the GPU memory service settings.
// Nothing is checked when the service is absent or not enabled. Otherwise:
//   - mode "interPod" is rejected as unsupported
//   - the component type must be worker, prefill, or decode
//   - a GPU count of at least 1 must be declared; it is read from
//     resources.limits.gpu, falling back to resources.requests.gpu when the
//     limits value is missing or empty, and must parse as an integer
func (v *SharedSpecValidator) validateGPUMemoryService() error {
	gms := v.spec.GPUMemoryService
	if gms == nil || !gms.Enabled {
		return nil
	}

	if gms.Mode == nvidiacomv1alpha1.GMSModeInterPod {
		return fmt.Errorf(
			"%s.gpuMemoryService: mode \"interPod\" is not yet supported",
			v.fieldPath)
	}

	switch v.spec.ComponentType {
	case consts.ComponentTypeWorker, consts.ComponentTypePrefill, consts.ComponentTypeDecode:
		// supported component type
	default:
		return fmt.Errorf(
			"%s.gpuMemoryService: GPU memory service is only supported for worker components (componentType must be worker, prefill, or decode)",
			v.fieldPath)
	}

	// Pick the first non-empty GPU value: limits first, then requests.
	var gpu string
	if res := v.spec.Resources; res != nil {
		if res.Limits != nil && res.Limits.GPU != "" {
			gpu = res.Limits.GPU
		} else if res.Requests != nil && res.Requests.GPU != "" {
			gpu = res.Requests.GPU
		}
	}

	// An empty string fails to parse, so missing resources, a missing GPU
	// value, a non-integer value and a count below 1 all yield the same error.
	if count, err := strconv.Atoi(gpu); err != nil || count < 1 {
		return fmt.Errorf(
			"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
			v.fieldPath)
	}

	return nil
}

315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
// validateServiceAnnotations checks the annotations this validator knows about.
// Currently that is only consts.KubeAnnotationVLLMDistributedExecutorBackend:
// when present, its value must be "mp" or "ray", compared case-insensitively.
// Missing annotations and unrelated keys are accepted.
func (v *SharedSpecValidator) validateServiceAnnotations() error {
	// Indexing a nil map is safe and reports the key as absent.
	backend, present := v.spec.Annotations[consts.KubeAnnotationVLLMDistributedExecutorBackend]
	if !present {
		return nil
	}

	if lowered := strings.ToLower(backend); lowered == "mp" || lowered == "ray" {
		return nil
	}

	return fmt.Errorf("%s.annotations[%s] has invalid value %q: must be \"mp\" or \"ray\"",
		v.fieldPath, consts.KubeAnnotationVLLMDistributedExecutorBackend, backend)
}