dynamographdeploymentrequest.go 7.45 KB
Newer Older
1
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package validation

import (
	"errors"
	"fmt"

24
	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
25
26
27
28
	"k8s.io/apimachinery/pkg/util/yaml"
	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)

29
30
// toFloat64 converts a numeric value to float64.
//
// The profiling config is decoded with k8s.io/apimachinery's yaml.Unmarshal,
// which is documented to produce int64 (whole numbers) or float64 (fractional
// numbers) when decoding into a map[string]any. int64 therefore has to be
// accepted here; otherwise integer settings such as "minNumGpusPerEngine: 2"
// would silently convert to 0. The remaining built-in integer and float widths
// are accepted as well so hand-built values behave the same way.
//
// Returns 0 if the value is nil or not one of the built-in numeric types.
func toFloat64(val any) float64 {
	switch v := val.(type) {
	case float64:
		return v
	case float32:
		return float64(v)
	case int:
		return float64(v)
	case int8:
		return float64(v)
	case int16:
		return float64(v)
	case int32:
		return float64(v)
	case int64:
		return float64(v)
	case uint:
		return float64(v)
	case uint8:
		return float64(v)
	case uint16:
		return float64(v)
	case uint32:
		return float64(v)
	case uint64:
		return float64(v)
	default:
		return 0
	}
}

42
43
44
45
46
// DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
// This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentRequestValidator struct {
	request               *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
	isClusterWideOperator bool
47
	gpuDiscoveryEnabled   bool
48
49
50
}

// NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest.
51
52
53
// isClusterWide indicates whether the operator has cluster-wide permissions.
// gpuDiscoveryEnabled indicates whether Helm provisioned node read access for the operator.
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
54
55
56
	return &DynamoGraphDeploymentRequestValidator{
		request:               request,
		isClusterWideOperator: isClusterWide,
57
		gpuDiscoveryEnabled:   gpuDiscoveryEnabled,
58
59
60
61
62
63
64
65
66
	}
}

// Validate runs the stateless checks on the DynamoGraphDeploymentRequest.
//
// Every failed check is accumulated with errors.Join instead of stopping at the
// first one, so the caller sees all problems at once. It returns the collected
// warnings together with the joined error, which is nil when no check failed.
func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) {
	var (
		warnings admission.Warnings
		err      error
	)
	// report folds one more failure into the accumulated error.
	report := func(problem error) {
		err = errors.Join(err, problem)
	}

	// The enableGpuDiscovery field is deprecated; setting it only produces a warning.
	if v.request.Spec.EnableGPUDiscovery != nil {
		warnings = append(warnings, "spec.enableGpuDiscovery is deprecated and will be removed in v1beta1. GPU discovery is now always attempted automatically. This field has no effect.")
	}

	// A profiler image must be named.
	if v.request.Spec.ProfilingConfig.ProfilerImage == "" {
		report(errors.New("spec.profilingConfig.profilerImage is required"))
	}

	// The profiling config must be present and non-empty; when it is, it must
	// also parse, and its contents are inspected for values that the spec overrides.
	rawConfig := v.request.Spec.ProfilingConfig.Config
	if rawConfig == nil || len(rawConfig.Raw) == 0 {
		report(errors.New("spec.profilingConfig.config is required and must not be empty"))
	} else {
		var parsed map[string]any
		if parseErr := yaml.Unmarshal(rawConfig.Raw, &parsed); parseErr != nil {
			report(fmt.Errorf("failed to parse spec.profilingConfig.config: %w", parseErr))
		} else {
			// engine.backend in the config loses to spec.backend when they differ.
			if engineSection, isMap := parsed["engine"].(map[string]any); isMap {
				backend, isString := engineSection["backend"].(string)
				if isString && backend != "" && backend != v.request.Spec.Backend {
					warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)", backend, v.request.Spec.Backend))
				}
			}
			// deployment.model in the config loses to spec.model when they differ.
			if deploymentSection, isMap := parsed["deployment"].(map[string]any); isMap {
				model, isString := deploymentSection["model"].(string)
				if isString && model != "" && model != v.request.Spec.Model {
					warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)", model, v.request.Spec.Model))
				}
			}
		}
	}

	// The GPU hardware check runs last so its error is listed after the others.
	if gpuErr := v.validateGPUHardwareInfo(); gpuErr != nil {
		report(gpuErr)
	}

	return warnings, err
}

113
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
114
// Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided.
115
116
func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
	// Parse profiling config
117
	var config map[string]any
118
119
120
121
122
123
	if v.request.Spec.ProfilingConfig.Config != nil {
		if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
			// Config parse errors will be caught by other validators
			return nil
		}
	} else {
124
		config = make(map[string]any)
125
126
127
128
129
130
	}

	// Check if manual hardware config is provided
	hardwareVal, hasHardware := config["hardware"]
	var hasManualHardwareConfig bool
	if hasHardware && hardwareVal != nil {
131
		if hardwareConfig, ok := hardwareVal.(map[string]any); ok {
132
133
134
135
136
137
138
139
140
141
142
			// Check if essential hardware fields are provided
			_, hasGPUModel := hardwareConfig["gpuModel"]
			_, hasGPUVram := hardwareConfig["gpuVramMib"]
			_, hasNumGPUs := hardwareConfig["numGpusPerNode"]
			hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
		}
	}

	// Check if explicit GPU ranges are provided
	var hasExplicitGPURanges bool
	if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil {
143
		if engineConfig, ok := engineVal.(map[string]any); ok {
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
			minGPUs, hasMin := engineConfig["minNumGpusPerEngine"]
			maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"]
			// Validate explicit GPU ranges
			if hasMin && hasMax {
				minVal := toFloat64(minGPUs)
				maxVal := toFloat64(maxGPUs)

				// Validate that min <= max
				if minVal > maxVal {
					return fmt.Errorf("invalid GPU range: minNumGpusPerEngine (%v) cannot be greater than maxNumGpusPerEngine (%v)",
						minVal, maxVal)
				}

				hasExplicitGPURanges = minVal > 0 && maxVal > 0
			}
		}
	}

	if hasManualHardwareConfig || hasExplicitGPURanges {
		return nil
	}

166
167
168
	// No manual hardware config provided. Cluster-wide operators always have GPU discovery via node
	// permissions. Namespace-scoped operators rely on Helm-provisioned GPU discovery (gpuDiscovery.enabled).
	if v.isClusterWideOperator || v.gpuDiscoveryEnabled {
169
170
171
		return nil
	}

172
	return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.profilingConfig.config)")
173
174
}

175
176
177
178
179
180
181
// ValidateUpdate is the stateful check that compares the previous DynamoGraphDeploymentRequest
// (old) with the one held by the validator.
// It currently performs no checks and always returns no warnings and a nil error.
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
	// TODO: Add update validation logic for DynamoGraphDeploymentRequest
	// (placeholder for future immutability checks).
	var (
		warnings admission.Warnings
		err      error
	)
	return warnings, err
}