dynamographdeploymentrequest.go 5.37 KB
Newer Older
1
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package validation

import (
21
	"encoding/json"
22
23
24
	"errors"
	"fmt"

25
	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
26
27
28
29
30
31
	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)

// DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
// This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentRequestValidator struct {
32
	request               *nvidiacomv1beta1.DynamoGraphDeploymentRequest
33
	isClusterWideOperator bool
34
	gpuDiscoveryEnabled   bool
35
36
37
}

// NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest.
38
39
// isClusterWide indicates whether the operator has cluster-wide permissions.
// gpuDiscoveryEnabled indicates whether Helm provisioned node read access for the operator.
40
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1beta1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
41
42
43
	return &DynamoGraphDeploymentRequestValidator{
		request:               request,
		isClusterWideOperator: isClusterWide,
44
		gpuDiscoveryEnabled:   gpuDiscoveryEnabled,
45
46
47
48
49
50
51
52
	}
}

// Validate runs the stateless checks against the DynamoGraphDeploymentRequest held by the
// validator. Each failing check is folded into the result with errors.Join, so the caller
// receives every problem in a single error. The returned warnings are always nil.
func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) {
	spec := &v.request.Spec

	var combined error

	// searchStrategy "thorough" may not be paired with backend "auto": per the rule below,
	// a thorough search needs a concrete backend, while "auto" defers that choice.
	thoroughWithAutoBackend := spec.SearchStrategy == nvidiacomv1beta1.SearchStrategyThorough &&
		spec.Backend == nvidiacomv1beta1.BackendTypeAuto
	if thoroughWithAutoBackend {
		conflict := fmt.Errorf(
			"spec.searchStrategy %q is incompatible with spec.backend %q: set spec.backend to a specific backend (sglang, trtllm, or vllm)",
			nvidiacomv1beta1.SearchStrategyThorough,
			nvidiacomv1beta1.BackendTypeAuto,
		)
		combined = errors.Join(combined, conflict)
	}

	// The GPU hardware check runs last so its error is reported after the others.
	gpuErr := v.validateGPUHardwareInfo()
	if gpuErr != nil {
		combined = errors.Join(combined, gpuErr)
	}

	return nil, combined
}

73
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
74
// Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided.
75
// Also validates consistency of GPU range fields.
76
func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
77
	// Check if manual hardware config is provided via typed spec.hardware fields.
78
	var hasManualHardwareConfig bool
79
80
	if hw := v.request.Spec.Hardware; hw != nil {
		hasManualHardwareConfig = hw.GPUSKU != "" || hw.VRAMMB != nil || hw.NumGPUsPerNode != nil
81
82
	}

83
	if hasManualHardwareConfig {
84
85
86
		return nil
	}

87
88
89
	// No manual hardware config provided. Cluster-wide operators always have GPU discovery via node
	// permissions. Namespace-scoped operators rely on Helm-provisioned GPU discovery (gpuDiscovery.enabled).
	if v.isClusterWideOperator || v.gpuDiscoveryEnabled {
90
91
92
		return nil
	}

93
	return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.hardware)")
94
95
}

96
97
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
// Returns warnings and error.
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
	// Reject spec changes when the resource is in a non-editable lifecycle phase.
	// During Profiling, Deploying, or Deployed the controller is actively reconciling
	// the resource and spec mutations would conflict with in-flight operations.
	phase := old.Status.Phase
	immutablePhases := map[nvidiacomv1beta1.DGDRPhase]bool{
		nvidiacomv1beta1.DGDRPhaseProfiling: true,
		nvidiacomv1beta1.DGDRPhaseDeploying: true,
		nvidiacomv1beta1.DGDRPhaseDeployed:  true,
	}

	if immutablePhases[phase] {
		// Compare specs — if they differ, reject the update.
		oldSpec := old.Spec
		newSpec := v.request.Spec
		if !specEqual(oldSpec, newSpec) {
			return nil, fmt.Errorf("spec updates are forbidden while the resource is in phase %q; delete and recreate the resource to change its spec", phase)
		}
	}

118
119
	return nil, nil
}
120
121
122
123
124
125
126
127
128
129

// specEqual reports whether two DynamoGraphDeploymentRequestSpec values serialize to the
// same JSON document. If either value cannot be marshaled, the specs are treated as
// unequal and false is returned.
func specEqual(a, b nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec) bool {
	left, err := json.Marshal(a)
	if err != nil {
		return false
	}

	right, err := json.Marshal(b)
	if err != nil {
		return false
	}

	return string(left) == string(right)
}