feat: use v1beta1 DGDR in controller (#6498)

Signed-off-by: Jont828 <jt572@cornell.edu>
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Hannah Zhang <hannahz@nvidia.com>

feat: use v1beta1 DGDR in controller (#6498)
Signed-off-by: Jont828 <jt572@cornell.edu>
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Hannah Zhang <hannahz@nvidia.com>
d100c6e2 · Jonathan Tong · GitHub · 5a319aed · d100c6e2 · d100c6e2
Unverified Commit d100c6e2 authored Feb 25, 2026 by Jonathan Tong Committed by GitHub Feb 25, 2026
11 changed files
--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
@@ -461,7 +461,7 @@ spec:
              type: object
          type: object
      served: true
-      storage: true
+      storage: false
      subresources:
        status: {}
    - additionalPrinterColumns:
@@ -9247,6 +9247,6 @@ spec:
              type: object
          type: object
      served: true
-      storage: false
+      storage: true
      subresources:
        status: {}
--- a/deploy/helm/charts/platform/components/operator/templates/webhook-configuration.yaml
+++ b/deploy/helm/charts/platform/components/operator/templates/webhook-configuration.yaml
@@ -145,7 +145,7 @@ webhooks:
    service:
      name: {{ include "dynamo-operator.fullname" . }}-webhook-service
      namespace: {{ .Release.Namespace }}
-      path: /validate-nvidia-com-v1alpha1-dynamographdeploymentrequest
+      path: /validate-nvidia-com-v1beta1-dynamographdeploymentrequest
  failurePolicy: {{ .Values.webhook.failurePolicy }}
  name: vdynamographdeploymentrequest.kb.io
  {{- if .Values.webhook.namespaceSelector }}
@@ -161,6 +161,7 @@ webhooks:
    - nvidia.com
    apiVersions:
    - v1alpha1
+    - v1beta1
    operations:
    - CREATE
    - UPDATE

--- a/deploy/operator/api/v1alpha1/dynamographdeploymentrequest_types.go
+++ b/deploy/operator/api/v1alpha1/dynamographdeploymentrequest_types.go
@@ -279,7 +279,6 @@ type DynamoGraphDeploymentRequestStatus struct {
 //
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
-// +kubebuilder:storageversion
 // +kubebuilder:resource:shortName=dgdr
 // +kubebuilder:deprecatedversion:warning="nvidia.com/v1alpha1 DynamoGraphDeploymentRequest is deprecated; use nvidia.com/v1beta1 DynamoGraphDeploymentRequest"
 // +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`

--- a/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
+++ b/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
@@ -496,6 +496,7 @@ type DynamoGraphDeploymentRequestStatus struct {
 //
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
+// +kubebuilder:storageversion
 // +kubebuilder:resource:shortName=dgdr
 // +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`
 // +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend`

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
@@ -461,7 +461,7 @@ spec:
              type: object
          type: object
      served: true
-      storage: true
+      storage: false
      subresources:
        status: {}
    - additionalPrinterColumns:
@@ -9247,6 +9247,6 @@ spec:
              type: object
          type: object
      served: true
-      storage: false
+      storage: true
      subresources:
        status: {}
--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
--- a/deploy/operator/internal/controller/suite_test.go
+++ b/deploy/operator/internal/controller/suite_test.go
@@ -27,6 +27,7 @@ import (
 	"testing"

 	"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	"github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
@@ -102,6 +103,8 @@ var _ = BeforeSuite(func() {
 	//+kubebuilder:scaffold:scheme
 	err = v1alpha1.AddToScheme(scheme)
 	Expect(err).NotTo(HaveOccurred())
+	err = v1beta1.AddToScheme(scheme)
+	Expect(err).NotTo(HaveOccurred())
 	err = corev1.AddToScheme(scheme)
 	Expect(err).NotTo(HaveOccurred())
 	err = autoscalingv2.AddToScheme(scheme)

--- a/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest.go
+++ b/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest.go
@@ -18,31 +18,18 @@
 package validation

 import (
+	"encoding/json"
 	"errors"
 	"fmt"

-	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
-	"k8s.io/apimachinery/pkg/util/yaml"
+	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 )

-// toFloat64 converts a numeric value (int or float64) to float64.
-// Returns 0 if the value is neither int nor float64.
-func toFloat64(val any) float64 {
-	switch v := val.(type) {
-	case float64:
-		return v
-	case int:
-		return float64(v)
-	default:
-		return 0
-	}
-}
-
 // DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
 // This validator can be used by both webhooks and controllers for consistent validation.
 type DynamoGraphDeploymentRequestValidator struct {
-	request               *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+	request               *nvidiacomv1beta1.DynamoGraphDeploymentRequest
 	isClusterWideOperator bool
 	gpuDiscoveryEnabled   bool
 }
@@ -50,7 +37,7 @@ type DynamoGraphDeploymentRequestValidator struct {
 // NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest.
 // isClusterWide indicates whether the operator has cluster-wide permissions.
 // gpuDiscoveryEnabled indicates whether Helm provisioned node read access for the operator.
-func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
+func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1beta1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
 	return &DynamoGraphDeploymentRequestValidator{
 		request:               request,
 		isClusterWideOperator: isClusterWide,
@@ -61,105 +48,43 @@ func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoG
 // Validate performs stateless validation on the DynamoGraphDeploymentRequest.
 // Returns warnings and error.
 func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) {
-	var warnings admission.Warnings
 	var err error

-	// Warn about deprecated enableGpuDiscovery field
-	if v.request.Spec.EnableGPUDiscovery != nil {
-		warnings = append(warnings, "spec.enableGpuDiscovery is deprecated and will be removed in v1beta1. GPU discovery is now always attempted automatically. This field has no effect.")
-	}
-
-	// Validate profiler image is specified
-	if v.request.Spec.ProfilingConfig.ProfilerImage == "" {
-		err = errors.Join(err, errors.New("spec.profilingConfig.profilerImage is required"))
-	}
-
-	// Validate that profilingConfig.config is provided
-	if v.request.Spec.ProfilingConfig.Config == nil || len(v.request.Spec.ProfilingConfig.Config.Raw) == 0 {
-		err = errors.Join(err, errors.New("spec.profilingConfig.config is required and must not be empty"))
+	// Validate image is specified (required for the profiling job container).
+	if v.request.Spec.Image == "" {
+		err = errors.Join(err, errors.New("spec.image is required"))
 	}

-	// Note: GPU discovery is now automatic for cluster-wide operators
-	// Namespace-restricted operators automatically skip GPU discovery and require manual hardware config
-
-	// Parse config to validate structure (only if config is present)
-	if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 {
-		var config map[string]any
-		if parseErr := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); parseErr != nil {
-			err = errors.Join(err, fmt.Errorf("failed to parse spec.profilingConfig.config: %w", parseErr))
-		} else {
-			// Warn if deployment.model or engine.backend are specified in config (they will be overwritten by spec fields)
-			if engineConfig, ok := config["engine"].(map[string]any); ok {
-				if backend, ok := engineConfig["backend"].(string); ok && backend != "" && backend != v.request.Spec.Backend {
-					warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)", backend, v.request.Spec.Backend))
-				}
-			}
-			if deployment, ok := config["deployment"].(map[string]any); ok {
-				if model, ok := deployment["model"].(string); ok && model != "" && model != v.request.Spec.Model {
-					warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)", model, v.request.Spec.Model))
-				}
-			}
-		}
+	// Disallow searchStrategy: thorough with backend: auto.
+	// "thorough" sweeps more configurations and requires a concrete backend to be selected;
+	// "auto" defers backend selection and is only compatible with the "rapid" search strategy.
+	if v.request.Spec.SearchStrategy == nvidiacomv1beta1.SearchStrategyThorough &&
+		v.request.Spec.Backend == nvidiacomv1beta1.BackendTypeAuto {
+		err = errors.Join(err, fmt.Errorf(
+			"spec.searchStrategy %q is incompatible with spec.backend %q: set spec.backend to a specific backend (sglang, trtllm, or vllm)",
+			nvidiacomv1beta1.SearchStrategyThorough,
+			nvidiacomv1beta1.BackendTypeAuto,
+		))
 	}

-	// Validate GPU hardware information is available (last, so other errors are collected first)
+	// Validate GPU hardware information is available (last, so other errors are collected first).
 	if gpuErr := v.validateGPUHardwareInfo(); gpuErr != nil {
 		err = errors.Join(err, gpuErr)
 	}

-	return warnings, err
+	return nil, err
 }

 // validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
 // Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided.
 func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
-	// Parse profiling config
-	var config map[string]any
-	if v.request.Spec.ProfilingConfig.Config != nil {
-		if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
-			// Config parse errors will be caught by other validators
-			return nil
-		}
-	} else {
-		config = make(map[string]any)
-	}
-
-	// Check if manual hardware config is provided
-	hardwareVal, hasHardware := config["hardware"]
+	// Check if manual hardware config is provided via typed spec.hardware fields.
 	var hasManualHardwareConfig bool
-	if hasHardware && hardwareVal != nil {
-		if hardwareConfig, ok := hardwareVal.(map[string]any); ok {
-			// Check if essential hardware fields are provided
-			_, hasGPUModel := hardwareConfig["gpuModel"]
-			_, hasGPUVram := hardwareConfig["gpuVramMib"]
-			_, hasNumGPUs := hardwareConfig["numGpusPerNode"]
-			hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
-		}
+	if hw := v.request.Spec.Hardware; hw != nil {
+		hasManualHardwareConfig = hw.GPUSKU != "" || hw.VRAMMB != nil || hw.NumGPUsPerNode != nil
 	}

-	// Check if explicit GPU ranges are provided
-	var hasExplicitGPURanges bool
-	if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil {
-		if engineConfig, ok := engineVal.(map[string]any); ok {
-			minGPUs, hasMin := engineConfig["minNumGpusPerEngine"]
-			maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"]
-			// Validate explicit GPU ranges
-			if hasMin && hasMax {
-				minVal := toFloat64(minGPUs)
-				maxVal := toFloat64(maxGPUs)
-
-				// Validate that min <= max
-				if minVal > maxVal {
-					return fmt.Errorf("invalid GPU range: minNumGpusPerEngine (%v) cannot be greater than maxNumGpusPerEngine (%v)",
-						minVal, maxVal)
-				}
-
-				hasExplicitGPURanges = minVal > 0 && maxVal > 0
-			}
-		}
-	}
-
-	if hasManualHardwareConfig || hasExplicitGPURanges {
+	if hasManualHardwareConfig {
 		return nil
 	}

@@ -169,13 +94,40 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error
 		return nil
 	}

-	return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.profilingConfig.config)")
+	return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.hardware)")
 }

 // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
 // Returns warnings and error.
-func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
-	// TODO: Add update validation logic for DynamoGraphDeploymentRequest
-	// Placeholder for future immutability checks
+func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
+	// Reject spec changes when the resource is in a non-editable lifecycle phase.
+	// During Profiling, Deploying, or Deployed the controller is actively reconciling
+	// the resource and spec mutations would conflict with in-flight operations.
+	phase := old.Status.Phase
+	immutablePhases := map[nvidiacomv1beta1.DGDRPhase]bool{
+		nvidiacomv1beta1.DGDRPhaseProfiling: true,
+		nvidiacomv1beta1.DGDRPhaseDeploying: true,
+		nvidiacomv1beta1.DGDRPhaseDeployed:  true,
+	}
+
+	if immutablePhases[phase] {
+		// Compare specs — if they differ, reject the update.
+		oldSpec := old.Spec
+		newSpec := v.request.Spec
+		if !specEqual(oldSpec, newSpec) {
+			return nil, fmt.Errorf("spec updates are forbidden while the resource is in phase %q; delete and recreate the resource to change its spec", phase)
+		}
+	}
+
 	return nil, nil
 }
+
+// specEqual reports whether two DynamoGraphDeploymentRequestSpec values marshal to identical JSON; a marshal error on either side is treated as not equal.
+func specEqual(a, b nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec) bool {
+	aj, err1 := json.Marshal(a)
+	bj, err2 := json.Marshal(b)
+	if err1 != nil || err2 != nil {
+		return false
+	}
+	return string(aj) == string(bj)
+}
--- a/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest_handler.go
+++ b/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest_handler.go
@@ -21,7 +21,7 @@ import (
 	"context"
 	"fmt"

-	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
 	internalwebhook "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook"
@@ -34,7 +34,7 @@ import (
 const (
 	// DynamoGraphDeploymentRequestWebhookName is the name of the validating webhook handler for DynamoGraphDeploymentRequest.
 	DynamoGraphDeploymentRequestWebhookName = "dynamographdeploymentrequest-validating-webhook"
-	dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1alpha1-dynamographdeploymentrequest"
+	dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1beta1-dynamographdeploymentrequest"
 )

 // DynamoGraphDeploymentRequestHandler is a handler for validating DynamoGraphDeploymentRequest resources.
@@ -137,15 +137,15 @@ func (h *DynamoGraphDeploymentRequestHandler) RegisterWithManager(mgr manager.Ma
 	observedValidator := observability.NewObservedValidator(leaseAwareValidator, consts.ResourceTypeDynamoGraphDeploymentRequest)

 	webhook := admission.
-		WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}, observedValidator).
+		WithCustomValidator(mgr.GetScheme(), &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}, observedValidator).
 		WithRecoverPanic(true)
 	mgr.GetWebhookServer().Register(dynamoGraphDeploymentRequestWebhookPath, webhook)
 	return nil
 }

 // castToDynamoGraphDeploymentRequest attempts to cast a runtime.Object to a DynamoGraphDeploymentRequest.
-func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1alpha1.DynamoGraphDeploymentRequest, error) {
-	request, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeploymentRequest)
+func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1beta1.DynamoGraphDeploymentRequest, error) {
+	request, ok := obj.(*nvidiacomv1beta1.DynamoGraphDeploymentRequest)
 	if !ok {
 		return nil, fmt.Errorf("expected DynamoGraphDeploymentRequest but got %T", obj)
 	}

--- a/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest_test.go
+++ b/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest_test.go