Unverified commit d100c6e2, authored by Jonathan Tong and committed by GitHub
Browse files

feat: use v1beta1 DGDR in controller (#6498)


Signed-off-by: Jont828 <jt572@cornell.edu>
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Hannah Zhang <hannahz@nvidia.com>
parent 5a319aed
...@@ -461,7 +461,7 @@ spec: ...@@ -461,7 +461,7 @@ spec:
type: object type: object
type: object type: object
served: true served: true
storage: true storage: false
subresources: subresources:
status: {} status: {}
- additionalPrinterColumns: - additionalPrinterColumns:
...@@ -9247,6 +9247,6 @@ spec: ...@@ -9247,6 +9247,6 @@ spec:
type: object type: object
type: object type: object
served: true served: true
storage: false storage: true
subresources: subresources:
status: {} status: {}
...@@ -145,7 +145,7 @@ webhooks: ...@@ -145,7 +145,7 @@ webhooks:
service: service:
name: {{ include "dynamo-operator.fullname" . }}-webhook-service name: {{ include "dynamo-operator.fullname" . }}-webhook-service
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
path: /validate-nvidia-com-v1alpha1-dynamographdeploymentrequest path: /validate-nvidia-com-v1beta1-dynamographdeploymentrequest
failurePolicy: {{ .Values.webhook.failurePolicy }} failurePolicy: {{ .Values.webhook.failurePolicy }}
name: vdynamographdeploymentrequest.kb.io name: vdynamographdeploymentrequest.kb.io
{{- if .Values.webhook.namespaceSelector }} {{- if .Values.webhook.namespaceSelector }}
...@@ -161,6 +161,7 @@ webhooks: ...@@ -161,6 +161,7 @@ webhooks:
- nvidia.com - nvidia.com
apiVersions: apiVersions:
- v1alpha1 - v1alpha1
- v1beta1
operations: operations:
- CREATE - CREATE
- UPDATE - UPDATE
......
...@@ -279,7 +279,6 @@ type DynamoGraphDeploymentRequestStatus struct { ...@@ -279,7 +279,6 @@ type DynamoGraphDeploymentRequestStatus struct {
// //
// +kubebuilder:object:root=true // +kubebuilder:object:root=true
// +kubebuilder:subresource:status // +kubebuilder:subresource:status
// +kubebuilder:storageversion
// +kubebuilder:resource:shortName=dgdr // +kubebuilder:resource:shortName=dgdr
// +kubebuilder:deprecatedversion:warning="nvidia.com/v1alpha1 DynamoGraphDeploymentRequest is deprecated; use nvidia.com/v1beta1 DynamoGraphDeploymentRequest" // +kubebuilder:deprecatedversion:warning="nvidia.com/v1alpha1 DynamoGraphDeploymentRequest is deprecated; use nvidia.com/v1beta1 DynamoGraphDeploymentRequest"
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model` // +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`
......
...@@ -496,6 +496,7 @@ type DynamoGraphDeploymentRequestStatus struct { ...@@ -496,6 +496,7 @@ type DynamoGraphDeploymentRequestStatus struct {
// //
// +kubebuilder:object:root=true // +kubebuilder:object:root=true
// +kubebuilder:subresource:status // +kubebuilder:subresource:status
// +kubebuilder:storageversion
// +kubebuilder:resource:shortName=dgdr // +kubebuilder:resource:shortName=dgdr
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model` // +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`
// +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend` // +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend`
......
...@@ -461,7 +461,7 @@ spec: ...@@ -461,7 +461,7 @@ spec:
type: object type: object
type: object type: object
served: true served: true
storage: true storage: false
subresources: subresources:
status: {} status: {}
- additionalPrinterColumns: - additionalPrinterColumns:
...@@ -9247,6 +9247,6 @@ spec: ...@@ -9247,6 +9247,6 @@ spec:
type: object type: object
type: object type: object
served: true served: true
storage: false storage: true
subresources: subresources:
status: {} status: {}
...@@ -27,6 +27,7 @@ import ( ...@@ -27,6 +27,7 @@ import (
"testing" "testing"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
. "github.com/onsi/ginkgo/v2" . "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
...@@ -102,6 +103,8 @@ var _ = BeforeSuite(func() { ...@@ -102,6 +103,8 @@ var _ = BeforeSuite(func() {
//+kubebuilder:scaffold:scheme //+kubebuilder:scaffold:scheme
err = v1alpha1.AddToScheme(scheme) err = v1alpha1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
err = v1beta1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred())
err = corev1.AddToScheme(scheme) err = corev1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
err = autoscalingv2.AddToScheme(scheme) err = autoscalingv2.AddToScheme(scheme)
......
...@@ -18,31 +18,18 @@ ...@@ -18,31 +18,18 @@
package validation package validation
import ( import (
"encoding/json"
"errors" "errors"
"fmt" "fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"k8s.io/apimachinery/pkg/util/yaml"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission" "sigs.k8s.io/controller-runtime/pkg/webhook/admission"
) )
// toFloat64 converts a numeric value (int or float64) to float64.
// Returns 0 if the value is neither int nor float64.
func toFloat64(val any) float64 {
switch v := val.(type) {
case float64:
return v
case int:
return float64(v)
default:
return 0
}
}
// DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources. // DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
// This validator can be used by both webhooks and controllers for consistent validation. // This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentRequestValidator struct { type DynamoGraphDeploymentRequestValidator struct {
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest request *nvidiacomv1beta1.DynamoGraphDeploymentRequest
isClusterWideOperator bool isClusterWideOperator bool
gpuDiscoveryEnabled bool gpuDiscoveryEnabled bool
} }
...@@ -50,7 +37,7 @@ type DynamoGraphDeploymentRequestValidator struct { ...@@ -50,7 +37,7 @@ type DynamoGraphDeploymentRequestValidator struct {
// NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest. // NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest.
// isClusterWide indicates whether the operator has cluster-wide permissions. // isClusterWide indicates whether the operator has cluster-wide permissions.
// gpuDiscoveryEnabled indicates whether Helm provisioned node read access for the operator. // gpuDiscoveryEnabled indicates whether Helm provisioned node read access for the operator.
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator { func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1beta1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
return &DynamoGraphDeploymentRequestValidator{ return &DynamoGraphDeploymentRequestValidator{
request: request, request: request,
isClusterWideOperator: isClusterWide, isClusterWideOperator: isClusterWide,
...@@ -61,105 +48,43 @@ func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoG ...@@ -61,105 +48,43 @@ func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoG
// Validate performs stateless validation on the DynamoGraphDeploymentRequest. // Validate performs stateless validation on the DynamoGraphDeploymentRequest.
// Returns warnings and error. // Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) { func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) {
var warnings admission.Warnings
var err error var err error
// Warn about deprecated enableGpuDiscovery field // Validate image is specified (required for the profiling job container).
if v.request.Spec.EnableGPUDiscovery != nil { if v.request.Spec.Image == "" {
warnings = append(warnings, "spec.enableGpuDiscovery is deprecated and will be removed in v1beta1. GPU discovery is now always attempted automatically. This field has no effect.") err = errors.Join(err, errors.New("spec.image is required"))
}
// Validate profiler image is specified
if v.request.Spec.ProfilingConfig.ProfilerImage == "" {
err = errors.Join(err, errors.New("spec.profilingConfig.profilerImage is required"))
}
// Validate that profilingConfig.config is provided
if v.request.Spec.ProfilingConfig.Config == nil || len(v.request.Spec.ProfilingConfig.Config.Raw) == 0 {
err = errors.Join(err, errors.New("spec.profilingConfig.config is required and must not be empty"))
} }
// Note: GPU discovery is now automatic for cluster-wide operators // Disallow searchStrategy: thorough with backend: auto.
// Namespace-restricted operators automatically skip GPU discovery and require manual hardware config // "thorough" sweeps more configurations and requires a concrete backend to be selected;
// "auto" defers backend selection and is only compatible with the "rapid" search strategy.
// Parse config to validate structure (only if config is present) if v.request.Spec.SearchStrategy == nvidiacomv1beta1.SearchStrategyThorough &&
if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 { v.request.Spec.Backend == nvidiacomv1beta1.BackendTypeAuto {
var config map[string]any err = errors.Join(err, fmt.Errorf(
if parseErr := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); parseErr != nil { "spec.searchStrategy %q is incompatible with spec.backend %q: set spec.backend to a specific backend (sglang, trtllm, or vllm)",
err = errors.Join(err, fmt.Errorf("failed to parse spec.profilingConfig.config: %w", parseErr)) nvidiacomv1beta1.SearchStrategyThorough,
} else { nvidiacomv1beta1.BackendTypeAuto,
// Warn if deployment.model or engine.backend are specified in config (they will be overwritten by spec fields) ))
if engineConfig, ok := config["engine"].(map[string]any); ok {
if backend, ok := engineConfig["backend"].(string); ok && backend != "" && backend != v.request.Spec.Backend {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)", backend, v.request.Spec.Backend))
}
}
if deployment, ok := config["deployment"].(map[string]any); ok {
if model, ok := deployment["model"].(string); ok && model != "" && model != v.request.Spec.Model {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)", model, v.request.Spec.Model))
}
}
}
} }
// Validate GPU hardware information is available (last, so other errors are collected first) // Validate GPU hardware information is available (last, so other errors are collected first).
if gpuErr := v.validateGPUHardwareInfo(); gpuErr != nil { if gpuErr := v.validateGPUHardwareInfo(); gpuErr != nil {
err = errors.Join(err, gpuErr) err = errors.Join(err, gpuErr)
} }
return warnings, err return nil, err
} }
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling. // validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided. // Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided.
func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error { func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
// Parse profiling config // Check if manual hardware config is provided via typed spec.hardware fields.
var config map[string]any
if v.request.Spec.ProfilingConfig.Config != nil {
if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught by other validators
return nil
}
} else {
config = make(map[string]any)
}
// Check if manual hardware config is provided
hardwareVal, hasHardware := config["hardware"]
var hasManualHardwareConfig bool var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil { if hw := v.request.Spec.Hardware; hw != nil {
if hardwareConfig, ok := hardwareVal.(map[string]any); ok { hasManualHardwareConfig = hw.GPUSKU != "" || hw.VRAMMB != nil || hw.NumGPUsPerNode != nil
// Check if essential hardware fields are provided
_, hasGPUModel := hardwareConfig["gpuModel"]
_, hasGPUVram := hardwareConfig["gpuVramMib"]
_, hasNumGPUs := hardwareConfig["numGpusPerNode"]
hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
}
} }
// Check if explicit GPU ranges are provided if hasManualHardwareConfig {
var hasExplicitGPURanges bool
if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]any); ok {
minGPUs, hasMin := engineConfig["minNumGpusPerEngine"]
maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"]
// Validate explicit GPU ranges
if hasMin && hasMax {
minVal := toFloat64(minGPUs)
maxVal := toFloat64(maxGPUs)
// Validate that min <= max
if minVal > maxVal {
return fmt.Errorf("invalid GPU range: minNumGpusPerEngine (%v) cannot be greater than maxNumGpusPerEngine (%v)",
minVal, maxVal)
}
hasExplicitGPURanges = minVal > 0 && maxVal > 0
}
}
}
if hasManualHardwareConfig || hasExplicitGPURanges {
return nil return nil
} }
...@@ -169,13 +94,40 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error ...@@ -169,13 +94,40 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error
return nil return nil
} }
return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.profilingConfig.config)") return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.hardware)")
} }
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest. // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
// Returns warnings and error. // Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) { func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
// TODO: Add update validation logic for DynamoGraphDeploymentRequest // Reject spec changes when the resource is in a non-editable lifecycle phase.
// Placeholder for future immutability checks // During Profiling, Deploying, or Deployed the controller is actively reconciling
// the resource and spec mutations would conflict with in-flight operations.
phase := old.Status.Phase
immutablePhases := map[nvidiacomv1beta1.DGDRPhase]bool{
nvidiacomv1beta1.DGDRPhaseProfiling: true,
nvidiacomv1beta1.DGDRPhaseDeploying: true,
nvidiacomv1beta1.DGDRPhaseDeployed: true,
}
if immutablePhases[phase] {
// Compare specs — if they differ, reject the update.
oldSpec := old.Spec
newSpec := v.request.Spec
if !specEqual(oldSpec, newSpec) {
return nil, fmt.Errorf("spec updates are forbidden while the resource is in phase %q; delete and recreate the resource to change its spec", phase)
}
}
return nil, nil return nil, nil
} }
// specEqual reports whether two DynamoGraphDeploymentRequestSpec values
// serialize to the same JSON document.
//
// Each spec is marshaled with encoding/json and the resulting bytes are
// compared as strings. If either value cannot be marshaled the specs are
// reported as not equal.
func specEqual(a, b nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec) bool {
	left, leftErr := json.Marshal(a)
	if leftErr != nil {
		return false
	}

	right, rightErr := json.Marshal(b)
	if rightErr != nil {
		return false
	}

	return string(left) == string(right)
}
...@@ -21,7 +21,7 @@ import ( ...@@ -21,7 +21,7 @@ import (
"context" "context"
"fmt" "fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability" "github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook" internalwebhook "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook"
...@@ -34,7 +34,7 @@ import ( ...@@ -34,7 +34,7 @@ import (
const ( const (
// DynamoGraphDeploymentRequestWebhookName is the name of the validating webhook handler for DynamoGraphDeploymentRequest. // DynamoGraphDeploymentRequestWebhookName is the name of the validating webhook handler for DynamoGraphDeploymentRequest.
DynamoGraphDeploymentRequestWebhookName = "dynamographdeploymentrequest-validating-webhook" DynamoGraphDeploymentRequestWebhookName = "dynamographdeploymentrequest-validating-webhook"
dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1alpha1-dynamographdeploymentrequest" dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1beta1-dynamographdeploymentrequest"
) )
// DynamoGraphDeploymentRequestHandler is a handler for validating DynamoGraphDeploymentRequest resources. // DynamoGraphDeploymentRequestHandler is a handler for validating DynamoGraphDeploymentRequest resources.
...@@ -137,15 +137,15 @@ func (h *DynamoGraphDeploymentRequestHandler) RegisterWithManager(mgr manager.Ma ...@@ -137,15 +137,15 @@ func (h *DynamoGraphDeploymentRequestHandler) RegisterWithManager(mgr manager.Ma
observedValidator := observability.NewObservedValidator(leaseAwareValidator, consts.ResourceTypeDynamoGraphDeploymentRequest) observedValidator := observability.NewObservedValidator(leaseAwareValidator, consts.ResourceTypeDynamoGraphDeploymentRequest)
webhook := admission. webhook := admission.
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}, observedValidator). WithCustomValidator(mgr.GetScheme(), &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}, observedValidator).
WithRecoverPanic(true) WithRecoverPanic(true)
mgr.GetWebhookServer().Register(dynamoGraphDeploymentRequestWebhookPath, webhook) mgr.GetWebhookServer().Register(dynamoGraphDeploymentRequestWebhookPath, webhook)
return nil return nil
} }
// castToDynamoGraphDeploymentRequest attempts to cast a runtime.Object to a DynamoGraphDeploymentRequest. // castToDynamoGraphDeploymentRequest attempts to cast a runtime.Object to a DynamoGraphDeploymentRequest.
func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1alpha1.DynamoGraphDeploymentRequest, error) { func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1beta1.DynamoGraphDeploymentRequest, error) {
request, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeploymentRequest) request, ok := obj.(*nvidiacomv1beta1.DynamoGraphDeploymentRequest)
if !ok { if !ok {
return nil, fmt.Errorf("expected DynamoGraphDeploymentRequest but got %T", obj) return nil, fmt.Errorf("expected DynamoGraphDeploymentRequest but got %T", obj)
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment