Unverified Commit d100c6e2 authored by Jonathan Tong's avatar Jonathan Tong Committed by GitHub
Browse files

feat: use v1beta1 DGDR in controller (#6498)


Signed-off-by: default avatarJont828 <jt572@cornell.edu>
Signed-off-by: default avatarHannah Zhang <hannahz@nvidia.com>
Co-authored-by: default avatarCopilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: default avatarHannah Zhang <hannahz@nvidia.com>
parent 5a319aed
......@@ -461,7 +461,7 @@ spec:
type: object
type: object
served: true
storage: true
storage: false
subresources:
status: {}
- additionalPrinterColumns:
......@@ -9247,6 +9247,6 @@ spec:
type: object
type: object
served: true
storage: false
storage: true
subresources:
status: {}
......@@ -145,7 +145,7 @@ webhooks:
service:
name: {{ include "dynamo-operator.fullname" . }}-webhook-service
namespace: {{ .Release.Namespace }}
path: /validate-nvidia-com-v1alpha1-dynamographdeploymentrequest
path: /validate-nvidia-com-v1beta1-dynamographdeploymentrequest
failurePolicy: {{ .Values.webhook.failurePolicy }}
name: vdynamographdeploymentrequest.kb.io
{{- if .Values.webhook.namespaceSelector }}
......@@ -161,6 +161,7 @@ webhooks:
- nvidia.com
apiVersions:
- v1alpha1
- v1beta1
operations:
- CREATE
- UPDATE
......
......@@ -279,7 +279,6 @@ type DynamoGraphDeploymentRequestStatus struct {
//
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:storageversion
// +kubebuilder:resource:shortName=dgdr
// +kubebuilder:deprecatedversion:warning="nvidia.com/v1alpha1 DynamoGraphDeploymentRequest is deprecated; use nvidia.com/v1beta1 DynamoGraphDeploymentRequest"
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`
......
......@@ -496,6 +496,7 @@ type DynamoGraphDeploymentRequestStatus struct {
//
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:storageversion
// +kubebuilder:resource:shortName=dgdr
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`
// +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend`
......
......@@ -461,7 +461,7 @@ spec:
type: object
type: object
served: true
storage: true
storage: false
subresources:
status: {}
- additionalPrinterColumns:
......@@ -9247,6 +9247,6 @@ spec:
type: object
type: object
served: true
storage: false
storage: true
subresources:
status: {}
......@@ -27,6 +27,7 @@ import (
"testing"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
......@@ -102,6 +103,8 @@ var _ = BeforeSuite(func() {
//+kubebuilder:scaffold:scheme
err = v1alpha1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred())
err = v1beta1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred())
err = corev1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred())
err = autoscalingv2.AddToScheme(scheme)
......
......@@ -18,31 +18,18 @@
package validation
import (
"encoding/json"
"errors"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/util/yaml"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// toFloat64 converts a numeric value (int or float64) to float64.
// Returns 0 if the value is neither int nor float64.
func toFloat64(val any) float64 {
switch v := val.(type) {
case float64:
return v
case int:
return float64(v)
default:
return 0
}
}
// DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
// This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentRequestValidator struct {
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
request *nvidiacomv1beta1.DynamoGraphDeploymentRequest
isClusterWideOperator bool
gpuDiscoveryEnabled bool
}
......@@ -50,7 +37,7 @@ type DynamoGraphDeploymentRequestValidator struct {
// NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest.
// isClusterWide indicates whether the operator has cluster-wide permissions.
// gpuDiscoveryEnabled indicates whether Helm provisioned node read access for the operator.
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1beta1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
return &DynamoGraphDeploymentRequestValidator{
request: request,
isClusterWideOperator: isClusterWide,
......@@ -61,105 +48,43 @@ func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoG
// Validate performs stateless validation on the DynamoGraphDeploymentRequest.
// Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) {
var warnings admission.Warnings
var err error
// Warn about deprecated enableGpuDiscovery field
if v.request.Spec.EnableGPUDiscovery != nil {
warnings = append(warnings, "spec.enableGpuDiscovery is deprecated and will be removed in v1beta1. GPU discovery is now always attempted automatically. This field has no effect.")
}
// Validate profiler image is specified
if v.request.Spec.ProfilingConfig.ProfilerImage == "" {
err = errors.Join(err, errors.New("spec.profilingConfig.profilerImage is required"))
}
// Validate that profilingConfig.config is provided
if v.request.Spec.ProfilingConfig.Config == nil || len(v.request.Spec.ProfilingConfig.Config.Raw) == 0 {
err = errors.Join(err, errors.New("spec.profilingConfig.config is required and must not be empty"))
// Validate image is specified (required for the profiling job container).
if v.request.Spec.Image == "" {
err = errors.Join(err, errors.New("spec.image is required"))
}
// Note: GPU discovery is now automatic for cluster-wide operators
// Namespace-restricted operators automatically skip GPU discovery and require manual hardware config
// Parse config to validate structure (only if config is present)
if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 {
var config map[string]any
if parseErr := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); parseErr != nil {
err = errors.Join(err, fmt.Errorf("failed to parse spec.profilingConfig.config: %w", parseErr))
} else {
// Warn if deployment.model or engine.backend are specified in config (they will be overwritten by spec fields)
if engineConfig, ok := config["engine"].(map[string]any); ok {
if backend, ok := engineConfig["backend"].(string); ok && backend != "" && backend != v.request.Spec.Backend {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)", backend, v.request.Spec.Backend))
}
}
if deployment, ok := config["deployment"].(map[string]any); ok {
if model, ok := deployment["model"].(string); ok && model != "" && model != v.request.Spec.Model {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)", model, v.request.Spec.Model))
}
}
}
// Disallow searchStrategy: thorough with backend: auto.
// "thorough" sweeps more configurations and requires a concrete backend to be selected;
// "auto" defers backend selection and is only compatible with the "rapid" search strategy.
if v.request.Spec.SearchStrategy == nvidiacomv1beta1.SearchStrategyThorough &&
v.request.Spec.Backend == nvidiacomv1beta1.BackendTypeAuto {
err = errors.Join(err, fmt.Errorf(
"spec.searchStrategy %q is incompatible with spec.backend %q: set spec.backend to a specific backend (sglang, trtllm, or vllm)",
nvidiacomv1beta1.SearchStrategyThorough,
nvidiacomv1beta1.BackendTypeAuto,
))
}
// Validate GPU hardware information is available (last, so other errors are collected first)
// Validate GPU hardware information is available (last, so other errors are collected first).
if gpuErr := v.validateGPUHardwareInfo(); gpuErr != nil {
err = errors.Join(err, gpuErr)
}
return warnings, err
return nil, err
}
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided.
func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
// Parse profiling config
var config map[string]any
if v.request.Spec.ProfilingConfig.Config != nil {
if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught by other validators
return nil
}
} else {
config = make(map[string]any)
}
// Check if manual hardware config is provided
hardwareVal, hasHardware := config["hardware"]
// Check if manual hardware config is provided via typed spec.hardware fields.
var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil {
if hardwareConfig, ok := hardwareVal.(map[string]any); ok {
// Check if essential hardware fields are provided
_, hasGPUModel := hardwareConfig["gpuModel"]
_, hasGPUVram := hardwareConfig["gpuVramMib"]
_, hasNumGPUs := hardwareConfig["numGpusPerNode"]
hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
}
if hw := v.request.Spec.Hardware; hw != nil {
hasManualHardwareConfig = hw.GPUSKU != "" || hw.VRAMMB != nil || hw.NumGPUsPerNode != nil
}
// Check if explicit GPU ranges are provided
var hasExplicitGPURanges bool
if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]any); ok {
minGPUs, hasMin := engineConfig["minNumGpusPerEngine"]
maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"]
// Validate explicit GPU ranges
if hasMin && hasMax {
minVal := toFloat64(minGPUs)
maxVal := toFloat64(maxGPUs)
// Validate that min <= max
if minVal > maxVal {
return fmt.Errorf("invalid GPU range: minNumGpusPerEngine (%v) cannot be greater than maxNumGpusPerEngine (%v)",
minVal, maxVal)
}
hasExplicitGPURanges = minVal > 0 && maxVal > 0
}
}
}
if hasManualHardwareConfig || hasExplicitGPURanges {
if hasManualHardwareConfig {
return nil
}
......@@ -169,13 +94,40 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error
return nil
}
return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.profilingConfig.config)")
return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.hardware)")
}
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
// Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
// TODO: Add update validation logic for DynamoGraphDeploymentRequest
// Placeholder for future immutability checks
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
// Reject spec changes when the resource is in a non-editable lifecycle phase.
// During Profiling, Deploying, or Deployed the controller is actively reconciling
// the resource and spec mutations would conflict with in-flight operations.
phase := old.Status.Phase
immutablePhases := map[nvidiacomv1beta1.DGDRPhase]bool{
nvidiacomv1beta1.DGDRPhaseProfiling: true,
nvidiacomv1beta1.DGDRPhaseDeploying: true,
nvidiacomv1beta1.DGDRPhaseDeployed: true,
}
if immutablePhases[phase] {
// Compare specs — if they differ, reject the update.
oldSpec := old.Spec
newSpec := v.request.Spec
if !specEqual(oldSpec, newSpec) {
return nil, fmt.Errorf("spec updates are forbidden while the resource is in phase %q; delete and recreate the resource to change its spec", phase)
}
}
return nil, nil
}
// specEqual performs a JSON-round-trip comparison of two DynamoGraphDeploymentRequestSpec values.
func specEqual(a, b nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec) bool {
aj, err1 := json.Marshal(a)
bj, err2 := json.Marshal(b)
if err1 != nil || err2 != nil {
return false
}
return string(aj) == string(bj)
}
......@@ -21,7 +21,7 @@ import (
"context"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook"
......@@ -34,7 +34,7 @@ import (
const (
// DynamoGraphDeploymentRequestWebhookName is the name of the validating webhook handler for DynamoGraphDeploymentRequest.
DynamoGraphDeploymentRequestWebhookName = "dynamographdeploymentrequest-validating-webhook"
dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1alpha1-dynamographdeploymentrequest"
dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1beta1-dynamographdeploymentrequest"
)
// DynamoGraphDeploymentRequestHandler is a handler for validating DynamoGraphDeploymentRequest resources.
......@@ -137,15 +137,15 @@ func (h *DynamoGraphDeploymentRequestHandler) RegisterWithManager(mgr manager.Ma
observedValidator := observability.NewObservedValidator(leaseAwareValidator, consts.ResourceTypeDynamoGraphDeploymentRequest)
webhook := admission.
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}, observedValidator).
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}, observedValidator).
WithRecoverPanic(true)
mgr.GetWebhookServer().Register(dynamoGraphDeploymentRequestWebhookPath, webhook)
return nil
}
// castToDynamoGraphDeploymentRequest attempts to cast a runtime.Object to a DynamoGraphDeploymentRequest.
func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1alpha1.DynamoGraphDeploymentRequest, error) {
request, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeploymentRequest)
func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1beta1.DynamoGraphDeploymentRequest, error) {
request, ok := obj.(*nvidiacomv1beta1.DynamoGraphDeploymentRequest)
if !ok {
return nil, fmt.Errorf("expected DynamoGraphDeploymentRequest but got %T", obj)
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment