Unverified commit d100c6e2, authored by Jonathan Tong, committed by GitHub
Browse files

feat: use v1beta1 DGDR in controller (#6498)


Signed-off-by: Jont828 <jt572@cornell.edu>
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Hannah Zhang <hannahz@nvidia.com>
parent 5a319aed
...@@ -461,7 +461,7 @@ spec: ...@@ -461,7 +461,7 @@ spec:
type: object type: object
type: object type: object
served: true served: true
storage: true storage: false
subresources: subresources:
status: {} status: {}
- additionalPrinterColumns: - additionalPrinterColumns:
...@@ -9247,6 +9247,6 @@ spec: ...@@ -9247,6 +9247,6 @@ spec:
type: object type: object
type: object type: object
served: true served: true
storage: false storage: true
subresources: subresources:
status: {} status: {}
...@@ -145,7 +145,7 @@ webhooks: ...@@ -145,7 +145,7 @@ webhooks:
service: service:
name: {{ include "dynamo-operator.fullname" . }}-webhook-service name: {{ include "dynamo-operator.fullname" . }}-webhook-service
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
path: /validate-nvidia-com-v1alpha1-dynamographdeploymentrequest path: /validate-nvidia-com-v1beta1-dynamographdeploymentrequest
failurePolicy: {{ .Values.webhook.failurePolicy }} failurePolicy: {{ .Values.webhook.failurePolicy }}
name: vdynamographdeploymentrequest.kb.io name: vdynamographdeploymentrequest.kb.io
{{- if .Values.webhook.namespaceSelector }} {{- if .Values.webhook.namespaceSelector }}
...@@ -161,6 +161,7 @@ webhooks: ...@@ -161,6 +161,7 @@ webhooks:
- nvidia.com - nvidia.com
apiVersions: apiVersions:
- v1alpha1 - v1alpha1
- v1beta1
operations: operations:
- CREATE - CREATE
- UPDATE - UPDATE
......
...@@ -279,7 +279,6 @@ type DynamoGraphDeploymentRequestStatus struct { ...@@ -279,7 +279,6 @@ type DynamoGraphDeploymentRequestStatus struct {
// //
// +kubebuilder:object:root=true // +kubebuilder:object:root=true
// +kubebuilder:subresource:status // +kubebuilder:subresource:status
// +kubebuilder:storageversion
// +kubebuilder:resource:shortName=dgdr // +kubebuilder:resource:shortName=dgdr
// +kubebuilder:deprecatedversion:warning="nvidia.com/v1alpha1 DynamoGraphDeploymentRequest is deprecated; use nvidia.com/v1beta1 DynamoGraphDeploymentRequest" // +kubebuilder:deprecatedversion:warning="nvidia.com/v1alpha1 DynamoGraphDeploymentRequest is deprecated; use nvidia.com/v1beta1 DynamoGraphDeploymentRequest"
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model` // +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`
......
...@@ -496,6 +496,7 @@ type DynamoGraphDeploymentRequestStatus struct { ...@@ -496,6 +496,7 @@ type DynamoGraphDeploymentRequestStatus struct {
// //
// +kubebuilder:object:root=true // +kubebuilder:object:root=true
// +kubebuilder:subresource:status // +kubebuilder:subresource:status
// +kubebuilder:storageversion
// +kubebuilder:resource:shortName=dgdr // +kubebuilder:resource:shortName=dgdr
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model` // +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`
// +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend` // +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend`
......
...@@ -461,7 +461,7 @@ spec: ...@@ -461,7 +461,7 @@ spec:
type: object type: object
type: object type: object
served: true served: true
storage: true storage: false
subresources: subresources:
status: {} status: {}
- additionalPrinterColumns: - additionalPrinterColumns:
...@@ -9247,6 +9247,6 @@ spec: ...@@ -9247,6 +9247,6 @@ spec:
type: object type: object
type: object type: object
served: true served: true
storage: false storage: true
subresources: subresources:
status: {} status: {}
...@@ -20,6 +20,8 @@ package controller ...@@ -20,6 +20,8 @@ package controller
import ( import (
"bytes" "bytes"
"context" "context"
"encoding/json"
"errors"
"fmt" "fmt"
"io" "io"
"text/template" "text/template"
...@@ -45,7 +47,8 @@ import ( ...@@ -45,7 +47,8 @@ import (
sigsyaml "sigs.k8s.io/yaml" sigsyaml "sigs.k8s.io/yaml"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1" configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" dgdv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common" commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu" "github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
...@@ -53,37 +56,6 @@ import ( ...@@ -53,37 +56,6 @@ import (
) )
const ( const (
// Condition types
ConditionTypeValidation = "Validation"
ConditionTypeProfiling = "Profiling"
ConditionTypeSpecGenerated = "SpecGenerated"
ConditionTypeDeploymentReady = "DeploymentReady"
// Event reasons
EventReasonInitialized = "Initialized"
EventReasonValidationFailed = "ValidationFailed"
EventReasonProfilingJobCreated = "ProfilingJobCreated"
EventReasonProfilingJobFailed = "ProfilingJobFailed"
EventReasonAIConfiguratorFailed = "AIConfiguratorFailed"
EventReasonSpecGenerated = "SpecGenerated"
EventReasonSpecChangeRejected = "SpecChangeRejected"
EventReasonDeploymentCreated = "DeploymentCreated"
EventReasonDeploymentReady = "DeploymentReady"
EventReasonDeploymentDegraded = "DeploymentDegraded"
EventReasonDeploymentDeleted = "DeploymentDeleted"
// Label keys
LabelApp = "app"
LabelDGDR = "dgdr"
LabelDGDRName = "dgdr.nvidia.com/name"
LabelDGDRNamespace = "dgdr.nvidia.com/namespace"
LabelManagedBy = "nvidia.com/managed-by"
// Label values
LabelValueDynamoProfiler = "dynamo-profiler"
LabelValueAICProfiler = "aic-profiler"
LabelValueDynamoOperator = "dynamo-operator"
// Job naming // Job naming
JobNamePrefixOnline = "profile-online-" JobNamePrefixOnline = "profile-online-"
JobNamePrefixAIC = "profile-aic-" JobNamePrefixAIC = "profile-aic-"
...@@ -101,6 +73,12 @@ const ( ...@@ -101,6 +73,12 @@ const (
// Annotation keys // Annotation keys
AnnotationAdditionalResources = "dgdr.nvidia.com/additional-resources" AnnotationAdditionalResources = "dgdr.nvidia.com/additional-resources"
// Annotation keys for v1alpha1 round-trip compatibility.
// The conversion layer stores v1alpha1 fields that have no v1beta1 spec equivalent
// as annotations so the controller can still honour them for converted resources.
AnnotationConfigMapRef = "nvidia.com/dgdr-config-map-ref"
AnnotationOutputPVC = "nvidia.com/dgdr-output-pvc"
// Size limits // Size limits
MaxAnnotationSize = 250000 // ~250KB, below K8s 256KB limit MaxAnnotationSize = 250000 // ~250KB, below K8s 256KB limit
...@@ -108,16 +86,16 @@ const ( ...@@ -108,16 +86,16 @@ const (
SidecarImage = "bitnami/kubectl:latest" SidecarImage = "bitnami/kubectl:latest"
// Volume names // Volume names
VolumeNameProfilingConfig = "profiling-config"
VolumeNameProfilingOutput = "profiling-output" VolumeNameProfilingOutput = "profiling-output"
VolumeNameProfilingConfig = "profiling-config"
VolumeNameModelCache = "model-cache" VolumeNameModelCache = "model-cache"
// Volume paths // Volume paths
ProfilingOutputPath = "/data" ProfilingOutputPath = "/data"
ProfilingOutputFile = "config_with_planner.yaml" ProfilingOutputFile = "config_with_planner.yaml"
ProfilingOutputFileMocker = "mocker_config_with_planner.yaml" ProfilingOutputFileMocker = "mocker_config_with_planner.yaml"
ProfilingConfigPath = "/config" ProfilingConfigMountPath = "/config"
ProfilingConfigFile = "disagg.yaml" ProfilingConfigDefaultKey = "disagg.yaml"
DefaultModelCacheMountPath = "/opt/model-cache" DefaultModelCacheMountPath = "/opt/model-cache"
// Command line arguments // Command line arguments
...@@ -139,7 +117,7 @@ const ( ...@@ -139,7 +117,7 @@ const (
MessageDeploymentDegraded = "DynamoGraphDeployment %s degraded from Ready to %s" MessageDeploymentDegraded = "DynamoGraphDeployment %s degraded from Ready to %s"
MessageDeploymentDeleted = "DGD %s was deleted. DGDR will not recreate it. Delete this DGDR and create a new one to redeploy." MessageDeploymentDeleted = "DGD %s was deleted. DGDR will not recreate it. Delete this DGDR and create a new one to redeploy."
MessageInvalidState = "Invalid state" MessageInvalidState = "Invalid state"
MessageSpecChangeRejected = "Cannot modify spec in state '%s'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead." MessageSpecChangeRejected = "Cannot modify spec in phase '%s'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead."
MessageJobCreationFailed = "JobCreationFailed" MessageJobCreationFailed = "JobCreationFailed"
MessageDeploymentCreationFailed = "DeploymentCreationFailed" MessageDeploymentCreationFailed = "DeploymentCreationFailed"
MessageResultsRetrievalFailed = "ResultsRetrievalFailed" MessageResultsRetrievalFailed = "ResultsRetrievalFailed"
...@@ -149,38 +127,6 @@ const ( ...@@ -149,38 +127,6 @@ const (
MessageConfigMapNotFound = "ConfigMap %s not found in namespace %s" MessageConfigMapNotFound = "ConfigMap %s not found in namespace %s"
MessageConfigMapKeyNotFound = "key %s not found in ConfigMap %s" MessageConfigMapKeyNotFound = "key %s not found in ConfigMap %s"
MessageModelCachePVCNotFound = "model cache PVC %s not found in namespace %s" MessageModelCachePVCNotFound = "model cache PVC %s not found in namespace %s"
// Validation messages
ValidationErrorModelRequired = "model is required"
ValidationErrorITLPositive = "sla.itl must be positive"
ValidationErrorTTFTPositive = "sla.ttft must be positive"
ValidationErrorInvalidBackend = "invalid backend: %s (must be vllm, sglang, or trtllm)"
// Valid backend values
BackendVLLM = "vllm"
BackendSGLang = "sglang"
BackendTRTLLM = "trtllm"
// Profiling config field names for v1alpha1; note: will be removed in v1beta1
ConfigKeyDeployment = "deployment"
ConfigKeyModelCache = "modelCache"
ConfigKeyPVCName = "pvcName"
ConfigKeyPVCPath = "pvcPath"
ConfigKeyMountPath = "mountPath"
ConfigKeyHardware = "hardware"
ConfigKeyEngine = "engine"
ConfigKeyOutputDir = "output_dir"
ConfigKeyNumGpusPerNode = "numGpusPerNode"
ConfigKeyGPUModel = "gpuModel"
ConfigKeyGPUVramMib = "gpuVramMib"
ConfigKeySystem = "system"
ConfigKeyMinNumGpusPerEng = "minNumGpusPerEngine"
ConfigKeyMaxNumGpusPerEng = "maxNumGpusPerEngine"
ConfigKeyBackend = "backend"
ConfigKeyConfig = "config"
ConfigKeyNamespace = "namespace"
ConfigKeyModel = "model"
ConfigKeyDGDImage = "dgd_image"
) )
// shell script template for the output copier sidecar // shell script template for the output copier sidecar
...@@ -318,7 +264,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecor ...@@ -318,7 +264,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecor
} }
// FinalizeResource implements commonController.Finalizer interface // FinalizeResource implements commonController.Finalizer interface
func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error { func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("DGDR finalized successfully", "name", dgdr.Name) logger.Info("DGDR finalized successfully", "name", dgdr.Name)
...@@ -342,7 +288,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context, ...@@ -342,7 +288,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context,
logger.Info("Reconciling DynamoGraphDeploymentRequest", "name", req.Name, "namespace", req.Namespace) logger.Info("Reconciling DynamoGraphDeploymentRequest", "name", req.Name, "namespace", req.Namespace)
// Fetch the DGDR instance // Fetch the DGDR instance
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{} dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}
if err := r.Get(ctx, req.NamespacedName, dgdr); err != nil { if err := r.Get(ctx, req.NamespacedName, dgdr); err != nil {
if apierrors.IsNotFound(err) { if apierrors.IsNotFound(err) {
logger.Info("DGDR resource not found, ignoring since object must be deleted") logger.Info("DGDR resource not found, ignoring since object must be deleted")
...@@ -365,99 +311,98 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context, ...@@ -365,99 +311,98 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context,
// Check for spec changes (immutability enforcement) // Check for spec changes (immutability enforcement)
if dgdr.Status.ObservedGeneration > 0 && dgdr.Status.ObservedGeneration != dgdr.Generation { if dgdr.Status.ObservedGeneration > 0 && dgdr.Status.ObservedGeneration != dgdr.Generation {
// Spec changed after initial processing // Spec changed after initial processing
if dgdr.Status.State == nvidiacomv1alpha1.DGDRStateProfiling || dgdr.Status.State == nvidiacomv1alpha1.DGDRStateDeploying || if dgdr.Status.Phase == nvidiacomv1beta1.DGDRPhaseProfiling || dgdr.Status.Phase == nvidiacomv1beta1.DGDRPhaseDeploying ||
dgdr.Status.State == nvidiacomv1alpha1.DGDRStateReady || dgdr.Status.State == nvidiacomv1alpha1.DGDRStateDeploymentDeleted { dgdr.Status.Phase == nvidiacomv1beta1.DGDRPhaseReady || dgdr.Status.Phase == nvidiacomv1beta1.DGDRPhaseDeployed {
logger.Info("Spec change detected in immutable state", logger.Info("Spec change detected in immutable phase",
"state", dgdr.Status.State, "phase", dgdr.Status.Phase,
"observedGeneration", dgdr.Status.ObservedGeneration, "observedGeneration", dgdr.Status.ObservedGeneration,
"currentGeneration", dgdr.Generation) "currentGeneration", dgdr.Generation)
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonSpecChangeRejected, r.Recorder.Event(dgdr, corev1.EventTypeWarning, nvidiacomv1beta1.EventReasonSpecChangeRejected,
fmt.Sprintf(MessageSpecChangeRejected, dgdr.Status.State)) fmt.Sprintf(MessageSpecChangeRejected, dgdr.Status.Phase))
// Keep the old observedGeneration to continue rejecting changes // Keep the old observedGeneration to continue rejecting changes
// No state transition - stay in current state with old spec // No phase transition - stay in current phase with old spec
return ctrl.Result{}, nil return ctrl.Result{}, nil
} }
} }
// State machine: handle different states // Phase machine: handle different phases
switch dgdr.Status.State { switch dgdr.Status.Phase {
case nvidiacomv1alpha1.DGDRStateInitializing, "": case nvidiacomv1beta1.DGDRPhasePending, "":
return r.handleInitialState(ctx, dgdr) return r.handlePendingPhase(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStatePending: case nvidiacomv1beta1.DGDRPhaseProfiling:
return r.handlePendingState(ctx, dgdr) return r.handleProfilingPhase(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStateProfiling: case nvidiacomv1beta1.DGDRPhaseDeploying:
return r.handleProfilingState(ctx, dgdr) return r.handleDeployingPhase(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStateDeploying: case nvidiacomv1beta1.DGDRPhaseReady:
return r.handleDeployingState(ctx, dgdr) return r.handleReadyPhase(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStateReady: case nvidiacomv1beta1.DGDRPhaseDeployed:
return r.handleReadyState(ctx, dgdr) return r.handleDeployedPhase(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStateDeploymentDeleted: case nvidiacomv1beta1.DGDRPhaseFailed:
return r.handleDeploymentDeletedState(ctx, dgdr) return r.handleFailedPhase(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStateFailed:
return r.handleFailedState(ctx, dgdr)
default: default:
logger.Info("Unknown state", "state", dgdr.Status.State) logger.Info("Unknown phase", "phase", dgdr.Status.Phase)
return r.updateStateAndRequeue(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, MessageInvalidState) return r.updatePhaseAndRequeue(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, MessageInvalidState)
} }
} }
// handleInitialState processes newly created DGDR resources // handlePendingPhase processes newly created or pending DGDR resources.
func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) { // When ObservedGeneration == 0, performs initial validation (merged from v1alpha1 Initializing state).
// Otherwise, starts the profiling process.
func (r *DynamoGraphDeploymentRequestReconciler) handlePendingPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("Handling initial state", "name", dgdr.Name)
// First-time processing: validate spec (merged from handleInitialState)
if dgdr.Status.ObservedGeneration == 0 {
logger.Info("Handling initial validation", "name", dgdr.Name)
// Validate the spec // Validate the spec
if err := r.validateSpec(ctx, dgdr); err != nil { if err := r.validateSpec(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonValidationFailed, err.Error()) r.Recorder.Event(dgdr, corev1.EventTypeWarning, nvidiacomv1beta1.EventReasonValidationFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeValidation, metav1.ConditionFalse, EventReasonValidationFailed, err.Error()) return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, nvidiacomv1beta1.ConditionTypeValidation, metav1.ConditionFalse, nvidiacomv1beta1.EventReasonValidationFailed, err.Error())
} }
// Set observedGeneration to track the spec we're processing // Set observedGeneration to track the spec we're processing
dgdr.Status.ObservedGeneration = dgdr.Generation dgdr.Status.ObservedGeneration = dgdr.Generation
// Populate backend in status from spec for display in kubectl output
dgdr.Status.Backend = dgdr.Spec.Backend
// Initialize status // Initialize status
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonInitialized, MessageInitialized) r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonInitialized, MessageInitialized)
return r.updateStateAndRequeue(ctx, dgdr, nvidiacomv1alpha1.DGDRStatePending, MessageInitialized) return r.updatePhaseAndRequeue(ctx, dgdr, nvidiacomv1beta1.DGDRPhasePending, MessageInitialized)
} }
// handlePendingState starts the profiling process logger.Info("Handling pending phase", "name", dgdr.Name)
func (r *DynamoGraphDeploymentRequestReconciler) handlePendingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx)
logger.Info("Handling pending state", "name", dgdr.Name)
// Create profiling job (online or AIC) // Create profiling job (online or AIC)
if err := r.createProfilingJob(ctx, dgdr); err != nil { if err := r.createProfilingJob(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonProfilingJobFailed, err.Error()) r.Recorder.Event(dgdr, corev1.EventTypeWarning, nvidiacomv1beta1.EventReasonProfilingJobFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, MessageJobCreationFailed, err.Error()) return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, nvidiacomv1beta1.ConditionTypeProfiling, metav1.ConditionFalse, MessageJobCreationFailed, err.Error())
} }
// Record event with appropriate message // Record event with appropriate message
if isOnlineProfiling(dgdr) { if isOnlineProfiling(dgdr) {
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonProfilingJobCreated, MessageProfilingJobCreated) r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonProfilingJobCreated, MessageProfilingJobCreated)
} else { } else {
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonProfilingJobCreated, MessageAICProfilingJobCreated) r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonProfilingJobCreated, MessageAICProfilingJobCreated)
} }
// Update to Profiling state with Running status // Update to Profiling phase with Running status
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateProfiling, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingRunning", MessageProfilingInProgress) dgdr.SetProfilingPhase(nvidiacomv1beta1.ProfilingPhaseInitializing)
return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseProfiling, nvidiacomv1beta1.ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingRunning", MessageProfilingInProgress)
} }
// handleProfilingState monitors profiling progress and generates spec when complete // handleProfilingPhase monitors profiling progress and generates spec when complete
func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) { func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("Handling profiling state", "name", dgdr.Name) logger.Info("Handling profiling phase", "name", dgdr.Name)
// Check profiling job status (both online and offline/AIC run as Jobs) // Check profiling job status (both online and offline/AIC run as Jobs)
// Note: We watch the Job via Owns(), so we'll be triggered automatically on Job changes // Note: We watch the Job via Owns(), so we'll be triggered automatically on Job changes
completed, err := r.checkProfilingJobStatus(ctx, dgdr) completed, err := r.checkProfilingJobStatus(ctx, dgdr)
if err != nil { if err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageProfilingCheckFailed, err.Error()) r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageProfilingCheckFailed, err.Error())
// Job failed - transition to Failed state // Job failed - clear profiling sub-phase and transition to Failed
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingFailed", err.Error()) dgdr.ClearProfilingPhase()
return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, nvidiacomv1beta1.ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingFailed", err.Error())
} }
if !completed { if !completed {
...@@ -466,9 +411,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex ...@@ -466,9 +411,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
return ctrl.Result{}, nil return ctrl.Result{}, nil
} }
// Profiling complete — clear the profiling sub-phase
dgdr.ClearProfilingPhase()
// Mark profiling as completed successfully // Mark profiling as completed successfully
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{ meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeProfiling, Type: nvidiacomv1beta1.ConditionTypeProfiling,
Status: metav1.ConditionTrue, Status: metav1.ConditionTrue,
ObservedGeneration: dgdr.Generation, ObservedGeneration: dgdr.Generation,
Reason: "ProfilingCompleted", Reason: "ProfilingCompleted",
...@@ -478,18 +426,16 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex ...@@ -478,18 +426,16 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
// Retrieve profiling results and generate spec // Retrieve profiling results and generate spec
if err := r.generateDGDSpec(ctx, dgdr); err != nil { if err := r.generateDGDSpec(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageGenerationFailed, err.Error()) r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageGenerationFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, err.Error()) return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, nvidiacomv1beta1.ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, err.Error())
} }
// Record spec generation event // Record spec generation event
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonSpecGenerated, MessageSpecGenerated) r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonSpecGenerated, MessageSpecGenerated)
// Create additional resources (ConfigMaps) immediately after profiling // Create additional resources (ConfigMaps) immediately after profiling
// This ensures that the `planner-profile-data` ConfigMap is available for both auto and manual deployment // This ensures that the `planner-profile-data` ConfigMap is available for both auto and manual deployment
// v1beta1 uses the DGDR namespace for additional resources.
targetNamespace := dgdr.Namespace targetNamespace := dgdr.Namespace
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.Namespace != "" {
targetNamespace = dgdr.Spec.DeploymentOverrides.Namespace
}
if err := r.createAdditionalResources(ctx, dgdr, targetNamespace); err != nil { if err := r.createAdditionalResources(ctx, dgdr, targetNamespace); err != nil {
logger.Error(err, "Failed to create additional resources after profiling") logger.Error(err, "Failed to create additional resources after profiling")
// Don't fail the DGDR, just log the error - ConfigMaps can be created manually // Don't fail the DGDR, just log the error - ConfigMaps can be created manually
...@@ -497,31 +443,48 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex ...@@ -497,31 +443,48 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
fmt.Sprintf("Failed to create ConfigMaps from profiling output: %v", err)) fmt.Sprintf("Failed to create ConfigMaps from profiling output: %v", err))
} }
// If autoApply is enabled, transition to Deploying state // If autoApply is enabled, transition to Deploying phase
if dgdr.Spec.AutoApply { if dgdr.Spec.AutoApply {
logger.Info("AutoApply enabled, transitioning to Deploying state") logger.Info("AutoApply enabled, transitioning to Deploying phase")
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateDeploying, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecGenerated) return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseDeploying, nvidiacomv1beta1.ConditionTypeSpecGenerated, metav1.ConditionTrue, nvidiacomv1beta1.EventReasonSpecGenerated, MessageSpecGenerated)
} }
// Otherwise, transition to Ready state // Otherwise, transition to Ready phase
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateReady, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecAvailable) return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseReady, nvidiacomv1beta1.ConditionTypeSpecGenerated, metav1.ConditionTrue, nvidiacomv1beta1.EventReasonSpecGenerated, MessageSpecAvailable)
} }
// handleReadyState handles DGDR in Ready state // handleReadyPhase handles DGDR in Ready phase (profiling complete, spec available)
func (r *DynamoGraphDeploymentRequestReconciler) handleReadyState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) { func (r *DynamoGraphDeploymentRequestReconciler) handleReadyPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("DGDR is ready", "name", dgdr.Name) logger.Info("DGDR is ready", "name", dgdr.Name)
// If autoApply is not enabled, nothing to monitor // Nothing to monitor in Ready phase - spec is available for manual application
if !dgdr.Spec.AutoApply {
return ctrl.Result{}, nil return ctrl.Result{}, nil
}
// handleDeployingPhase handles DGD creation and monitors deployment
func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx)
logger.Info("Handling deploying phase", "name", dgdr.Name)
if !dgdr.Spec.AutoApply {
// Shouldn't be in this phase without autoApply
logger.Info("AutoApply not enabled, transitioning to Ready")
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseReady
setSucceededCondition(dgdr, nvidiacomv1beta1.DGDRPhaseReady)
return ctrl.Result{}, r.Status().Update(ctx, dgdr)
} }
// Check if DGD still exists and monitor its status // Check if we need to create DGD
dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{} if dgdr.Status.DGDName == "" {
return r.createDGD(ctx, dgdr)
}
// DGD was already created, check its status
dgd := &dgdv1alpha1.DynamoGraphDeployment{}
err := r.Get(ctx, types.NamespacedName{ err := r.Get(ctx, types.NamespacedName{
Name: dgdr.Status.Deployment.Name, Name: dgdr.Status.DGDName,
Namespace: dgdr.Status.Deployment.Namespace, Namespace: dgdr.Namespace,
}, dgd) }, dgd)
if apierrors.IsNotFound(err) { if apierrors.IsNotFound(err) {
...@@ -533,52 +496,50 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleReadyState(ctx context.Co ...@@ -533,52 +496,50 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleReadyState(ctx context.Co
return ctrl.Result{}, err return ctrl.Result{}, err
} }
// Update deployment status // Check if DGD is Ready
dgdr.Status.Deployment.State = dgd.Status.State var condStatus metav1.ConditionStatus
var condReason, condMessage string
// Check if DGD degraded from Ready if dgd.Status.State == dgdv1alpha1.DGDStateSuccessful {
if dgd.Status.State != nvidiacomv1alpha1.DGDStateSuccessful { logger.Info("DGD is Ready, transitioning to Deployed phase")
logger.Info("DGD degraded, transitioning back to Deploying", dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseDeployed
"dgdState", dgd.Status.State) setSucceededCondition(dgdr, nvidiacomv1beta1.DGDRPhaseDeployed)
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateDeploying r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonDeploymentReady,
fmt.Sprintf(MessageDeploymentReady, dgd.Name))
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDegraded, condStatus = metav1.ConditionTrue
fmt.Sprintf(MessageDeploymentDegraded, dgd.Name, string(dgd.Status.State))) condReason = nvidiacomv1beta1.EventReasonDeploymentReady
condMessage = fmt.Sprintf(MessageDeploymentReady, dgd.Name)
} else {
logger.Info("DGD not yet ready", "name", dgd.Name, "state", dgd.Status.State)
condStatus = metav1.ConditionFalse
condReason = "DeploymentInProgress"
condMessage = fmt.Sprintf("DGD %s is in %s state", dgd.Name, string(dgd.Status.State))
}
updateDeploymentInfo(dgdr, dgd)
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{ meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeDeploymentReady, Type: nvidiacomv1beta1.ConditionTypeDeploymentReady,
Status: metav1.ConditionFalse, Status: condStatus,
Reason: EventReasonDeploymentDegraded, Reason: condReason,
Message: fmt.Sprintf("Deployment degraded to %s", string(dgd.Status.State)), Message: condMessage,
}) })
}
return ctrl.Result{}, r.Status().Update(ctx, dgdr) return ctrl.Result{}, r.Status().Update(ctx, dgdr)
} }
// handleDeployingState handles DGD creation and monitors deployment // handleDeployedPhase monitors a healthy DGD and detects degradation or deletion
func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) { func (r *DynamoGraphDeploymentRequestReconciler) handleDeployedPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("Handling deploying state", "name", dgdr.Name) logger.Info("DGDR is deployed", "name", dgdr.Name)
if !dgdr.Spec.AutoApply { // Check if DGD still exists and monitor its status
// Shouldn't be in this state without autoApply dgd := &dgdv1alpha1.DynamoGraphDeployment{}
logger.Info("AutoApply not enabled, transitioning to Ready")
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady
return ctrl.Result{}, r.Status().Update(ctx, dgdr)
}
// Check if we need to create DGD
if dgdr.Status.Deployment == nil || !dgdr.Status.Deployment.Created {
return r.createDGD(ctx, dgdr)
}
// DGD was already created, check its status
dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
err := r.Get(ctx, types.NamespacedName{ err := r.Get(ctx, types.NamespacedName{
Name: dgdr.Status.Deployment.Name, Name: dgdr.Status.DGDName,
Namespace: dgdr.Status.Deployment.Namespace, Namespace: dgdr.Namespace,
}, dgd) }, dgd)
if apierrors.IsNotFound(err) { if apierrors.IsNotFound(err) {
...@@ -590,51 +551,54 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx contex ...@@ -590,51 +551,54 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx contex
return ctrl.Result{}, err return ctrl.Result{}, err
} }
// Update deployment status // Check if DGD degraded from Ready
dgdr.Status.Deployment.State = dgd.Status.State if dgd.Status.State != dgdv1alpha1.DGDStateSuccessful {
logger.Info("DGD degraded, transitioning back to Deploying",
"dgdState", dgd.Status.State)
// Check if DGD is Ready dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseDeploying
if dgd.Status.State == nvidiacomv1alpha1.DGDStateSuccessful { setSucceededCondition(dgdr, nvidiacomv1beta1.DGDRPhaseDeploying)
logger.Info("DGD is Ready, transitioning to Ready state") updateDeploymentInfo(dgdr, dgd)
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonDeploymentReady, r.Recorder.Event(dgdr, corev1.EventTypeWarning, nvidiacomv1beta1.EventReasonDeploymentDegraded,
fmt.Sprintf(MessageDeploymentReady, dgd.Name)) fmt.Sprintf(MessageDeploymentDegraded, dgd.Name, string(dgd.Status.State)))
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{ meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeDeploymentReady, Type: nvidiacomv1beta1.ConditionTypeDeploymentReady,
Status: metav1.ConditionTrue, Status: metav1.ConditionFalse,
Reason: EventReasonDeploymentReady, Reason: nvidiacomv1beta1.EventReasonDeploymentDegraded,
Message: fmt.Sprintf(MessageDeploymentReady, dgd.Name), Message: fmt.Sprintf("Deployment degraded to %s", string(dgd.Status.State)),
}) })
} else {
// DGD is healthy — update replica info only if changed
if !updateDeploymentInfo(dgdr, dgd) {
// Nothing changed, skip the status write
return ctrl.Result{}, nil
}
} }
return ctrl.Result{}, r.Status().Update(ctx, dgdr) return ctrl.Result{}, r.Status().Update(ctx, dgdr)
} }
// handleDeploymentDeletedState is a terminal state for when auto-created DGD is deleted // handleDGDDeleted handles the case when auto-created DGD is deleted by user.
func (r *DynamoGraphDeploymentRequestReconciler) handleDeploymentDeletedState(_ context.Context, _ *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) { // In v1beta1, this transitions to Failed (DeploymentDeleted phase was removed).
// Terminal state - nothing to do func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
// User must delete this DGDR and create a new one to redeploy
return ctrl.Result{}, nil
}
// handleDGDDeleted handles the case when auto-created DGD is deleted by user
func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("DGD was deleted by user, transitioning to DeploymentDeleted state") logger.Info("DGD was deleted by user, transitioning to Failed phase")
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateDeploymentDeleted dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseFailed
setSucceededCondition(dgdr, nvidiacomv1beta1.DGDRPhaseFailed)
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDeleted, r.Recorder.Event(dgdr, corev1.EventTypeWarning, nvidiacomv1beta1.EventReasonDeploymentDeleted,
fmt.Sprintf(MessageDeploymentDeleted, dgdr.Status.Deployment.Name)) fmt.Sprintf(MessageDeploymentDeleted, dgdr.Status.DGDName))
dgdr.Status.Deployment = nil dgdr.Status.DGDName = ""
dgdr.Status.DeploymentInfo = nil
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{ meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeDeploymentReady, Type: nvidiacomv1beta1.ConditionTypeDeploymentReady,
Status: metav1.ConditionFalse, Status: metav1.ConditionFalse,
Reason: EventReasonDeploymentDeleted, Reason: nvidiacomv1beta1.EventReasonDeploymentDeleted,
Message: "Deployment was deleted by user. Create a new DGDR to redeploy.", Message: "Deployment was deleted by user. Create a new DGDR to redeploy.",
}) })
...@@ -642,45 +606,24 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Co ...@@ -642,45 +606,24 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Co
} }
// createDGD creates a DynamoGraphDeployment with the generated spec // createDGD creates a DynamoGraphDeployment with the generated spec
func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) { func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
// Extract DGD from RawExtension // Extract DGD spec from annotation (stored by generateDGDSpec)
if dgdr.Status.GeneratedDeployment == nil { dgdSpecYAML, ok := dgdr.Annotations["nvidia.com/generated-dgd-spec"]
return ctrl.Result{}, fmt.Errorf("generatedDeployment is not set") if !ok || dgdSpecYAML == "" {
return ctrl.Result{}, fmt.Errorf("generated DGD spec not found in annotation nvidia.com/generated-dgd-spec")
} }
generatedDGD := &nvidiacomv1alpha1.DynamoGraphDeployment{} generatedDGD := &dgdv1alpha1.DynamoGraphDeployment{}
if err := yaml.Unmarshal([]byte(dgdSpecYAML), generatedDGD); err != nil {
// RawExtension can have either Object (already decoded) or Raw (JSON bytes) return ctrl.Result{}, fmt.Errorf("failed to unmarshal generated deployment from annotation: %w", err)
if dgdr.Status.GeneratedDeployment.Object != nil {
var ok bool
generatedDGD, ok = dgdr.Status.GeneratedDeployment.Object.(*nvidiacomv1alpha1.DynamoGraphDeployment)
if !ok {
return ctrl.Result{}, fmt.Errorf("generatedDeployment.Object is not a DynamoGraphDeployment")
}
} else if dgdr.Status.GeneratedDeployment.Raw != nil {
if err := yaml.Unmarshal(dgdr.Status.GeneratedDeployment.Raw, generatedDGD); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to unmarshal generated deployment: %w", err)
}
} else {
return ctrl.Result{}, fmt.Errorf("generatedDeployment has neither Object nor Raw set")
} }
// Determine DGD name and namespace // Determine DGD name and namespace from generated deployment
dgdName := generatedDGD.Name dgdName := generatedDGD.Name
dgdNamespace := dgdr.Namespace dgdNamespace := dgdr.Namespace
// Apply deployment overrides
if dgdr.Spec.DeploymentOverrides != nil {
if dgdr.Spec.DeploymentOverrides.Name != "" {
dgdName = dgdr.Spec.DeploymentOverrides.Name
}
if dgdr.Spec.DeploymentOverrides.Namespace != "" {
dgdNamespace = dgdr.Spec.DeploymentOverrides.Namespace
}
}
// Build labels (start with generated DGD's labels) // Build labels (start with generated DGD's labels)
labels := make(map[string]string) labels := make(map[string]string)
if generatedDGD.Labels != nil { if generatedDGD.Labels != nil {
...@@ -689,16 +632,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, ...@@ -689,16 +632,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
} }
} }
// Add/override with managed labels // Add/override with managed labels
labels[LabelDGDRName] = dgdr.Name labels[nvidiacomv1beta1.LabelDGDRName] = dgdr.Name
labels[LabelDGDRNamespace] = dgdr.Namespace labels[nvidiacomv1beta1.LabelDGDRNamespace] = dgdr.Namespace
labels[LabelManagedBy] = LabelValueDynamoOperator labels[nvidiacomv1beta1.LabelManagedBy] = nvidiacomv1beta1.LabelValueDynamoOperator
// Merge custom labels from overrides
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.Labels != nil {
for k, v := range dgdr.Spec.DeploymentOverrides.Labels {
labels[k] = v
}
}
// Build annotations (start with generated DGD's annotations) // Build annotations (start with generated DGD's annotations)
annotations := make(map[string]string) annotations := make(map[string]string)
...@@ -707,15 +643,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, ...@@ -707,15 +643,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
annotations[k] = v annotations[k] = v
} }
} }
// Merge custom annotations from overrides
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.Annotations != nil {
for k, v := range dgdr.Spec.DeploymentOverrides.Annotations {
annotations[k] = v
}
}
// Create DGD from generated deployment // Create DGD from generated deployment
dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{ dgd := &dgdv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdName, Name: dgdName,
Namespace: dgdNamespace, Namespace: dgdNamespace,
...@@ -735,12 +665,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, ...@@ -735,12 +665,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
if apierrors.IsAlreadyExists(err) { if apierrors.IsAlreadyExists(err) {
// DGD already exists, just update status // DGD already exists, just update status
logger.Info("DGD already exists, updating status") logger.Info("DGD already exists, updating status")
dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{ dgdr.Status.DGDName = dgdName
Name: dgdName,
Namespace: dgdNamespace,
State: nvidiacomv1alpha1.DGDStatePending,
Created: true,
}
return ctrl.Result{}, r.Status().Update(ctx, dgdr) return ctrl.Result{}, r.Status().Update(ctx, dgdr)
} }
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageDeploymentCreationFailed, err.Error()) r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageDeploymentCreationFailed, err.Error())
...@@ -748,20 +673,15 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, ...@@ -748,20 +673,15 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
} }
// Update status // Update status
dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{ dgdr.Status.DGDName = dgdName
Name: dgdName,
Namespace: dgdNamespace,
State: nvidiacomv1alpha1.DGDStatePending,
Created: true,
}
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonDeploymentCreated, r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonDeploymentCreated,
fmt.Sprintf(MessageDeploymentCreated, dgdName)) fmt.Sprintf(MessageDeploymentCreated, dgdName))
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{ meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeDeploymentReady, Type: nvidiacomv1beta1.ConditionTypeDeploymentReady,
Status: metav1.ConditionFalse, Status: metav1.ConditionFalse,
Reason: EventReasonDeploymentCreated, Reason: nvidiacomv1beta1.EventReasonDeploymentCreated,
Message: fmt.Sprintf("DGD %s created, waiting for Ready", dgdName), Message: fmt.Sprintf("DGD %s created, waiting for Ready", dgdName),
}) })
...@@ -771,7 +691,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, ...@@ -771,7 +691,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
} }
// createAdditionalResources creates ConfigMaps from the profiling output that should be deployed alongside the DGD // createAdditionalResources creates ConfigMaps from the profiling output that should be deployed alongside the DGD
func (r *DynamoGraphDeploymentRequestReconciler) createAdditionalResources(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, targetNamespace string) error { func (r *DynamoGraphDeploymentRequestReconciler) createAdditionalResources(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, targetNamespace string) error {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
// Check if there are additional resources stored in annotations // Check if there are additional resources stored in annotations
...@@ -821,9 +741,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createAdditionalResources(ctx c ...@@ -821,9 +741,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createAdditionalResources(ctx c
if cm.Labels == nil { if cm.Labels == nil {
cm.Labels = make(map[string]string) cm.Labels = make(map[string]string)
} }
cm.Labels[LabelDGDRName] = dgdr.Name cm.Labels[nvidiacomv1beta1.LabelDGDRName] = dgdr.Name
cm.Labels[LabelDGDRNamespace] = dgdr.Namespace cm.Labels[nvidiacomv1beta1.LabelDGDRNamespace] = dgdr.Namespace
cm.Labels[LabelManagedBy] = LabelValueDynamoOperator cm.Labels[nvidiacomv1beta1.LabelManagedBy] = nvidiacomv1beta1.LabelValueDynamoOperator
// Create the ConfigMap // Create the ConfigMap
if err := r.Create(ctx, cm); err != nil { if err := r.Create(ctx, cm); err != nil {
...@@ -844,168 +764,89 @@ func (r *DynamoGraphDeploymentRequestReconciler) createAdditionalResources(ctx c ...@@ -844,168 +764,89 @@ func (r *DynamoGraphDeploymentRequestReconciler) createAdditionalResources(ctx c
return nil return nil
} }
// handleFailedState handles DGDR in Failed state // handleFailedPhase handles DGDR in Failed phase
func (r *DynamoGraphDeploymentRequestReconciler) handleFailedState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) { func (r *DynamoGraphDeploymentRequestReconciler) handleFailedPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("DGDR is in failed state", "name", dgdr.Name) logger.Info("DGDR is in failed phase", "name", dgdr.Name)
// Could implement retry logic here if desired // Could implement retry logic here if desired
return ctrl.Result{}, nil return ctrl.Result{}, nil
} }
// getProfilingJobName returns the job name for a DGDR // getProfilingJobName returns the job name for a DGDR
func getProfilingJobName(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) string { func getProfilingJobName(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) string {
// Use "profile-" prefix for all profiling jobs // Use "profile-" prefix for all profiling jobs
return fmt.Sprintf("profile-%s", dgdr.Name) return fmt.Sprintf("profile-%s", dgdr.Name)
} }
// getOutputConfigMapName returns the ConfigMap name for profiling output // getOutputConfigMapName returns the ConfigMap name for profiling output
func getOutputConfigMapName(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) string { func getOutputConfigMapName(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) string {
return fmt.Sprintf("%s%s", ConfigMapOutputPrefix, dgdr.Name) return fmt.Sprintf("%s%s", ConfigMapOutputPrefix, dgdr.Name)
} }
// isOnlineProfiling determines whether online profiling or AI Configurator is being used // isOnlineProfiling returns true. In v1beta1, the profiler decides online vs AIC
// based on the sweep.use_ai_configurator config value // mode internally based on its config. The controller always uses the same label.
func isOnlineProfiling(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) bool { func isOnlineProfiling(_ *nvidiacomv1beta1.DynamoGraphDeploymentRequest) bool {
if dgdr.Spec.ProfilingConfig.Config == nil {
return true
}
var config map[string]interface{}
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
return true // Default to online on parse error
}
if sweep, ok := config["sweep"].(map[string]interface{}); ok {
// Check camelCase first (preferred), then snake_case (backwards compat)
if useAIC, exists := sweep["useAiConfigurator"].(bool); exists {
return !useAIC
}
if useAIC, exists := sweep["use_ai_configurator"].(bool); exists {
return !useAIC
}
}
// Default to online profiling if not specified
return true return true
} }
// validateSpec validates the DGDR spec // validateSpec validates the DGDR spec
func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error { func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
// Validate ConfigMap if provided (for the DGD base config) var errs []error
// This requires cluster access and cannot be done in the stateless validator
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil { // Validate image is specified (required for the profiling job container).
cm := &corev1.ConfigMap{} // Mirrors the webhook admission check so controller-side writes cannot bypass it.
err := r.Get(ctx, types.NamespacedName{ if dgdr.Spec.Image == "" {
Name: dgdr.Spec.ProfilingConfig.ConfigMapRef.Name, errs = append(errs, fmt.Errorf("spec.image is required"))
Namespace: dgdr.Namespace, }
}, cm)
// Disallow searchStrategy: thorough with backend: auto.
if err != nil { // Mirrors the webhook admission check so controller-side writes cannot bypass it.
if apierrors.IsNotFound(err) { if dgdr.Spec.SearchStrategy == nvidiacomv1beta1.SearchStrategyThorough &&
return fmt.Errorf(MessageConfigMapNotFound, dgdr.Spec.Backend == nvidiacomv1beta1.BackendTypeAuto {
dgdr.Spec.ProfilingConfig.ConfigMapRef.Name, dgdr.Namespace) errs = append(errs, fmt.Errorf(
} "spec.searchStrategy %q is incompatible with spec.backend %q: set spec.backend to a specific backend (sglang, trtllm, or vllm)",
return err nvidiacomv1beta1.SearchStrategyThorough,
} nvidiacomv1beta1.BackendTypeAuto,
))
// Validate key exists
key := dgdr.Spec.ProfilingConfig.ConfigMapRef.Key
if key == "" {
key = "disagg.yaml"
}
if _, exists := cm.Data[key]; !exists {
return fmt.Errorf(MessageConfigMapKeyNotFound, key, cm.Name)
}
} }
// Validate model cache PVC if provided // Validate model cache PVC if provided
modelCachePVC, _ := extractModelCachePVCConfig(dgdr) if dgdr.Spec.ModelCache != nil && dgdr.Spec.ModelCache.PVCName != "" {
if modelCachePVC != "" {
pvc := &corev1.PersistentVolumeClaim{} pvc := &corev1.PersistentVolumeClaim{}
err := r.Get(ctx, types.NamespacedName{ err := r.Get(ctx, types.NamespacedName{
Name: modelCachePVC, Name: dgdr.Spec.ModelCache.PVCName,
Namespace: dgdr.Namespace, Namespace: dgdr.Namespace,
}, pvc) }, pvc)
if err != nil { if err != nil {
if apierrors.IsNotFound(err) { if apierrors.IsNotFound(err) {
return fmt.Errorf(MessageModelCachePVCNotFound, modelCachePVC, dgdr.Namespace) errs = append(errs, fmt.Errorf(MessageModelCachePVCNotFound, dgdr.Spec.ModelCache.PVCName, dgdr.Namespace))
} } else {
return err return err
} }
} }
}
if err := r.validateGPUHardwareInfo(ctx, dgdr); err != nil { if err := r.validateGPUHardwareInfo(ctx, dgdr); err != nil {
return err errs = append(errs, err)
} }
// The profiler will validate the rest of the configuration // The profiler will validate the rest of the configuration
return nil return errors.Join(errs...)
}
// toFloat64 converts a numeric value (int or float64) to float64.
// Returns 0 if the value is neither int nor float64.
func toFloat64(val interface{}) float64 {
switch v := val.(type) {
case float64:
return v
case int:
return float64(v)
default:
return 0
}
} }
// validateGPUHardwareInfo ensures GPU hardware information is available when required for profiling // validateGPUHardwareInfo ensures GPU hardware information is available when required for profiling
func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error { func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
// Check for hardware info and GPU ranges // Check if user provided hardware info in the typed spec
// TODO: will be cleaner once we swap to new DGDR schema (#6130) hasManualConfig := dgdr.Spec.Hardware != nil && (dgdr.Spec.Hardware.GPUSKU != "" ||
var config map[string]interface{} dgdr.Spec.Hardware.VRAMMB != nil ||
if dgdr.Spec.ProfilingConfig.Config != nil { dgdr.Spec.Hardware.NumGPUsPerNode != nil)
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught later, skip validation here
return nil
}
} else {
config = make(map[string]interface{})
}
hardwareVal, hasHardware := config[ConfigKeyHardware]
var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil {
if hardwareConfig, ok := hardwareVal.(map[string]interface{}); ok {
_, hasGPUModel := hardwareConfig[ConfigKeyGPUModel]
_, hasGPUVram := hardwareConfig[ConfigKeyGPUVramMib]
_, hasNumGPUs := hardwareConfig[ConfigKeyNumGpusPerNode]
hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
}
}
var hasExplicitGPURanges bool
if engineVal, hasEngine := config[ConfigKeyEngine]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]interface{}); ok {
minGPUs, hasMin := engineConfig[ConfigKeyMinNumGpusPerEng]
maxGPUs, hasMax := engineConfig[ConfigKeyMaxNumGpusPerEng]
if hasMin && hasMax {
minVal := toFloat64(minGPUs)
maxVal := toFloat64(maxGPUs)
// Validate that min <= max // If manual config is provided, validation passes
if minVal > maxVal { if hasManualConfig {
return fmt.Errorf("invalid GPU range: %s (%v) cannot be greater than %s (%v)",
ConfigKeyMinNumGpusPerEng, minVal, ConfigKeyMaxNumGpusPerEng, maxVal)
}
hasExplicitGPURanges = minVal > 0 && maxVal > 0
}
}
}
// If manual config or explicit ranges are provided, validation passes
if hasManualHardwareConfig || hasExplicitGPURanges {
return nil return nil
} }
...@@ -1019,37 +860,22 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con ...@@ -1019,37 +860,22 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
isNamespaceScoped := r.Config.Namespace.Restricted != "" isNamespaceScoped := r.Config.Namespace.Restricted != ""
if isNamespaceScoped { if isNamespaceScoped {
tmpl := template.Must(template.New("nsGPUErr").Parse( return fmt.Errorf(
`GPU hardware info required but cannot be auto-discovered.` + "GPU hardware info required but cannot be auto-discovered." +
"\n\nOptions to resolve:" + "\n\nOptions to resolve:" +
"\n\n1. Re-enable GPU discovery (if it was disabled during Helm install):" + "\n\n1. Re-enable GPU discovery (if it was disabled during Helm install):" +
"\n helm upgrade ... --set dynamo-operator.gpuDiscovery.enabled=true" + "\n helm upgrade ... --set dynamo-operator.gpuDiscovery.enabled=true" +
"\n\n2. Add hardware config to profilingConfig.config.{{.Hardware}}:" + "\n\n2. Add hardware config to spec.hardware:" +
"\n {{.NumGPUs}}: 8" + "\n numGpusPerNode: 8" +
"\n {{.GPUModel}}: \"H100-SXM5-80GB\"" + "\n gpuSku: \"H100-SXM5-80GB\"" +
"\n {{.GPUVram}}: 81920" + "\n vramMb: 81920")
"\n\n3. Or specify {{.Engine}}.{{.MinGPUs}} and {{.Engine}}.{{.MaxGPUs}} for explicit GPU search ranges.",
))
var buf bytes.Buffer
_ = tmpl.Execute(&buf, map[string]string{
"Hardware": ConfigKeyHardware,
"NumGPUs": ConfigKeyNumGpusPerNode,
"GPUModel": ConfigKeyGPUModel,
"GPUVram": ConfigKeyGPUVramMib,
"Engine": ConfigKeyEngine,
"MinGPUs": ConfigKeyMinNumGpusPerEng,
"MaxGPUs": ConfigKeyMaxNumGpusPerEng,
})
return fmt.Errorf("%s", buf.String())
} }
return fmt.Errorf("GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s", return fmt.Errorf("GPU hardware info required but auto-discovery failed. Add spec.hardware.gpuSku, spec.hardware.vramMb, spec.hardware.numGpusPerNode")
ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib,
ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng)
} }
// createProfilingJob creates a Kubernetes Job for profiling using SyncResource // createProfilingJob creates a Kubernetes Job for profiling using SyncResource
func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error { func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
// Delete any existing output ConfigMap to ensure fresh profiling results // Delete any existing output ConfigMap to ensure fresh profiling results
...@@ -1087,21 +913,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1087,21 +913,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
} }
} }
// Run GPU discovery before creating job (cluster-wide and namespace-restricted operators if they have node read permissions) // Enrich hardware from GPU discovery before marshalling the spec.
var gpuInfo *gpu.GPUInfo // This fills in gpuSku, vramMb, numGpusPerNode if the user didn't set them.
logger.Info("Attempting GPU discovery for profiling job") if err := r.enrichHardwareFromDiscovery(ctx, dgdr); err != nil {
discoveredInfo, err := gpu.DiscoverGPUs(ctx, r.Client) logger.Info("GPU discovery not available, proceeding without enrichment", "reason", err.Error())
if err != nil {
// This path is expected for namespace-restricted operators without node read permissions
logger.Info("GPU discovery not available, using manual hardware configuration from profiling config",
"reason", err.Error())
} else {
gpuInfo = discoveredInfo
logger.Info("GPU discovery completed successfully",
"gpusPerNode", gpuInfo.GPUsPerNode,
"model", gpuInfo.Model,
"vramMiB", gpuInfo.VRAMPerGPU,
"system", gpuInfo.System)
} }
// Use SyncResource to create/update the job // Use SyncResource to create/update the job
...@@ -1109,7 +924,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1109,7 +924,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
jobName := getProfilingJobName(dgdr) jobName := getProfilingJobName(dgdr)
outputConfigMapName := getOutputConfigMapName(dgdr) outputConfigMapName := getOutputConfigMapName(dgdr)
configYAML, err := r.prepareProfilingConfig(dgdr, gpuInfo) // Marshal the DGDR spec to JSON — the profiler receives the spec verbatim
specJSON, err := marshalDGDRSpec(dgdr)
if err != nil { if err != nil {
return nil, false, err return nil, false, err
} }
...@@ -1158,16 +974,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1158,16 +974,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}, },
} }
// Add ConfigMap volume mount if provided // Add model cache PVC mount if configured
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
volumeMounts = append(volumeMounts, corev1.VolumeMount{
Name: VolumeNameProfilingConfig,
MountPath: ProfilingConfigPath,
ReadOnly: true,
})
}
// Add model cache PVC mount if configured in profilingConfig.config.deployment
modelCachePVC, modelCacheMountPath := extractModelCachePVCConfig(dgdr) modelCachePVC, modelCacheMountPath := extractModelCachePVCConfig(dgdr)
if modelCachePVC != "" { if modelCachePVC != "" {
logger.Info("Mounting model cache PVC to profiler pod", "pvc", modelCachePVC, "mountPath", modelCacheMountPath) logger.Info("Mounting model cache PVC to profiler pod", "pvc", modelCachePVC, "mountPath", modelCacheMountPath)
...@@ -1178,29 +985,32 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1178,29 +985,32 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}) })
} }
// Profiler args: pass the config as an inline YAML string via --profile-config // v1alpha1 round-trip: mount ConfigMap if referenced via annotation
profilerArgs := []string{ cmRef := configMapRefFromAnnotation(dgdr)
"--profile-config", string(configYAML), if cmRef != nil {
volumeMounts = append(volumeMounts, corev1.VolumeMount{
Name: VolumeNameProfilingConfig,
MountPath: ProfilingConfigMountPath,
ReadOnly: true,
})
} }
// Use profiler image from profilingConfig // Profiler args: pass the DGDR spec as JSON via --config
imageName := dgdr.Spec.ProfilingConfig.ProfilerImage profilerArgs := []string{"--config", specJSON}
// Use image from spec
imageName := dgdr.Spec.Image
logger.Info("Using profiler image", "image", imageName) logger.Info("Using profiler image", "image", imageName)
profilerContainer := corev1.Container{ profilerContainer := corev1.Container{
Name: ContainerNameProfiler, Name: ContainerNameProfiler,
Image: imageName, Image: imageName,
Command: []string{"python", "-m", "dynamo.profiler.profile_sla"}, Command: []string{"python", "-m", "dynamo.profiler"},
Args: profilerArgs, Args: profilerArgs,
Env: profilerEnv, Env: profilerEnv,
VolumeMounts: volumeMounts, VolumeMounts: volumeMounts,
} }
// Apply resource requirements if specified in the DGDR
if dgdr.Spec.ProfilingConfig.Resources != nil {
profilerContainer.Resources = *dgdr.Spec.ProfilingConfig.Resources
}
// Generate sidecar script from template // Generate sidecar script from template
tmpl, err := template.New("sidecar").Parse(sidecarScriptTemplate) tmpl, err := template.New("sidecar").Parse(sidecarScriptTemplate)
if err != nil { if err != nil {
...@@ -1232,15 +1042,16 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1232,15 +1042,16 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}}, }},
} }
// Use PVC if specified, otherwise use emptyDir for profiling output // Use PVC for profiling output if round-tripped v1alpha1 annotation is present,
// otherwise use emptyDir (v1beta1 default).
var profilingOutputVolume corev1.Volume var profilingOutputVolume corev1.Volume
if dgdr.Spec.ProfilingConfig.OutputPVC != "" { if outputPVC := outputPVCFromAnnotation(dgdr); outputPVC != "" {
logger.Info("Using PVC for profiling output", "pvc", dgdr.Spec.ProfilingConfig.OutputPVC) logger.Info("Using PVC for profiling output (from v1alpha1 annotation)", "pvc", outputPVC)
profilingOutputVolume = corev1.Volume{ profilingOutputVolume = corev1.Volume{
Name: VolumeNameProfilingOutput, Name: VolumeNameProfilingOutput,
VolumeSource: corev1.VolumeSource{ VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: dgdr.Spec.ProfilingConfig.OutputPVC, ClaimName: outputPVC,
}, },
}, },
} }
...@@ -1254,59 +1065,52 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1254,59 +1065,52 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
} }
volumes := []corev1.Volume{profilingOutputVolume} volumes := []corev1.Volume{profilingOutputVolume}
// Add ConfigMap volume if provided // Add model cache PVC volume if configured
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil { if modelCachePVC != "" {
key := dgdr.Spec.ProfilingConfig.ConfigMapRef.Key volumes = append(volumes, corev1.Volume{
if key == "" { Name: VolumeNameModelCache,
key = ProfilingConfigFile VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: modelCachePVC,
ReadOnly: true,
},
},
})
} }
// v1alpha1 round-trip: add ConfigMap volume if referenced via annotation
if cmRef != nil {
cmKey := cmRef.Key
if cmKey == "" {
cmKey = ProfilingConfigDefaultKey
}
volumes = append(volumes, corev1.Volume{ volumes = append(volumes, corev1.Volume{
Name: VolumeNameProfilingConfig, Name: VolumeNameProfilingConfig,
VolumeSource: corev1.VolumeSource{ VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{ LocalObjectReference: corev1.LocalObjectReference{
Name: dgdr.Spec.ProfilingConfig.ConfigMapRef.Name, Name: cmRef.Name,
}, },
Items: []corev1.KeyToPath{{ Items: []corev1.KeyToPath{{
Key: key, Key: cmKey,
Path: ProfilingConfigFile, Path: ProfilingConfigDefaultKey,
}}, }},
}, },
}, },
}) })
} }
// Add model cache PVC volume if configured
if modelCachePVC != "" {
volumes = append(volumes, corev1.Volume{
Name: VolumeNameModelCache,
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: modelCachePVC,
ReadOnly: true,
},
},
})
}
// Limit retries to prevent infinite loop // Limit retries to prevent infinite loop
backoffLimit := int32(3) backoffLimit := int32(3)
// Determine label based on whether AI Configurator is used
labelValue := LabelValueDynamoProfiler
if !isOnlineProfiling(dgdr) {
labelValue = LabelValueAICProfiler
}
podSpec := corev1.PodSpec{ podSpec := corev1.PodSpec{
ServiceAccountName: ServiceAccountProfilingJob, ServiceAccountName: ServiceAccountProfilingJob,
RestartPolicy: corev1.RestartPolicyNever, RestartPolicy: corev1.RestartPolicyNever,
SecurityContext: &corev1.PodSecurityContext{ SecurityContext: &corev1.PodSecurityContext{
RunAsNonRoot: ptr.To(true), // Enforces that container cannot run as root RunAsNonRoot: ptr.To(true),
RunAsUser: ptr.To[int64](1000), // Run as UID 1000 (non-privileged user) RunAsUser: ptr.To[int64](1000),
RunAsGroup: ptr.To[int64](1000), // Run with GID 1000 (non-privileged group) RunAsGroup: ptr.To[int64](1000),
FSGroup: ptr.To[int64](1000), // Volume files owned by GID 1000 FSGroup: ptr.To[int64](1000),
}, },
Containers: []corev1.Container{profilerContainer, sidecarContainer}, Containers: []corev1.Container{profilerContainer, sidecarContainer},
Volumes: volumes, Volumes: volumes,
...@@ -1315,24 +1119,14 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1315,24 +1119,14 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}, },
} }
// Apply tolerations if specified in the DGDR
if len(dgdr.Spec.ProfilingConfig.Tolerations) > 0 {
podSpec.Tolerations = dgdr.Spec.ProfilingConfig.Tolerations
}
// Apply nodeSelector if specified in the DGDR
if len(dgdr.Spec.ProfilingConfig.NodeSelector) > 0 {
podSpec.NodeSelector = dgdr.Spec.ProfilingConfig.NodeSelector
}
job := &batchv1.Job{ job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: jobName, Name: jobName,
Namespace: dgdr.Namespace, Namespace: dgdr.Namespace,
Labels: map[string]string{ Labels: map[string]string{
LabelApp: labelValue, nvidiacomv1beta1.LabelApp: nvidiacomv1beta1.LabelValueDynamoProfiler,
LabelDGDR: dgdr.Name, nvidiacomv1beta1.LabelDGDR: dgdr.Name,
LabelManagedBy: LabelValueDynamoOperator, nvidiacomv1beta1.LabelManagedBy: nvidiacomv1beta1.LabelValueDynamoOperator,
}, },
}, },
Spec: batchv1.JobSpec{ Spec: batchv1.JobSpec{
...@@ -1343,6 +1137,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1343,6 +1137,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}, },
} }
// Apply overrides from spec.overrides.profilingJob if provided
applyProfilingJobOverrides(job, dgdr)
return job, false, nil return job, false, nil
}) })
...@@ -1354,144 +1151,149 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1354,144 +1151,149 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
logger.Info("Profiling job created/updated", "job", job.Name) logger.Info("Profiling job created/updated", "job", job.Name)
} }
// Store the job name in status for observability
dgdr.Status.ProfilingJobName = job.Name
return nil return nil
} }
// prepareProfilingConfig parses and modifies the profiling config // applyProfilingJobOverrides applies user-specified overrides from
func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, gpuInfo *gpu.GPUInfo) ([]byte, error) { // spec.overrides.profilingJob to both the pod spec and job spec.
// Parse the profiling config from JSON func applyProfilingJobOverrides(job *batchv1.Job, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) {
var config map[string]interface{} if dgdr.Spec.Overrides == nil || dgdr.Spec.Overrides.ProfilingJob == nil {
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil { return
return nil, fmt.Errorf("failed to parse profiling config: %w", err)
} }
// Set deployment.namespace if not already set overrides := dgdr.Spec.Overrides.ProfilingJob
deploymentVal, hasDeployment := config[ConfigKeyDeployment] podSpec := &job.Spec.Template.Spec
var deploymentConfig map[string]interface{}
if !hasDeployment || deploymentVal == nil { // Apply pod-level overrides
deploymentConfig = make(map[string]interface{}) overridePS := overrides.Template.Spec
config[ConfigKeyDeployment] = deploymentConfig if len(overridePS.Containers) > 0 {
} else { podSpec.Containers[0].Resources = overridePS.Containers[0].Resources
var ok bool
deploymentConfig, ok = deploymentVal.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("profilingConfig.config.%s must be an object, got %T", ConfigKeyDeployment, deploymentVal)
} }
if len(overridePS.Tolerations) > 0 {
podSpec.Tolerations = overridePS.Tolerations
} }
if _, hasNamespace := deploymentConfig[ConfigKeyNamespace]; !hasNamespace { if len(overridePS.NodeSelector) > 0 {
deploymentConfig[ConfigKeyNamespace] = dgdr.Namespace podSpec.NodeSelector = overridePS.NodeSelector
} }
if len(overridePS.ImagePullSecrets) > 0 {
// Set deployment.model from spec.model // Merge override secrets with existing ones (deduplicate by name)
deploymentConfig[ConfigKeyModel] = dgdr.Spec.Model seen := make(map[string]bool)
for _, s := range podSpec.ImagePullSecrets {
// Set deployment.dgd_image from deploymentOverrides.workersImage if provided seen[s.Name] = true
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.WorkersImage != "" {
deploymentConfig[ConfigKeyDGDImage] = dgdr.Spec.DeploymentOverrides.WorkersImage
} }
for _, s := range overridePS.ImagePullSecrets {
// Set output_dir if not already set if !seen[s.Name] {
if _, hasOutputDir := config[ConfigKeyOutputDir]; !hasOutputDir { podSpec.ImagePullSecrets = append(podSpec.ImagePullSecrets, s)
config[ConfigKeyOutputDir] = ProfilingOutputPath seen[s.Name] = true
} }
// Set engine.backend from spec.backend
engineVal, hasEngine := config[ConfigKeyEngine]
var engineConfig map[string]interface{}
if !hasEngine || engineVal == nil {
engineConfig = make(map[string]interface{})
config[ConfigKeyEngine] = engineConfig
} else {
var ok bool
engineConfig, ok = engineVal.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("profilingConfig.config.%s must be an object, got %T", ConfigKeyEngine, engineVal)
} }
} }
engineConfig[ConfigKeyBackend] = dgdr.Spec.Backend if overridePS.ServiceAccountName != "" {
podSpec.ServiceAccountName = overridePS.ServiceAccountName
// If ConfigMapRef is provided, set engine.config path
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
engineConfig[ConfigKeyConfig] = fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile)
} }
// User-specified values take precedence over auto-discovered values // Apply job-level overrides
if gpuInfo != nil { if overrides.BackoffLimit != nil {
hardwareVal, hasHardware := config["hardware"] job.Spec.BackoffLimit = overrides.BackoffLimit
var hardwareConfig map[string]interface{}
if !hasHardware || hardwareVal == nil {
hardwareConfig = make(map[string]interface{})
config["hardware"] = hardwareConfig
} else {
var ok bool
hardwareConfig, ok = hardwareVal.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("profilingConfig.config.hardware must be an object, got %T", hardwareVal)
}
} }
}
if _, hasNumGpus := hardwareConfig[ConfigKeyNumGpusPerNode]; !hasNumGpus { // marshalDGDRSpec produces the JSON string passed to the profiler via --config.
hardwareConfig[ConfigKeyNumGpusPerNode] = gpuInfo.GPUsPerNode // The profiler receives the DGDR spec verbatim — no bespoke key mapping needed.
} func marshalDGDRSpec(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (string, error) {
if _, hasGpuModel := hardwareConfig[ConfigKeyGPUModel]; !hasGpuModel { specJSON, err := json.Marshal(dgdr.Spec)
hardwareConfig[ConfigKeyGPUModel] = gpuInfo.Model if err != nil {
} return "", fmt.Errorf("failed to marshal DGDR spec to JSON: %w", err)
if _, hasGpuVram := hardwareConfig[ConfigKeyGPUVramMib]; !hasGpuVram {
hardwareConfig[ConfigKeyGPUVramMib] = gpuInfo.VRAMPerGPU
}
if gpuInfo.System != "" {
if _, hasSystem := hardwareConfig[ConfigKeySystem]; !hasSystem {
hardwareConfig[ConfigKeySystem] = gpuInfo.System
} }
return string(specJSON), nil
}
// enrichHardwareFromDiscovery fills in hardware fields that the user didn't set.
// Called before marshalDGDRSpec(). Mutates dgdr.Spec.Hardware in-place (memory only, not persisted).
func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
if dgdr.Spec.Hardware == nil {
dgdr.Spec.Hardware = &nvidiacomv1beta1.HardwareSpec{}
} }
hw := dgdr.Spec.Hardware
if hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil {
return nil // all fields already set by user
} }
// Serialize config to YAML for passing to profiler gpuInfo, err := gpu.DiscoverGPUs(ctx, r.Client)
configYAML, err := sigsyaml.Marshal(config)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to marshal profiling config to YAML: %w", err) return err
} }
return configYAML, nil logger := log.FromContext(ctx)
} logger.Info("GPU discovery completed successfully",
"gpusPerNode", gpuInfo.GPUsPerNode,
"model", gpuInfo.Model,
"vramMiB", gpuInfo.VRAMPerGPU)
// extractModelCachePVCConfig extracts model cache PVC settings from the profiling config. if hw.GPUSKU == "" {
// Returns (pvcName, mountPath) - both empty if not configured. hw.GPUSKU = gpuInfo.Model
func extractModelCachePVCConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (string, string) {
if dgdr.Spec.ProfilingConfig.Config == nil {
return "", ""
} }
if hw.VRAMMB == nil {
var config map[string]interface{} vram := float64(gpuInfo.VRAMPerGPU)
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil { hw.VRAMMB = &vram
return "", ""
} }
if hw.NumGPUsPerNode == nil {
deployment, ok := config[ConfigKeyDeployment].(map[string]interface{}) n := int32(gpuInfo.GPUsPerNode)
if !ok { hw.NumGPUsPerNode = &n
return "", ""
}
modelCache, ok := deployment[ConfigKeyModelCache].(map[string]interface{})
if !ok {
return "", ""
} }
return nil
}
pvcName, _ := modelCache[ConfigKeyPVCName].(string) // extractModelCachePVCConfig reads model cache PVC settings from the typed v1beta1 spec.
if pvcName == "" { // Returns (pvcName, mountPath) — both empty if not configured.
func extractModelCachePVCConfig(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (string, string) {
if dgdr.Spec.ModelCache == nil || dgdr.Spec.ModelCache.PVCName == "" {
return "", "" return "", ""
} }
mountPath := dgdr.Spec.ModelCache.PVCMountPath
mountPath, _ := modelCache[ConfigKeyMountPath].(string)
if mountPath == "" { if mountPath == "" {
mountPath = DefaultModelCacheMountPath mountPath = DefaultModelCacheMountPath
} }
return dgdr.Spec.ModelCache.PVCName, mountPath
}
return pvcName, mountPath // configMapKeySelector mirrors v1alpha1.ConfigMapKeySelector for annotation deserialization.
// It is the target type that configMapRefFromAnnotation decodes the annotation's JSON value into.
type configMapKeySelector struct {
// Name is used as the ConfigMap name when the profiling job mounts the config volume.
Name string `json:"name"`
// Key selects the ConfigMap entry to mount; when empty, the profiling job
// falls back to ProfilingConfigDefaultKey. Omitted from JSON when empty.
Key string `json:"key,omitempty"`
}
// configMapRefFromAnnotation decodes the ConfigMap reference stored under the
// AnnotationConfigMapRef annotation of the given DGDR.
//
// It returns nil when:
//   - the annotation is absent or empty,
//   - its value is not valid JSON for a configMapKeySelector, or
//   - the decoded reference has no name.
//
// The last case covers values such as "null" and "{}": json.Unmarshal accepts
// them without error but leaves Name empty, and a reference without a name
// cannot identify a ConfigMap to mount.
func configMapRefFromAnnotation(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) *configMapKeySelector {
	// Indexing a nil map is safe in Go and yields "", so a missing annotation
	// map and a missing key are handled by the same check.
	raw := dgdr.Annotations[AnnotationConfigMapRef]
	if raw == "" {
		return nil
	}

	var ref configMapKeySelector
	if err := json.Unmarshal([]byte(raw), &ref); err != nil {
		// Best effort: a malformed annotation is treated like a missing one.
		return nil
	}
	if ref.Name == "" {
		return nil
	}
	return &ref
}
// outputPVCFromAnnotation reads the output PVC name from the round-trip annotation.
// Returns "" for native v1beta1 resources (always emptyDir).
//
// The result is the raw value stored under AnnotationOutputPVC. An empty string
// is returned both when the object carries no annotations at all and when that
// key is not set.
func outputPVCFromAnnotation(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) string {
if dgdr.Annotations == nil {
return ""
}
// A missing key yields the zero value "", which the profiling-job builder
// checks for before choosing a PVC-backed output volume.
return dgdr.Annotations[AnnotationOutputPVC]
} }
// checkProfilingJobStatus checks if the profiling job has completed // checkProfilingJobStatus checks if the profiling job has completed
func (r *DynamoGraphDeploymentRequestReconciler) checkProfilingJobStatus(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (bool, error) { func (r *DynamoGraphDeploymentRequestReconciler) checkProfilingJobStatus(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (bool, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
jobName := getProfilingJobName(dgdr) jobName := getProfilingJobName(dgdr)
...@@ -1520,7 +1322,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) checkProfilingJobStatus(ctx con ...@@ -1520,7 +1322,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) checkProfilingJobStatus(ctx con
} }
// getProfilingJobErrorDetails retrieves detailed error information from failed profiling job pods // getProfilingJobErrorDetails retrieves detailed error information from failed profiling job pods
func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, job *batchv1.Job) string { func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, job *batchv1.Job) string {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
// List pods owned by this job // List pods owned by this job
...@@ -1570,7 +1372,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx ...@@ -1570,7 +1372,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx
} }
// generateDGDSpec generates DGD spec from profiling results (online or offline/AIC) // generateDGDSpec generates DGD spec from profiling results (online or offline/AIC)
func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error { func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("Generating DGD spec from profiling results", "name", dgdr.Name, "backend", dgdr.Spec.Backend) logger.Info("Generating DGD spec from profiling results", "name", dgdr.Name, "backend", dgdr.Spec.Backend)
...@@ -1589,10 +1391,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con ...@@ -1589,10 +1391,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
return fmt.Errorf("failed to get output ConfigMap: %w", err) return fmt.Errorf("failed to get output ConfigMap: %w", err)
} }
// Select the right config file based on useMocker flag // Select the right config file based on mocker feature flag
// Profiler always generates both real and mocker configs // Profiler always generates both real and mocker configs
var outputFile string var outputFile string
if dgdr.Spec.UseMocker { if dgdr.Spec.Features != nil && dgdr.Spec.Features.Mocker != nil && dgdr.Spec.Features.Mocker.Enabled {
outputFile = ProfilingOutputFileMocker outputFile = ProfilingOutputFileMocker
logger.Info("Using mocker deployment config") logger.Info("Using mocker deployment config")
} else { } else {
...@@ -1627,18 +1429,46 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con ...@@ -1627,18 +1429,46 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
} }
} }
// Store the generated DGD in status // Store the generated DGD name in status and cache the spec in an annotation for createDGD
dgdr.Status.GeneratedDeployment = &runtime.RawExtension{ dgdr.Status.DGDName = dgd.Name
Object: dgd,
// Store the generated DGD in ProfilingResults.SelectedConfig for status visibility
dgdJSON, err := json.Marshal(dgd)
if err != nil {
return fmt.Errorf("failed to marshal generated DGD to JSON: %w", err)
}
if dgdr.Status.ProfilingResults == nil {
dgdr.Status.ProfilingResults = &nvidiacomv1beta1.ProfilingResultsStatus{}
}
dgdr.Status.ProfilingResults.SelectedConfig = &runtime.RawExtension{Raw: dgdJSON}
// Serialize the DGD spec to an annotation so createDGD can retrieve it
dgdBytes, err := sigsyaml.Marshal(dgd)
if err != nil {
return fmt.Errorf("failed to marshal generated DGD: %w", err)
}
if dgdr.Annotations == nil {
dgdr.Annotations = make(map[string]string)
}
dgdr.Annotations["nvidia.com/generated-dgd-spec"] = string(dgdBytes)
// Update the object (annotations are on the object, not status)
if err := r.Update(ctx, dgdr); err != nil {
return fmt.Errorf("failed to update DGDR with generated DGD annotation: %w", err)
}
// Refetch the DGDR after the annotation update to get the latest resourceVersion
// and avoid conflicts with concurrent modifications before updating status.
if err := r.Get(ctx, types.NamespacedName{Name: dgdr.Name, Namespace: dgdr.Namespace}, dgdr); err != nil {
return fmt.Errorf("failed to refetch DGDR after annotation update: %w", err)
} }
dgdr.Status.ProfilingResults = fmt.Sprintf("configmap/%s", outputConfigMapName)
return r.Status().Update(ctx, dgdr) return r.Status().Update(ctx, dgdr)
} }
// storeAdditionalResources marshals additional resources to YAML and stores them in DGDR annotations. // storeAdditionalResources marshals additional resources to YAML and stores them in DGDR annotations.
// Validates annotation size and fails gracefully if too large. // Validates annotation size and fails gracefully if too large.
func (r *DynamoGraphDeploymentRequestReconciler) storeAdditionalResources(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, resources []*unstructured.Unstructured) error { func (r *DynamoGraphDeploymentRequestReconciler) storeAdditionalResources(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, resources []*unstructured.Unstructured) error {
if len(resources) == 0 { if len(resources) == 0 {
return nil return nil
} }
...@@ -1673,10 +1503,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) storeAdditionalResources(ctx co ...@@ -1673,10 +1503,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) storeAdditionalResources(ctx co
// extractResourcesFromYAML parses multi-document YAML from profiling output, // extractResourcesFromYAML parses multi-document YAML from profiling output,
// extracting the DynamoGraphDeployment and any ConfigMaps that should be deployed with it. // extracting the DynamoGraphDeployment and any ConfigMaps that should be deployed with it.
func (r *DynamoGraphDeploymentRequestReconciler) extractResourcesFromYAML(yamlContent []byte) (*nvidiacomv1alpha1.DynamoGraphDeployment, []*unstructured.Unstructured, error) { func (r *DynamoGraphDeploymentRequestReconciler) extractResourcesFromYAML(yamlContent []byte) (*dgdv1alpha1.DynamoGraphDeployment, []*unstructured.Unstructured, error) {
decoder := yaml.NewYAMLOrJSONDecoder(bytes.NewReader(yamlContent), 4096) decoder := yaml.NewYAMLOrJSONDecoder(bytes.NewReader(yamlContent), 4096)
var dgd *nvidiacomv1alpha1.DynamoGraphDeployment var dgd *dgdv1alpha1.DynamoGraphDeployment
var additionalResources []*unstructured.Unstructured var additionalResources []*unstructured.Unstructured
for { for {
...@@ -1695,7 +1525,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) extractResourcesFromYAML(yamlCo ...@@ -1695,7 +1525,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) extractResourcesFromYAML(yamlCo
} }
if obj.GetKind() == "DynamoGraphDeployment" { if obj.GetKind() == "DynamoGraphDeployment" {
dgd = &nvidiacomv1alpha1.DynamoGraphDeployment{} dgd = &dgdv1alpha1.DynamoGraphDeployment{}
if err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, dgd); err != nil { if err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, dgd); err != nil {
return nil, nil, fmt.Errorf("failed to convert to DynamoGraphDeployment: %w", err) return nil, nil, fmt.Errorf("failed to convert to DynamoGraphDeployment: %w", err)
} }
...@@ -1713,31 +1543,90 @@ func (r *DynamoGraphDeploymentRequestReconciler) extractResourcesFromYAML(yamlCo ...@@ -1713,31 +1543,90 @@ func (r *DynamoGraphDeploymentRequestReconciler) extractResourcesFromYAML(yamlCo
} }
// extractDGDFromYAML is a convenience wrapper that extracts only the DGD (used by tests) // extractDGDFromYAML is a convenience wrapper that extracts only the DGD (used by tests)
func (r *DynamoGraphDeploymentRequestReconciler) extractDGDFromYAML(yamlContent []byte) (*nvidiacomv1alpha1.DynamoGraphDeployment, error) { func (r *DynamoGraphDeploymentRequestReconciler) extractDGDFromYAML(yamlContent []byte) (*dgdv1alpha1.DynamoGraphDeployment, error) {
dgd, _, err := r.extractResourcesFromYAML(yamlContent) dgd, _, err := r.extractResourcesFromYAML(yamlContent)
return dgd, err return dgd, err
} }
// updateStateAndRequeue updates the DGDR state and requeues // updateDeploymentInfo populates status.deploymentInfo from DGD service replica counts.
func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, state nvidiacomv1alpha1.DGDRState, _ string) (ctrl.Result, error) { func updateDeploymentInfo(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, dgd *dgdv1alpha1.DynamoGraphDeployment) bool {
dgdr.Status.State = state var totalReplicas, totalAvailable int32
for _, svc := range dgd.Status.Services {
totalReplicas += svc.Replicas
if svc.AvailableReplicas != nil {
totalAvailable += *svc.AvailableReplicas
}
}
// Short-circuit if nothing changed
if cur := dgdr.Status.DeploymentInfo; cur != nil &&
cur.Replicas != nil && *cur.Replicas == totalReplicas &&
cur.AvailableReplicas != nil && *cur.AvailableReplicas == totalAvailable {
return false
}
dgdr.Status.DeploymentInfo = &nvidiacomv1beta1.DeploymentInfoStatus{
Replicas: &totalReplicas,
AvailableReplicas: &totalAvailable,
}
return true
}
// setSucceededCondition records the aggregate Succeeded condition on the DGDR
// status for the given phase.
//
// The condition is True only for the Ready and Deployed phases. Every other
// phase, including the empty phase and any unrecognized value, yields False
// with a phase-specific reason and message. ObservedGeneration is taken from
// the DGDR's current metadata generation.
func setSucceededCondition(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, phase nvidiacomv1beta1.DGDRPhase) {
	// Start from the fallback used for unrecognized phases and override the
	// fields that differ for each known phase.
	cond := metav1.Condition{
		Type:               nvidiacomv1beta1.ConditionTypeSucceeded,
		Status:             metav1.ConditionFalse,
		ObservedGeneration: dgdr.Generation,
		Reason:             "Unknown",
		Message:            "Unknown phase",
	}

	switch phase {
	case "", nvidiacomv1beta1.DGDRPhasePending:
		cond.Reason = "Pending"
		cond.Message = "DGDR is pending"
	case nvidiacomv1beta1.DGDRPhaseProfiling:
		cond.Reason = "Profiling"
		cond.Message = "Profiling is in progress"
	case nvidiacomv1beta1.DGDRPhaseReady:
		cond.Status = metav1.ConditionTrue
		cond.Reason = "SpecGenerated"
		cond.Message = "Profiling complete, spec available"
	case nvidiacomv1beta1.DGDRPhaseDeploying:
		cond.Reason = "Deploying"
		cond.Message = "Deployment is in progress"
	case nvidiacomv1beta1.DGDRPhaseDeployed:
		cond.Status = metav1.ConditionTrue
		cond.Reason = "Deployed"
		cond.Message = "Deployment is healthy"
	case nvidiacomv1beta1.DGDRPhaseFailed:
		cond.Reason = "Failed"
		cond.Message = "DGDR has failed"
	}

	meta.SetStatusCondition(&dgdr.Status.Conditions, cond)
}
// updatePhaseAndRequeue updates the DGDR phase and requeues
func (r *DynamoGraphDeploymentRequestReconciler) updatePhaseAndRequeue(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, phase nvidiacomv1beta1.DGDRPhase, message string) (ctrl.Result, error) {
logger := log.FromContext(ctx)
logger.Info("Updating DGDR phase", "name", dgdr.Name, "phase", phase, "message", message)
dgdr.Status.Phase = phase
setSucceededCondition(dgdr, phase)
if err := r.Status().Update(ctx, dgdr); err != nil { if err := r.Status().Update(ctx, dgdr); err != nil {
return ctrl.Result{}, err return ctrl.Result{}, err
} }
return ctrl.Result{Requeue: true}, nil return ctrl.Result{Requeue: true}, nil
} }
// updateStateWithCondition updates state and adds/updates a condition // updatePhaseWithCondition updates phase and adds/updates a condition
func (r *DynamoGraphDeploymentRequestReconciler) updateStateWithCondition( func (r *DynamoGraphDeploymentRequestReconciler) updatePhaseWithCondition(
ctx context.Context, ctx context.Context,
dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest,
state nvidiacomv1alpha1.DGDRState, phase nvidiacomv1beta1.DGDRPhase,
conditionType string, conditionType string,
status metav1.ConditionStatus, status metav1.ConditionStatus,
reason string, reason string,
message string, message string,
) (ctrl.Result, error) { ) (ctrl.Result, error) {
dgdr.Status.State = state dgdr.Status.Phase = phase
setSucceededCondition(dgdr, phase)
condition := metav1.Condition{ condition := metav1.Condition{
Type: conditionType, Type: conditionType,
...@@ -1760,7 +1649,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) updateStateWithCondition( ...@@ -1760,7 +1649,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) updateStateWithCondition(
// SetupWithManager sets up the controller with the Manager // SetupWithManager sets up the controller with the Manager
func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error { func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr). return ctrl.NewControllerManagedBy(mgr).
For(&nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}). For(&nvidiacomv1beta1.DynamoGraphDeploymentRequest{}).
Named(consts.ResourceTypeDynamoGraphDeploymentRequest). Named(consts.ResourceTypeDynamoGraphDeploymentRequest).
Owns(&batchv1.Job{}, builder.WithPredicates(predicate.Funcs{ Owns(&batchv1.Job{}, builder.WithPredicates(predicate.Funcs{
// ignore creation cause we don't want to be called again after we create the job // ignore creation cause we don't want to be called again after we create the job
...@@ -1770,12 +1659,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag ...@@ -1770,12 +1659,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag
GenericFunc: func(ge event.GenericEvent) bool { return true }, GenericFunc: func(ge event.GenericEvent) bool { return true },
})). // Watch Jobs created by this controller (via ownerReference) })). // Watch Jobs created by this controller (via ownerReference)
Watches( Watches(
&nvidiacomv1alpha1.DynamoGraphDeployment{}, &dgdv1alpha1.DynamoGraphDeployment{},
handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []ctrl.Request { handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []ctrl.Request {
// Find DGDR by label instead of owner reference // Find DGDR by label instead of owner reference
dgd := obj.(*nvidiacomv1alpha1.DynamoGraphDeployment) dgd := obj.(*dgdv1alpha1.DynamoGraphDeployment)
dgdrName, hasName := dgd.Labels[LabelDGDRName] dgdrName, hasName := dgd.Labels[nvidiacomv1beta1.LabelDGDRName]
dgdrNamespace, hasNamespace := dgd.Labels[LabelDGDRNamespace] dgdrNamespace, hasNamespace := dgd.Labels[nvidiacomv1beta1.LabelDGDRNamespace]
if !hasName || !hasNamespace { if !hasName || !hasNamespace {
return nil return nil
} }
......
...@@ -19,23 +19,22 @@ package controller ...@@ -19,23 +19,22 @@ package controller
import ( import (
"context" "context"
"encoding/json"
"time" "time"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1" configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" dgdv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common" commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
. "github.com/onsi/ginkgo/v2" . "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
batchv1 "k8s.io/api/batch/v1" batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record" "k8s.io/client-go/tools/record"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/yaml"
) )
const ( const (
...@@ -54,23 +53,6 @@ func (m *MockRBACManager) EnsureServiceAccountWithRBAC(ctx context.Context, targ ...@@ -54,23 +53,6 @@ func (m *MockRBACManager) EnsureServiceAccountWithRBAC(ctx context.Context, targ
return nil return nil
} }
// createTestConfig serializes a profiling config map into the JSON wrapper
// used by the test specs.
//
// If the map has no "hardware" entry, a default one is inserted into the
// caller's map before marshaling so the resulting config satisfies validation.
// It panics if the map cannot be marshaled to JSON.
func createTestConfig(config map[string]interface{}) *apiextensionsv1.JSON {
	if _, present := config["hardware"]; !present {
		defaultHardware := map[string]interface{}{
			"numGpusPerNode": 8,
			"gpuModel":       "H100-SXM5-80GB",
			"gpuVramMib":     81920,
		}
		config["hardware"] = defaultHardware
	}

	raw, marshalErr := json.Marshal(config)
	if marshalErr != nil {
		panic(marshalErr)
	}

	wrapped := apiextensionsv1.JSON{Raw: raw}
	return &wrapped
}
var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
const ( const (
timeout = time.Second * 10 timeout = time.Second * 10
...@@ -106,27 +88,23 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -106,27 +88,23 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
dgdrName := "test-dgdr-initial" dgdrName := "test-dgdr-initial"
namespace := defaultNamespace namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"engine": map[string]interface{}{ GPUSKU: "H100-SXM5-80GB",
"config": "/tmp/test-config.yaml", VRAMMB: ptr.To(81920.0),
}, },
"sla": map[string]interface{}{ SLA: &nvidiacomv1beta1.SLASpec{
"ttft": 100.0, TTFT: ptr.To(100.0),
"itl": 1500.0, ITL: ptr.To(1500.0),
"isl": 3000,
"osl": 5,
},
}),
}, },
}, },
} }
...@@ -144,14 +122,14 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -144,14 +122,14 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Check status // Check status
Eventually(func() nvidiacomv1alpha1.DGDRState { Eventually(func() nvidiacomv1beta1.DGDRPhase {
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
return updated.Status.State return updated.Status.Phase
}, timeout, interval).Should(Equal(nvidiacomv1alpha1.DGDRStatePending)) }, timeout, interval).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
// Verify observedGeneration is set // Verify observedGeneration is set
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.ObservedGeneration).Should(Equal(updated.Generation)) Expect(updated.Status.ObservedGeneration).Should(Equal(updated.Generation))
}) })
...@@ -161,22 +139,23 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -161,22 +139,23 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
dgdrName := "test-dgdr-minimal" dgdrName := "test-dgdr-minimal"
namespace := defaultNamespace namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"sla": map[string]interface{}{ GPUSKU: "H100-SXM5-80GB",
"ttft": 100.0, VRAMMB: ptr.To(81920.0),
"itl": 1500.0,
}, },
}), SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
}, },
}, },
} }
...@@ -194,11 +173,11 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -194,11 +173,11 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Check status transitions to Pending (not Failed) // Check status transitions to Pending (not Failed)
Eventually(func() nvidiacomv1alpha1.DGDRState { Eventually(func() nvidiacomv1beta1.DGDRPhase {
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
return updated.Status.State return updated.Status.Phase
}, timeout, interval).Should(Equal(nvidiacomv1alpha1.DGDRStatePending)) }, timeout, interval).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
}) })
}) })
...@@ -231,31 +210,26 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -231,31 +210,26 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, sa) }() defer func() { _ = k8sClient.Delete(ctx, sa) }()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
Annotations: map[string]string{
"nvidia.com/dgdr-config-map-ref": `{"name":"test-config","key":"disagg.yaml"}`,
},
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"engine": map[string]interface{}{ GPUSKU: "H100-SXM5-80GB",
"profiler_image": "test-profiler:latest", VRAMMB: ptr.To(81920.0),
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
},
}),
ConfigMapRef: &nvidiacomv1alpha1.ConfigMapKeySelector{
Name: "test-config",
Key: "disagg.yaml",
}, },
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
}, },
}, },
} }
...@@ -287,8 +261,8 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -287,8 +261,8 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
jobName := getProfilingJobName(dgdr) jobName := getProfilingJobName(dgdr)
job := &batchv1.Job{} job := &batchv1.Job{}
_ = k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job) _ = k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)
Expect(job.Labels[LabelApp]).Should(Equal(LabelValueDynamoProfiler)) Expect(job.Labels[nvidiacomv1beta1.LabelApp]).Should(Equal(nvidiacomv1beta1.LabelValueDynamoProfiler))
Expect(job.Labels[LabelDGDR]).Should(Equal(dgdrName)) Expect(job.Labels[nvidiacomv1beta1.LabelDGDR]).Should(Equal(dgdrName))
// Verify job has profiler container // Verify job has profiler container
Expect(job.Spec.Template.Spec.Containers).Should(HaveLen(2)) Expect(job.Spec.Template.Spec.Containers).Should(HaveLen(2))
...@@ -324,34 +298,24 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -324,34 +298,24 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, sa) }() defer func() { _ = k8sClient.Delete(ctx, sa) }()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "trtllm", Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", SearchStrategy: "rapid",
Config: createTestConfig(map[string]interface{}{ Hardware: &nvidiacomv1beta1.HardwareSpec{
"engine": map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"config": "/tmp/test-config.yaml", GPUSKU: "H100-SXM5-80GB",
"profiler_image": "test-profiler:latest", VRAMMB: ptr.To(81920.0),
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
},
"sweep": map[string]interface{}{
"use_ai_configurator": true,
"aic_system": "h200_sxm",
"aic_hf_id": "Qwen/Qwen3-32B",
"aic_backend_version": "0.20.0",
}, },
}), SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
}, },
}, },
} }
...@@ -377,8 +341,8 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -377,8 +341,8 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job); err != nil { if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job); err != nil {
return "" return ""
} }
return job.Labels[LabelApp] return job.Labels[nvidiacomv1beta1.LabelApp]
}, timeout, interval).Should(Equal(LabelValueAICProfiler)) }, timeout, interval).Should(Equal(nvidiacomv1beta1.LabelValueDynamoProfiler))
// Clean up // Clean up
jobName := getProfilingJobName(dgdr) jobName := getProfilingJobName(dgdr)
...@@ -395,27 +359,23 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -395,27 +359,23 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
dgdrName := "test-dgdr-profiling-complete" dgdrName := "test-dgdr-profiling-complete"
namespace := defaultNamespace namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"engine": map[string]interface{}{ GPUSKU: "H100-SXM5-80GB",
"config": "/tmp/test-config.yaml", VRAMMB: ptr.To(81920.0),
}, },
"sla": map[string]interface{}{ SLA: &nvidiacomv1beta1.SLASpec{
"ttft": 100.0, TTFT: ptr.To(100.0),
"itl": 1500.0, ITL: ptr.To(1500.0),
"isl": 3000,
"osl": 5,
},
}),
}, },
}, },
} }
...@@ -424,7 +384,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -424,7 +384,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }() defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Profiling using Status subresource // Update status to Profiling using Status subresource
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create completed profiling job // Create completed profiling job
...@@ -492,14 +452,14 @@ spec: ...@@ -492,14 +452,14 @@ spec:
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Get the updated DGDR // Get the updated DGDR
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
// Check that DGD spec was generated // Check that DGD spec was generated (stored in annotation)
Expect(updated.Status.GeneratedDeployment).NotTo(BeNil()) Expect(updated.Annotations["nvidia.com/generated-dgd-spec"]).NotTo(BeEmpty())
// Verify state transitioned to Ready (since autoApply is false by default) // autoApply defaults to true in v1beta1, so after profiling the DGDR transitions to Deploying
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateReady)) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseDeploying))
}) })
}) })
...@@ -509,27 +469,23 @@ spec: ...@@ -509,27 +469,23 @@ spec:
dgdrName := "test-dgdr-autoapply" dgdrName := "test-dgdr-autoapply"
namespace := defaultNamespace namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"engine": map[string]interface{}{ GPUSKU: "H100-SXM5-80GB",
"config": "/tmp/test-config.yaml", VRAMMB: ptr.To(81920.0),
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
}, },
}), SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
}, },
AutoApply: true, AutoApply: true,
}, },
...@@ -539,7 +495,7 @@ spec: ...@@ -539,7 +495,7 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, dgdr) }() defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Profiling using Status subresource // Update status to Profiling using Status subresource
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create completed profiling job // Create completed profiling job
...@@ -607,9 +563,9 @@ spec: ...@@ -607,9 +563,9 @@ spec:
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Get updated DGDR and check state is Deploying // Get updated DGDR and check state is Deploying
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateDeploying)) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseDeploying))
// Reconcile again to create DGD // Reconcile again to create DGD
_, err = reconciler.Reconcile(ctx, reconcile.Request{ _, err = reconciler.Reconcile(ctx, reconcile.Request{
...@@ -618,14 +574,12 @@ spec: ...@@ -618,14 +574,12 @@ spec:
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Verify DGD was created // Verify DGD was created
dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{} dgd := &dgdv1alpha1.DynamoGraphDeployment{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed())
// Get final DGDR status // Get final DGDR status
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.Deployment).NotTo(BeNil()) Expect(updated.Status.DGDName).Should(Equal("test-dgd-auto"))
Expect(updated.Status.Deployment.Created).Should(BeTrue())
Expect(updated.Status.Deployment.Name).Should(Equal("test-dgd-auto"))
// Clean up DGD // Clean up DGD
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed())
...@@ -639,27 +593,23 @@ spec: ...@@ -639,27 +593,23 @@ spec:
dgdrName := "test-dgdr-immutable" dgdrName := "test-dgdr-immutable"
namespace := defaultNamespace namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"engine": map[string]interface{}{ GPUSKU: "H100-SXM5-80GB",
"config": "/tmp/test-config.yaml", VRAMMB: ptr.To(81920.0),
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
}, },
}), SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
}, },
}, },
} }
...@@ -674,22 +624,18 @@ spec: ...@@ -674,22 +624,18 @@ spec:
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Get current generation // Get current generation
var current nvidiacomv1alpha1.DynamoGraphDeploymentRequest var current nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed())
initialGeneration := current.Generation initialGeneration := current.Generation
observedGeneration := current.Status.ObservedGeneration observedGeneration := current.Status.ObservedGeneration
// Manually set state to Profiling to simulate in-progress profiling // Manually set state to Profiling to simulate in-progress profiling
current.Status.State = nvidiacomv1alpha1.DGDRStateProfiling current.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
Expect(k8sClient.Status().Update(ctx, &current)).Should(Succeed()) Expect(k8sClient.Status().Update(ctx, &current)).Should(Succeed())
// Try to modify spec // Try to modify spec
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed())
// Unmarshal config, modify it, and marshal back current.Spec.Model = "modified-model"
var config map[string]interface{}
Expect(yaml.Unmarshal(current.Spec.ProfilingConfig.Config.Raw, &config)).Should(Succeed())
config["sla"].(map[string]interface{})["ttft"] = 200.0
current.Spec.ProfilingConfig.Config = createTestConfig(config)
Expect(k8sClient.Update(ctx, &current)).Should(Succeed()) Expect(k8sClient.Update(ctx, &current)).Should(Succeed())
// Reconcile // Reconcile
...@@ -702,13 +648,13 @@ spec: ...@@ -702,13 +648,13 @@ spec:
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed())
Expect(current.Generation).Should(BeNumerically(">", initialGeneration)) Expect(current.Generation).Should(BeNumerically(">", initialGeneration))
Expect(current.Status.ObservedGeneration).Should(Equal(observedGeneration)) Expect(current.Status.ObservedGeneration).Should(Equal(observedGeneration))
Expect(current.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateProfiling)) // State unchanged Expect(current.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseProfiling)) // State unchanged
// Verify event was recorded // Verify event was recorded
Eventually(func() bool { Eventually(func() bool {
select { select {
case event := <-recorder.Events: case event := <-recorder.Events:
return event == "Warning SpecChangeRejected Cannot modify spec in state 'Profiling'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead." return event == "Warning SpecChangeRejected Cannot modify spec in phase 'Profiling'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead."
default: default:
return false return false
} }
...@@ -717,32 +663,28 @@ spec: ...@@ -717,32 +663,28 @@ spec:
}) })
Context("When handling DGD deletion", func() { Context("When handling DGD deletion", func() {
It("Should transition to DeploymentDeleted state", func() { It("Should transition to Failed phase when DGD is deleted", func() {
ctx := context.Background() ctx := context.Background()
dgdrName := "test-dgdr-dgd-deleted" dgdrName := "test-dgdr-dgd-deleted"
namespace := defaultNamespace namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"engine": map[string]interface{}{ GPUSKU: "H100-SXM5-80GB",
"config": "/tmp/test-config.yaml", VRAMMB: ptr.To(81920.0),
}, },
"sla": map[string]interface{}{ SLA: &nvidiacomv1beta1.SLASpec{
"ttft": 100.0, TTFT: ptr.To(100.0),
"itl": 1500.0, ITL: ptr.To(1500.0),
"isl": 3000,
"osl": 5,
},
}),
}, },
AutoApply: true, AutoApply: true,
}, },
...@@ -751,14 +693,9 @@ spec: ...@@ -751,14 +693,9 @@ spec:
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }() defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Ready with Deployment info using Status subresource // Update status to Deployed with Deployment info using Status subresource
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseDeployed
dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{ dgdr.Status.DGDName = "test-dgd-to-delete"
Name: "test-dgd-to-delete",
Namespace: namespace,
Created: true,
State: nvidiacomv1alpha1.DGDStateSuccessful,
}
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Reconcile when DGD doesn't exist (simulating deletion) // Reconcile when DGD doesn't exist (simulating deletion)
...@@ -767,10 +704,10 @@ spec: ...@@ -767,10 +704,10 @@ spec:
}) })
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Get updated DGDR and check state transitioned to DeploymentDeleted // Get updated DGDR and check phase transitioned to Failed
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateDeploymentDeleted)) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseFailed))
}) })
}) })
}) })
...@@ -778,7 +715,7 @@ spec: ...@@ -778,7 +715,7 @@ spec:
var _ = Describe("DGDR Helper Functions", func() { var _ = Describe("DGDR Helper Functions", func() {
Context("getProfilingJobName", func() { Context("getProfilingJobName", func() {
It("Should return correct job name", func() { It("Should return correct job name", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr", Name: "test-dgdr",
}, },
...@@ -789,7 +726,7 @@ var _ = Describe("DGDR Helper Functions", func() { ...@@ -789,7 +726,7 @@ var _ = Describe("DGDR Helper Functions", func() {
Context("getOutputConfigMapName", func() { Context("getOutputConfigMapName", func() {
It("Should return correct ConfigMap name", func() { It("Should return correct ConfigMap name", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr", Name: "test-dgdr",
}, },
...@@ -799,91 +736,42 @@ var _ = Describe("DGDR Helper Functions", func() { ...@@ -799,91 +736,42 @@ var _ = Describe("DGDR Helper Functions", func() {
}) })
Context("isOnlineProfiling", func() { Context("isOnlineProfiling", func() {
It("Should return true for online profiling (use_ai_configurator=false)", func() { It("Should always return true regardless of spec", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Model: "test-model",
Config: createTestConfig(map[string]interface{}{ Backend: "vllm",
"sweep": map[string]interface{}{
"use_ai_configurator": false,
},
}),
},
}, },
} }
Expect(isOnlineProfiling(dgdr)).Should(BeTrue()) Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
}) })
It("Should return false for AI Configurator profiling (use_ai_configurator=true)", func() { It("Should return true with search strategy rapid", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Model: "test-model",
Config: createTestConfig(map[string]interface{}{ Backend: "trtllm",
"sweep": map[string]interface{}{ SearchStrategy: "rapid",
"use_ai_configurator": true,
},
}),
},
},
}
Expect(isOnlineProfiling(dgdr)).Should(BeFalse())
})
It("Should return true by default when sweep section is missing", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
},
}),
},
}, },
} }
Expect(isOnlineProfiling(dgdr)).Should(BeTrue()) Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
}) })
It("Should return true by default when use_ai_configurator is not specified", func() { It("Should return true with search strategy thorough", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Model: "test-model",
Config: createTestConfig(map[string]interface{}{ Backend: "vllm",
"sweep": map[string]interface{}{ SearchStrategy: "thorough",
"prefill_interpolation_granularity": 16,
},
}),
},
}, },
} }
Expect(isOnlineProfiling(dgdr)).Should(BeTrue()) Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
}) })
It("Should return false for AI Configurator profiling (useAiConfigurator=true camelCase)", func() { It("Should return true with nil spec fields", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Model: "test-model",
Config: createTestConfig(map[string]interface{}{
"sweep": map[string]interface{}{
"useAiConfigurator": true,
},
}),
},
},
}
Expect(isOnlineProfiling(dgdr)).Should(BeFalse())
})
It("Should return true for online profiling (useAiConfigurator=false camelCase)", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"sweep": map[string]interface{}{
"useAiConfigurator": false,
},
}),
},
}, },
} }
Expect(isOnlineProfiling(dgdr)).Should(BeTrue()) Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
...@@ -903,23 +791,19 @@ var _ = Describe("DGDR Validation", func() { ...@@ -903,23 +791,19 @@ var _ = Describe("DGDR Validation", func() {
Context("validateSpec", func() { Context("validateSpec", func() {
It("Should pass validation for valid spec", func() { It("Should pass validation for valid spec", func() {
ctx := context.Background() ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"engine": map[string]interface{}{ GPUSKU: "H100-SXM5-80GB",
"config": "/tmp/test-config.yaml", VRAMMB: ptr.To(81920.0),
}, },
"sla": map[string]interface{}{ SLA: &nvidiacomv1beta1.SLASpec{
"ttft": 100.0, TTFT: ptr.To(100.0),
"itl": 1500.0, ITL: ptr.To(1500.0),
"isl": 3000,
"osl": 5,
},
}),
}, },
}, },
} }
...@@ -930,18 +814,19 @@ var _ = Describe("DGDR Validation", func() { ...@@ -930,18 +814,19 @@ var _ = Describe("DGDR Validation", func() {
It("Should pass validation with minimal config", func() { It("Should pass validation with minimal config", func() {
ctx := context.Background() ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"sla": map[string]interface{}{ GPUSKU: "H100-SXM5-80GB",
"ttft": 100.0, VRAMMB: ptr.To(81920.0),
"itl": 1500.0,
}, },
}), SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
}, },
}, },
} }
...@@ -971,7 +856,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -971,7 +856,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
}) })
Context("When creating profiling job with inline config", func() { Context("When creating profiling job with inline config", func() {
It("Should pass config as --profile-config argument for online profiling", func() { It("Should pass config as --config argument for online profiling", func() {
ctx := context.Background() ctx := context.Background()
namespace := "default" namespace := "default"
dgdrName := "test-args-online" dgdrName := "test-args-online"
...@@ -986,36 +871,23 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -986,36 +871,23 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, sa) }() defer func() { _ = k8sClient.Delete(ctx, sa) }()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "trtllm", Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ GPUSKU: "H200-SXM",
"engine": map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"config": "/tmp/test-config.yaml", VRAMMB: ptr.To(81920.0),
"profiler_image": "test-profiler:latest",
},
"sla": map[string]interface{}{
"ttft": 50.0,
"itl": 10.0,
"isl": 3000,
"osl": 500,
},
"hardware": map[string]interface{}{
"gpu_type": "h200_sxm",
"min_num_gpus_per_engine": 2,
"max_num_gpus_per_engine": 4,
}, },
"sweep": map[string]interface{}{ SLA: &nvidiacomv1beta1.SLASpec{
"use_ai_configurator": false, TTFT: ptr.To(50.0),
}, ITL: ptr.To(10.0),
}),
}, },
}, },
} }
...@@ -1024,7 +896,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1024,7 +896,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }() defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Re-fetch DGDR to get proper metadata from API server // Re-fetch DGDR to get proper metadata from API server
var fetchedDGDR nvidiacomv1alpha1.DynamoGraphDeploymentRequest var fetchedDGDR nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed())
// Create profiling job with properly initialized DGDR // Create profiling job with properly initialized DGDR
...@@ -1036,12 +908,12 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1036,12 +908,12 @@ var _ = Describe("DGDR Profiler Arguments", func() {
job := &batchv1.Job{} job := &batchv1.Job{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed())
// Verify profiler container has --profile-config argument // Verify profiler container has --config argument
profilerContainer := job.Spec.Template.Spec.Containers[0] profilerContainer := job.Spec.Template.Spec.Containers[0]
args := profilerContainer.Args args := profilerContainer.Args
// Check that --profile-config argument is present // Check that --config argument is present
Expect(args).Should(ContainElement("--profile-config")) Expect(args).Should(ContainElement("--config"))
// Clean up // Clean up
_ = k8sClient.Delete(ctx, job) _ = k8sClient.Delete(ctx, job)
...@@ -1062,39 +934,24 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1062,39 +934,24 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, sa) }() defer func() { _ = k8sClient.Delete(ctx, sa) }()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "trtllm", Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", SearchStrategy: "rapid",
Config: createTestConfig(map[string]interface{}{ Hardware: &nvidiacomv1beta1.HardwareSpec{
"engine": map[string]interface{}{ GPUSKU: "H200-SXM",
"config": "/tmp/test-config.yaml", NumGPUsPerNode: ptr.To[int32](8),
"profiler_image": "test-profiler:latest", VRAMMB: ptr.To(81920.0),
},
"sla": map[string]interface{}{
"ttft": 50.0,
"itl": 10.0,
"isl": 3000,
"osl": 500,
}, },
"hardware": map[string]interface{}{ SLA: &nvidiacomv1beta1.SLASpec{
"gpu_type": "h200_sxm", TTFT: ptr.To(50.0),
"min_num_gpus_per_engine": 1, ITL: ptr.To(10.0),
"max_num_gpus_per_engine": 8,
},
"sweep": map[string]interface{}{
"use_ai_configurator": true,
"aic_system": "h200_sxm",
"aic_hf_id": "Qwen/Qwen3-32B",
"aic_backend_version": "0.20.0",
},
}),
}, },
}, },
} }
...@@ -1103,7 +960,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1103,7 +960,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }() defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Re-fetch DGDR to get proper metadata from API server // Re-fetch DGDR to get proper metadata from API server
var fetchedDGDR nvidiacomv1alpha1.DynamoGraphDeploymentRequest var fetchedDGDR nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed())
// Create profiling job with properly initialized DGDR // Create profiling job with properly initialized DGDR
...@@ -1115,12 +972,12 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1115,12 +972,12 @@ var _ = Describe("DGDR Profiler Arguments", func() {
job := &batchv1.Job{} job := &batchv1.Job{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed())
// Verify profiler container has --profile-config argument // Verify profiler container has --config argument
profilerContainer := job.Spec.Template.Spec.Containers[0] profilerContainer := job.Spec.Template.Spec.Containers[0]
args := profilerContainer.Args args := profilerContainer.Args
// Check that --profile-config argument is present // Check that --config argument is present
Expect(args).Should(ContainElement("--profile-config")) Expect(args).Should(ContainElement("--config"))
// Clean up // Clean up
_ = k8sClient.Delete(ctx, job) _ = k8sClient.Delete(ctx, job)
...@@ -1141,24 +998,23 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1141,24 +998,23 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, sa) }() defer func() { _ = k8sClient.Delete(ctx, sa) }()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "trtllm", Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"sla": map[string]interface{}{ GPUSKU: "H100-SXM5-80GB",
"ttft": 50.0, VRAMMB: ptr.To(81920.0),
"itl": 10.0,
"isl": 3000,
"osl": 500,
}, },
}), SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(50.0),
ITL: ptr.To(10.0),
}, },
}, },
} }
...@@ -1167,7 +1023,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1167,7 +1023,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }() defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Re-fetch DGDR to get proper metadata from API server // Re-fetch DGDR to get proper metadata from API server
var fetchedDGDR nvidiacomv1alpha1.DynamoGraphDeploymentRequest var fetchedDGDR nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed())
// Create profiling job with properly initialized DGDR // Create profiling job with properly initialized DGDR
...@@ -1222,27 +1078,23 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1222,27 +1078,23 @@ var _ = Describe("DGDR Error Handling", func() {
namespace := defaultNamespace namespace := defaultNamespace
dgdrName := "test-error-capture" dgdrName := "test-error-capture"
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: createTestConfig(map[string]interface{}{ NumGPUsPerNode: ptr.To[int32](8),
"engine": map[string]interface{}{ GPUSKU: "H100-SXM5-80GB",
"config": "/tmp/test-config.yaml", VRAMMB: ptr.To(81920.0),
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
}, },
}), SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
}, },
}, },
} }
...@@ -1251,7 +1103,7 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1251,7 +1103,7 @@ var _ = Describe("DGDR Error Handling", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }() defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Set status to Profiling // Set status to Profiling
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create failed job // Create failed job
...@@ -1331,12 +1183,12 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1331,12 +1183,12 @@ var _ = Describe("DGDR Error Handling", func() {
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Verify DGDR transitioned to Failed state // Verify DGDR transitioned to Failed state
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateFailed)) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseFailed))
// Verify error condition contains detailed error // Verify error condition contains detailed error
condition := meta.FindStatusCondition(updated.Status.Conditions, ConditionTypeProfiling) condition := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeProfiling)
Expect(condition).NotTo(BeNil()) Expect(condition).NotTo(BeNil())
Expect(condition.Status).Should(Equal(metav1.ConditionFalse)) Expect(condition.Status).Should(Equal(metav1.ConditionFalse))
Expect(condition.Message).Should(ContainSubstring("profiling job failed")) Expect(condition.Message).Should(ContainSubstring("profiling job failed"))
...@@ -1535,22 +1387,18 @@ spec: ...@@ -1535,22 +1387,18 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, gpuNode) }() defer func() { _ = k8sClient.Delete(ctx, gpuNode) }()
// Create DGDR WITHOUT hardware config (should use GPU discovery) // Create DGDR WITHOUT hardware config (should use GPU discovery)
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", SLA: &nvidiacomv1beta1.SLASpec{
Config: &apiextensionsv1.JSON{ TTFT: ptr.To(100.0),
Raw: []byte(`{ ITL: ptr.To(1500.0),
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {"minNumGpusPerEngine": 1, "maxNumGpusPerEngine": 8}
}`),
},
}, },
}, },
} }
...@@ -1568,9 +1416,9 @@ spec: ...@@ -1568,9 +1416,9 @@ spec:
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (validation passed) // Should transition to Pending (validation passed)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending)) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
}) })
It("Should respect manual hardware config over GPU discovery", func() { It("Should respect manual hardware config over GPU discovery", func() {
...@@ -1593,27 +1441,23 @@ spec: ...@@ -1593,27 +1441,23 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, gpuNode) }() defer func() { _ = k8sClient.Delete(ctx, gpuNode) }()
// Create DGDR WITH manual hardware config (A100, not H100) // Create DGDR WITH manual hardware config (A100, not H100)
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: &apiextensionsv1.JSON{ NumGPUsPerNode: ptr.To[int32](4),
Raw: []byte(`{ GPUSKU: "A100-SXM4-40GB",
"sla": {"ttft": 100.0, "itl": 1500.0}, VRAMMB: ptr.To(40960.0),
"hardware": {
"numGpusPerNode": 4,
"gpuModel": "A100-SXM4-40GB",
"gpuVramMib": 40960,
"system": "a100_sxm"
}
}`),
}, },
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
}, },
}, },
} }
...@@ -1631,9 +1475,9 @@ spec: ...@@ -1631,9 +1475,9 @@ spec:
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (validation passed with manual config) // Should transition to Pending (validation passed with manual config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending)) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
}) })
It("Should succeed with GPU discovery when cluster has GPU nodes", func() { It("Should succeed with GPU discovery when cluster has GPU nodes", func() {
...@@ -1656,21 +1500,18 @@ spec: ...@@ -1656,21 +1500,18 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, node) }() defer func() { _ = k8sClient.Delete(ctx, node) }()
// Create DGDR WITHOUT hardware config - should use GPU discovery // Create DGDR WITHOUT hardware config - should use GPU discovery
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", SLA: &nvidiacomv1beta1.SLASpec{
Config: &apiextensionsv1.JSON{ TTFT: ptr.To(100.0),
Raw: []byte(`{ ITL: ptr.To(1500.0),
"sla": {"ttft": 100.0, "itl": 1500.0}
}`),
},
}, },
}, },
} }
...@@ -1688,9 +1529,9 @@ spec: ...@@ -1688,9 +1529,9 @@ spec:
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Should transition to Pending // Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending)) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
}) })
It("Should pass validation with explicit GPU ranges without GPU discovery", func() { It("Should pass validation with explicit GPU ranges without GPU discovery", func() {
...@@ -1700,28 +1541,21 @@ spec: ...@@ -1700,28 +1541,21 @@ spec:
// Intentionally don't create GPU nodes to test that explicit ranges work without GPU discovery // Intentionally don't create GPU nodes to test that explicit ranges work without GPU discovery
// Create DGDR with explicit minNumGpusPerEngine/maxNumGpusPerEngine // Create DGDR with explicit minNumGpusPerEngine/maxNumGpusPerEngine
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: &apiextensionsv1.JSON{ NumGPUsPerNode: ptr.To[int32](8),
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {
"minNumGpusPerEngine": 2,
"maxNumGpusPerEngine": 4
},
"hardware": {
"numGpusPerNode": 8
}
}`),
}, },
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
}, },
}, },
} }
...@@ -1739,9 +1573,9 @@ spec: ...@@ -1739,9 +1573,9 @@ spec:
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Should transition to Pending // Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending)) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
}) })
It("Should use GPU discovery with heterogeneous nodes (picks best)", func() { It("Should use GPU discovery with heterogeneous nodes (picks best)", func() {
...@@ -1778,22 +1612,18 @@ spec: ...@@ -1778,22 +1612,18 @@ spec:
}() }()
// Create DGDR without hardware config // Create DGDR without hardware config
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: dgdrName, Name: dgdrName,
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "test-profiler:latest",
ProfilerImage: "test-profiler:latest", SLA: &nvidiacomv1beta1.SLASpec{
Config: &apiextensionsv1.JSON{ TTFT: ptr.To(100.0),
Raw: []byte(`{ ITL: ptr.To(1500.0),
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {"minNumGpusPerEngine": 1, "maxNumGpusPerEngine": 8}
}`),
},
}, },
}, },
} }
...@@ -1811,9 +1641,393 @@ spec: ...@@ -1811,9 +1641,393 @@ spec:
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (using H100 config) // Should transition to Pending (using H100 config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending)) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
})
})
Context("v1beta1-specific behavior", func() {
// Verifies the Deploying -> Deployed transition: a DGDR whose status already
// names a DGD is reconciled after that DGD's status state is set to
// DGDStateSuccessful, and the DGDR phase is expected to become Deployed.
It("Should transition to Deployed when DGD reaches Ready", func() {
ctx := context.Background()
dgdrName := "test-dgdr-deployed-phase"
namespace := defaultNamespace
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
// Best-effort cleanup; the delete error is deliberately ignored.
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Put the DGDR directly into the Deploying phase via a status update, with
// DGDName pointing at the DGD created below (skips validation/profiling).
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseDeploying
dgdr.Status.DGDName = "test-dgd-deployed"
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create the DGD with an empty spec, labelled with this DGDR's name and
// namespace. Its state is written afterwards through the status client.
dgd := &dgdv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-deployed",
Namespace: namespace,
Labels: map[string]string{
nvidiacomv1beta1.LabelDGDRName: dgdrName,
nvidiacomv1beta1.LabelDGDRNamespace: namespace,
},
},
Spec: dgdv1alpha1.DynamoGraphDeploymentSpec{},
}
Expect(k8sClient.Create(ctx, dgd)).Should(Succeed())
// Best-effort cleanup of the DGD.
defer func() { _ = k8sClient.Delete(ctx, dgd) }()
// Set DGD to Successful state
dgd.Status.State = dgdv1alpha1.DGDStateSuccessful
Expect(k8sClient.Status().Update(ctx, dgd)).Should(Succeed())
// Reconcile — should transition DGDR to Deployed
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// Re-read the DGDR from the API server and check the persisted phase.
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseDeployed))
})
// Verifies that reconciling a freshly created DGDR records a Succeeded status
// condition whose Reason is the string form of the Pending phase.
// NOTE(review): despite the test name, only the first transition (to Pending)
// is exercised here; later phase transitions are not asserted.
It("Should set Succeeded condition at each phase transition", func() {
ctx := context.Background()
dgdrName := "test-dgdr-succeeded-cond"
namespace := defaultNamespace
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
// Best-effort cleanup; the delete error is deliberately ignored.
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// First reconcile: initial validation → Pending
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// Re-read the DGDR so the assertions run against the persisted status.
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
// Check that Succeeded condition exists with reason matching the phase
succeededCond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeSucceeded)
Expect(succeededCond).NotTo(BeNil())
Expect(succeededCond.Reason).Should(Equal(string(nvidiacomv1beta1.DGDRPhasePending)))
})
// Drives a DGDR through two reconciles into the Profiling phase, checks that
// ProfilingPhase is Initializing on entry, then marks the profiling Job
// complete, supplies the output ConfigMap, and reconciles once more.
// NOTE(review): the final assertion only checks that Phase has left Profiling;
// it does not assert that ProfilingPhase was cleared (see the note near the end).
It("Should set ProfilingPhase on entry to Profiling and clear on exit", func() {
ctx := context.Background()
dgdrName := "test-dgdr-profiling-phase"
namespace := defaultNamespace
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
// Best-effort cleanup; the delete error is deliberately ignored.
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Transition through initial validation to Pending
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// Reconcile again to start profiling (creates job, transitions to Profiling)
_, err = reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// Check ProfilingPhase is set
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseProfiling))
Expect(updated.Status.ProfilingPhase).Should(Equal(nvidiacomv1beta1.ProfilingPhaseInitializing))
// Simulate profiling completion
jobName := getProfilingJobName(&updated)
job := &batchv1.Job{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed())
// The Job was created by the reconcile above and nothing else in this test
// removes it; delete it best-effort on exit, as the sibling tests do, so it
// does not leak into the shared test environment.
defer func() { _ = k8sClient.Delete(ctx, job) }()
// Mark the Job complete by writing a JobComplete=True condition to its status.
job.Status.Conditions = []batchv1.JobCondition{{
Type: batchv1.JobComplete,
Status: corev1.ConditionTrue,
}}
Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
// DGD manifest stored in the output ConfigMap under ProfilingOutputFile.
dgdYAML := `apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: test-dgd-profphase
spec:
services:
Frontend:
replicas: 1`
cm := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: getOutputConfigMapName(&updated),
Namespace: namespace,
},
Data: map[string]string{
ProfilingOutputFile: dgdYAML,
},
}
Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
// Best-effort cleanup of the output ConfigMap.
defer func() { _ = k8sClient.Delete(ctx, cm) }()
// Reconcile to complete profiling
_, err = reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// ProfilingPhase should be cleared after profiling completes
// Note: Due to the r.Update/r.Status().Update ordering in generateDGDSpec,
// ProfilingPhase may not be cleared in a single reconcile. Verify the phase
// transition happened correctly.
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.Phase).ShouldNot(Equal(nvidiacomv1beta1.DGDRPhaseProfiling))
})
// Verifies that with spec.features.mocker.enabled set, a reconcile of a DGDR
// whose profiling Job has completed produces a generated-spec annotation that
// contains the DGD name stored under the ProfilingOutputFileMocker key.
// The output ConfigMap holds only that key, so no other output entry exists
// for the reconciler to read.
It("Should use spec.features.mocker.enabled to select mocker output", func() {
ctx := context.Background()
dgdrName := "test-dgdr-mocker"
namespace := defaultNamespace
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
Features: &nvidiacomv1beta1.FeaturesSpec{
Mocker: &nvidiacomv1beta1.MockerSpec{Enabled: true},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
// Best-effort cleanup; the delete error is deliberately ignored.
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Transition to Profiling
// ObservedGeneration is set to the current generation as well —
// presumably so the reconciler does not treat the spec as changed; TODO confirm.
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
dgdr.Status.ObservedGeneration = dgdr.Generation
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create completed job
// The Job is created by hand under the name getProfilingJobName returns for
// this DGDR, with a placeholder container, rather than via the reconciler.
jobName := getProfilingJobName(dgdr)
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{Name: jobName, Namespace: namespace},
Spec: batchv1.JobSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{{Name: "test", Image: "test"}},
RestartPolicy: corev1.RestartPolicyNever,
},
},
},
}
Expect(k8sClient.Create(ctx, job)).Should(Succeed())
// Best-effort cleanup of the Job.
defer func() { _ = k8sClient.Delete(ctx, job) }()
// Mark the Job complete by writing a JobComplete=True condition to its status.
job.Status.Conditions = []batchv1.JobCondition{{
Type: batchv1.JobComplete, Status: corev1.ConditionTrue,
}}
Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
// Create output ConfigMap with mocker output file
dgdYAML := `apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: test-dgd-mocker
spec:
services:
Frontend:
replicas: 1`
cm := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: getOutputConfigMapName(dgdr),
Namespace: namespace,
},
Data: map[string]string{
ProfilingOutputFileMocker: dgdYAML,
},
}
Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
// Best-effort cleanup of the output ConfigMap.
defer func() { _ = k8sClient.Delete(ctx, cm) }()
// Reconcile — should read from mocker output file
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// Verify the generated spec came from the mocker file
// (the annotation must contain the DGD name used in the manifest above).
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Annotations["nvidia.com/generated-dgd-spec"]).Should(ContainSubstring("test-dgd-mocker"))
})
// Verifies that after two reconciles of a new DGDR, status.ProfilingJobName
// equals the name getProfilingJobName computes for that DGDR.
It("Should populate profilingJobName in status", func() {
ctx := context.Background()
dgdrName := "test-dgdr-jobname"
namespace := defaultNamespace
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
// Best-effort cleanup; the delete error is deliberately ignored.
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile through initial validation
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// Reconcile to create profiling job
_, err = reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// Check profilingJobName is set in status
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.ProfilingJobName).Should(Equal(getProfilingJobName(&updated)))
// Clean up job
// Best-effort: both the lookup and the delete ignore their errors, so a
// missing Job does not fail the test.
job := &batchv1.Job{}
_ = k8sClient.Get(ctx, types.NamespacedName{Name: updated.Status.ProfilingJobName, Namespace: namespace}, job)
_ = k8sClient.Delete(ctx, job)
})
It("Should validate typed hardware fields without blob parsing", func() {
// Verifies that a request whose spec.hardware sets only GPUSKU gets through one
// reconcile without error and is reported in the Pending phase afterwards.
ctx := context.Background()
dgdrName := "test-dgdr-typed-hw"
namespace := defaultNamespace
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
Image: "test-profiler:latest",
// Partial hardware description: only the GPU SKU is given; VRAMMB and
// NumGPUsPerNode are left unset.
Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "A100-SXM4-40GB",
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
// Best-effort cleanup of the request when the spec finishes; the error is ignored.
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile — partial hardware (GPUSKU only) should pass validation
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// Re-read the object from the API server so the assertion sees the status the
// reconciler persisted rather than the local copy.
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
}) })
}) })
}) })
...@@ -27,6 +27,7 @@ import ( ...@@ -27,6 +27,7 @@ import (
"testing" "testing"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
. "github.com/onsi/ginkgo/v2" . "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
...@@ -102,6 +103,8 @@ var _ = BeforeSuite(func() { ...@@ -102,6 +103,8 @@ var _ = BeforeSuite(func() {
//+kubebuilder:scaffold:scheme //+kubebuilder:scaffold:scheme
err = v1alpha1.AddToScheme(scheme) err = v1alpha1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
err = v1beta1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred())
err = corev1.AddToScheme(scheme) err = corev1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
err = autoscalingv2.AddToScheme(scheme) err = autoscalingv2.AddToScheme(scheme)
......
...@@ -18,31 +18,18 @@ ...@@ -18,31 +18,18 @@
package validation package validation
import ( import (
"encoding/json"
"errors" "errors"
"fmt" "fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"k8s.io/apimachinery/pkg/util/yaml"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission" "sigs.k8s.io/controller-runtime/pkg/webhook/admission"
) )
// toFloat64 converts a value of any built-in integer or floating-point type to float64.
//
// The values passed in come from a config blob decoded into map[string]any. The
// apimachinery YAML decoder is documented to produce int64 (not int) for integral
// numbers in that case, so handling only int and float64 would silently turn an
// integer such as 8 into 0. All built-in numeric kinds are therefore accepted.
//
// Returns 0 if the value is not a built-in numeric type (including nil, strings,
// and named types whose underlying type is numeric).
func toFloat64(val any) float64 {
	switch v := val.(type) {
	case float64:
		return v
	case float32:
		return float64(v)
	case int:
		return float64(v)
	case int8:
		return float64(v)
	case int16:
		return float64(v)
	case int32:
		return float64(v)
	case int64:
		return float64(v)
	case uint:
		return float64(v)
	case uint8:
		return float64(v)
	case uint16:
		return float64(v)
	case uint32:
		return float64(v)
	case uint64:
		return float64(v)
	default:
		return 0
	}
}
// DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources. // DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
// This validator can be used by both webhooks and controllers for consistent validation. // This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentRequestValidator struct { type DynamoGraphDeploymentRequestValidator struct {
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest request *nvidiacomv1beta1.DynamoGraphDeploymentRequest
isClusterWideOperator bool isClusterWideOperator bool
gpuDiscoveryEnabled bool gpuDiscoveryEnabled bool
} }
...@@ -50,7 +37,7 @@ type DynamoGraphDeploymentRequestValidator struct { ...@@ -50,7 +37,7 @@ type DynamoGraphDeploymentRequestValidator struct {
// NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest. // NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest.
// isClusterWide indicates whether the operator has cluster-wide permissions. // isClusterWide indicates whether the operator has cluster-wide permissions.
// gpuDiscoveryEnabled indicates whether Helm provisioned node read access for the operator. // gpuDiscoveryEnabled indicates whether Helm provisioned node read access for the operator.
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator { func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1beta1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
return &DynamoGraphDeploymentRequestValidator{ return &DynamoGraphDeploymentRequestValidator{
request: request, request: request,
isClusterWideOperator: isClusterWide, isClusterWideOperator: isClusterWide,
...@@ -61,105 +48,43 @@ func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoG ...@@ -61,105 +48,43 @@ func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoG
// Validate performs stateless validation on the DynamoGraphDeploymentRequest. // Validate performs stateless validation on the DynamoGraphDeploymentRequest.
// Returns warnings and error. // Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) { func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) {
var warnings admission.Warnings
var err error var err error
// Warn about deprecated enableGpuDiscovery field // Validate image is specified (required for the profiling job container).
if v.request.Spec.EnableGPUDiscovery != nil { if v.request.Spec.Image == "" {
warnings = append(warnings, "spec.enableGpuDiscovery is deprecated and will be removed in v1beta1. GPU discovery is now always attempted automatically. This field has no effect.") err = errors.Join(err, errors.New("spec.image is required"))
}
// Validate profiler image is specified
if v.request.Spec.ProfilingConfig.ProfilerImage == "" {
err = errors.Join(err, errors.New("spec.profilingConfig.profilerImage is required"))
}
// Validate that profilingConfig.config is provided
if v.request.Spec.ProfilingConfig.Config == nil || len(v.request.Spec.ProfilingConfig.Config.Raw) == 0 {
err = errors.Join(err, errors.New("spec.profilingConfig.config is required and must not be empty"))
} }
// Note: GPU discovery is now automatic for cluster-wide operators // Disallow searchStrategy: thorough with backend: auto.
// Namespace-restricted operators automatically skip GPU discovery and require manual hardware config // "thorough" sweeps more configurations and requires a concrete backend to be selected;
// "auto" defers backend selection and is only compatible with the "rapid" search strategy.
// Parse config to validate structure (only if config is present) if v.request.Spec.SearchStrategy == nvidiacomv1beta1.SearchStrategyThorough &&
if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 { v.request.Spec.Backend == nvidiacomv1beta1.BackendTypeAuto {
var config map[string]any err = errors.Join(err, fmt.Errorf(
if parseErr := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); parseErr != nil { "spec.searchStrategy %q is incompatible with spec.backend %q: set spec.backend to a specific backend (sglang, trtllm, or vllm)",
err = errors.Join(err, fmt.Errorf("failed to parse spec.profilingConfig.config: %w", parseErr)) nvidiacomv1beta1.SearchStrategyThorough,
} else { nvidiacomv1beta1.BackendTypeAuto,
// Warn if deployment.model or engine.backend are specified in config (they will be overwritten by spec fields) ))
if engineConfig, ok := config["engine"].(map[string]any); ok {
if backend, ok := engineConfig["backend"].(string); ok && backend != "" && backend != v.request.Spec.Backend {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)", backend, v.request.Spec.Backend))
}
}
if deployment, ok := config["deployment"].(map[string]any); ok {
if model, ok := deployment["model"].(string); ok && model != "" && model != v.request.Spec.Model {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)", model, v.request.Spec.Model))
}
}
}
} }
// Validate GPU hardware information is available (last, so other errors are collected first) // Validate GPU hardware information is available (last, so other errors are collected first).
if gpuErr := v.validateGPUHardwareInfo(); gpuErr != nil { if gpuErr := v.validateGPUHardwareInfo(); gpuErr != nil {
err = errors.Join(err, gpuErr) err = errors.Join(err, gpuErr)
} }
return warnings, err return nil, err
} }
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling. // validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided. // Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided.
func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error { func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
// Parse profiling config // Check if manual hardware config is provided via typed spec.hardware fields.
var config map[string]any
if v.request.Spec.ProfilingConfig.Config != nil {
if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught by other validators
return nil
}
} else {
config = make(map[string]any)
}
// Check if manual hardware config is provided
hardwareVal, hasHardware := config["hardware"]
var hasManualHardwareConfig bool var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil { if hw := v.request.Spec.Hardware; hw != nil {
if hardwareConfig, ok := hardwareVal.(map[string]any); ok { hasManualHardwareConfig = hw.GPUSKU != "" || hw.VRAMMB != nil || hw.NumGPUsPerNode != nil
// Check if essential hardware fields are provided
_, hasGPUModel := hardwareConfig["gpuModel"]
_, hasGPUVram := hardwareConfig["gpuVramMib"]
_, hasNumGPUs := hardwareConfig["numGpusPerNode"]
hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
}
}
// Check if explicit GPU ranges are provided
var hasExplicitGPURanges bool
if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]any); ok {
minGPUs, hasMin := engineConfig["minNumGpusPerEngine"]
maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"]
// Validate explicit GPU ranges
if hasMin && hasMax {
minVal := toFloat64(minGPUs)
maxVal := toFloat64(maxGPUs)
// Validate that min <= max
if minVal > maxVal {
return fmt.Errorf("invalid GPU range: minNumGpusPerEngine (%v) cannot be greater than maxNumGpusPerEngine (%v)",
minVal, maxVal)
}
hasExplicitGPURanges = minVal > 0 && maxVal > 0
}
}
} }
if hasManualHardwareConfig || hasExplicitGPURanges { if hasManualHardwareConfig {
return nil return nil
} }
...@@ -169,13 +94,40 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error ...@@ -169,13 +94,40 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error
return nil return nil
} }
return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.profilingConfig.config)") return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.hardware)")
} }
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest. // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
// Returns warnings and error. // Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) { func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
// TODO: Add update validation logic for DynamoGraphDeploymentRequest // Reject spec changes when the resource is in a non-editable lifecycle phase.
// Placeholder for future immutability checks // During Profiling, Deploying, or Deployed the controller is actively reconciling
// the resource and spec mutations would conflict with in-flight operations.
phase := old.Status.Phase
immutablePhases := map[nvidiacomv1beta1.DGDRPhase]bool{
nvidiacomv1beta1.DGDRPhaseProfiling: true,
nvidiacomv1beta1.DGDRPhaseDeploying: true,
nvidiacomv1beta1.DGDRPhaseDeployed: true,
}
if immutablePhases[phase] {
// Compare specs — if they differ, reject the update.
oldSpec := old.Spec
newSpec := v.request.Spec
if !specEqual(oldSpec, newSpec) {
return nil, fmt.Errorf("spec updates are forbidden while the resource is in phase %q; delete and recreate the resource to change its spec", phase)
}
}
return nil, nil return nil, nil
} }
// specEqual reports whether two DynamoGraphDeploymentRequestSpec values encode to
// byte-identical JSON. If either value cannot be marshalled the specs are reported
// as not equal.
func specEqual(a, b nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec) bool {
	encodedA, errA := json.Marshal(a)
	if errA != nil {
		return false
	}
	encodedB, errB := json.Marshal(b)
	if errB != nil {
		return false
	}
	return string(encodedA) == string(encodedB)
}
...@@ -21,7 +21,7 @@ import ( ...@@ -21,7 +21,7 @@ import (
"context" "context"
"fmt" "fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability" "github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook" internalwebhook "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook"
...@@ -34,7 +34,7 @@ import ( ...@@ -34,7 +34,7 @@ import (
const ( const (
// DynamoGraphDeploymentRequestWebhookName is the name of the validating webhook handler for DynamoGraphDeploymentRequest. // DynamoGraphDeploymentRequestWebhookName is the name of the validating webhook handler for DynamoGraphDeploymentRequest.
DynamoGraphDeploymentRequestWebhookName = "dynamographdeploymentrequest-validating-webhook" DynamoGraphDeploymentRequestWebhookName = "dynamographdeploymentrequest-validating-webhook"
dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1alpha1-dynamographdeploymentrequest" dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1beta1-dynamographdeploymentrequest"
) )
// DynamoGraphDeploymentRequestHandler is a handler for validating DynamoGraphDeploymentRequest resources. // DynamoGraphDeploymentRequestHandler is a handler for validating DynamoGraphDeploymentRequest resources.
...@@ -137,15 +137,15 @@ func (h *DynamoGraphDeploymentRequestHandler) RegisterWithManager(mgr manager.Ma ...@@ -137,15 +137,15 @@ func (h *DynamoGraphDeploymentRequestHandler) RegisterWithManager(mgr manager.Ma
observedValidator := observability.NewObservedValidator(leaseAwareValidator, consts.ResourceTypeDynamoGraphDeploymentRequest) observedValidator := observability.NewObservedValidator(leaseAwareValidator, consts.ResourceTypeDynamoGraphDeploymentRequest)
webhook := admission. webhook := admission.
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}, observedValidator). WithCustomValidator(mgr.GetScheme(), &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}, observedValidator).
WithRecoverPanic(true) WithRecoverPanic(true)
mgr.GetWebhookServer().Register(dynamoGraphDeploymentRequestWebhookPath, webhook) mgr.GetWebhookServer().Register(dynamoGraphDeploymentRequestWebhookPath, webhook)
return nil return nil
} }
// castToDynamoGraphDeploymentRequest attempts to cast a runtime.Object to a DynamoGraphDeploymentRequest. // castToDynamoGraphDeploymentRequest attempts to cast a runtime.Object to a DynamoGraphDeploymentRequest.
func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1alpha1.DynamoGraphDeploymentRequest, error) { func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1beta1.DynamoGraphDeploymentRequest, error) {
request, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeploymentRequest) request, ok := obj.(*nvidiacomv1beta1.DynamoGraphDeploymentRequest)
if !ok { if !ok {
return nil, fmt.Errorf("expected DynamoGraphDeploymentRequest but got %T", obj) return nil, fmt.Errorf("expected DynamoGraphDeploymentRequest but got %T", obj)
} }
......
...@@ -21,125 +21,125 @@ import ( ...@@ -21,125 +21,125 @@ import (
"strings" "strings"
"testing" "testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
) )
func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}` vram := float64(81920)
validConfigWithHardware := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}, "hardware": {"numGpusPerNode": 8, "gpuModel": "H100-SXM5-80GB", "gpuVramMib": 81920}}` gpuCount := int32(8)
minimalConfig := `{"sla": {"ttft": 200.0}}`
configWithDifferentBackend := `{"engine": {"backend": "sglang"}}`
configWithDifferentModel := `{"deployment": {"model": "different-model"}}`
invalidYAML := `{invalid yaml`
// errMsg: if non-empty, an error is expected and each newline-separated substring must appear in it. // errMsg: if non-empty, an error is expected and each newline-separated substring must appear in it.
// expectedWarning: if non-empty, at least one warning must contain this substring.
tests := []struct { tests := []struct {
name string name string
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest request *nvidiacomv1beta1.DynamoGraphDeploymentRequest
isClusterWide bool isClusterWide bool
gpuDiscoveryEnabled bool gpuDiscoveryEnabled bool
errMsg string errMsg string
expectedWarning string
}{ }{
{ {
name: "valid request", name: "valid request",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Name: "test-dgdr", Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Backend: nvidiacomv1beta1.BackendTypeVllm,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "profiler:latest",
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
}, },
}, },
isClusterWide: true,
},
{
name: "missing image",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "",
}, },
}, },
isClusterWide: true, isClusterWide: true,
errMsg: "spec.image is required",
}, },
{ {
name: "missing profiler image", name: "thorough + auto is invalid",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Name: "test-dgdr", Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Image: "profiler:latest",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Backend: nvidiacomv1beta1.BackendTypeAuto,
ProfilerImage: "", SearchStrategy: nvidiacomv1beta1.SearchStrategyThorough,
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
}, },
}, },
isClusterWide: true,
errMsg: `spec.searchStrategy "thorough" is incompatible with spec.backend "auto"`,
},
{
name: "rapid + auto is valid (default combination)",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Image: "profiler:latest",
Backend: nvidiacomv1beta1.BackendTypeAuto,
SearchStrategy: nvidiacomv1beta1.SearchStrategyRapid,
}, },
}, },
isClusterWide: true, isClusterWide: true,
errMsg: "spec.profilingConfig.profilerImage is required",
}, },
{ {
name: "missing profiling config", name: "thorough + vllm is valid",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Name: "test-dgdr", Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Image: "profiler:latest",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Backend: nvidiacomv1beta1.BackendTypeVllm,
ProfilerImage: "profiler:latest", SearchStrategy: nvidiacomv1beta1.SearchStrategyThorough,
Config: nil,
},
}, },
}, },
isClusterWide: true, isClusterWide: true,
errMsg: "spec.profilingConfig.config is required and must not be empty",
}, },
{ {
name: "empty profiling config", name: "thorough + trtllm is valid",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Name: "test-dgdr", Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Image: "profiler:latest",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Backend: nvidiacomv1beta1.BackendTypeTrtllm,
ProfilerImage: "profiler:latest", SearchStrategy: nvidiacomv1beta1.SearchStrategyThorough,
Config: &apiextensionsv1.JSON{
Raw: []byte{},
}, },
}, },
isClusterWide: true,
},
{
name: "thorough + sglang is valid",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Image: "profiler:latest",
Backend: nvidiacomv1beta1.BackendTypeSglang,
SearchStrategy: nvidiacomv1beta1.SearchStrategyThorough,
}, },
}, },
isClusterWide: true, isClusterWide: true,
errMsg: "spec.profilingConfig.config is required and must not be empty",
}, },
{ {
name: "namespace-scoped operator with manual hardware config (should pass)", name: "namespace-scoped operator with manual hardware config (should pass)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Name: "test-dgdr", Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Backend: nvidiacomv1beta1.BackendTypeVllm,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "profiler:latest",
ProfilerImage: "profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{
Config: &apiextensionsv1.JSON{ GPUSKU: "H100-SXM5-80GB",
Raw: []byte(validConfigWithHardware), VRAMMB: &vram,
}, NumGPUsPerNode: &gpuCount,
}, },
}, },
}, },
...@@ -148,20 +148,12 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -148,20 +148,12 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
{ {
name: "namespace-scoped operator with GPU discovery enabled (should pass without manual config)", name: "namespace-scoped operator with GPU discovery enabled (should pass without manual config)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Name: "test-dgdr", Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Backend: nvidiacomv1beta1.BackendTypeVllm,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "profiler:latest",
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(minimalConfig),
},
},
}, },
}, },
isClusterWide: false, isClusterWide: false,
...@@ -169,20 +161,12 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -169,20 +161,12 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
{ {
name: "namespace-scoped operator with GPU discovery disabled and no hardware config (should error)", name: "namespace-scoped operator with GPU discovery disabled and no hardware config (should error)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Name: "test-dgdr", Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Backend: nvidiacomv1beta1.BackendTypeVllm,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "profiler:latest",
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(minimalConfig),
},
},
}, },
}, },
isClusterWide: false, isClusterWide: false,
...@@ -190,93 +174,25 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -190,93 +174,25 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
errMsg: "GPU hardware configuration required: GPU discovery is disabled", errMsg: "GPU hardware configuration required: GPU discovery is disabled",
}, },
{ {
name: "invalid config YAML", name: "multiple errors (missing image and thorough+auto)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Name: "test-dgdr", Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Backend: nvidiacomv1beta1.BackendTypeAuto,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ SearchStrategy: nvidiacomv1beta1.SearchStrategyThorough,
ProfilerImage: "profiler:latest", Image: "",
Config: &apiextensionsv1.JSON{
Raw: []byte(invalidYAML),
},
},
}, },
}, },
isClusterWide: true, isClusterWide: true,
errMsg: "failed to parse spec.profilingConfig.config", errMsg: "spec.image is required\nspec.searchStrategy",
},
{
name: "warning for different backend in config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(configWithDifferentBackend),
},
},
},
},
isClusterWide: true,
expectedWarning: "spec.profilingConfig.config.engine.backend (sglang) will be overwritten by spec.backend (vllm)",
},
{
name: "warning for different model in config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(configWithDifferentModel),
},
},
},
},
isClusterWide: true,
expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)",
},
{
name: "multiple errors (missing profiler image and missing config)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "",
Config: nil,
},
},
},
isClusterWide: false,
errMsg: "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty",
}, },
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentRequestValidator(tt.request, tt.isClusterWide, tt.gpuDiscoveryEnabled) validator := NewDynamoGraphDeploymentRequestValidator(tt.request, tt.isClusterWide, tt.gpuDiscoveryEnabled)
warnings, err := validator.Validate() _, err := validator.Validate()
wantErr := tt.errMsg != "" wantErr := tt.errMsg != ""
if (err != nil) != wantErr { if (err != nil) != wantErr {
...@@ -290,80 +206,159 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -290,80 +206,159 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
} }
} }
} }
wantWarning := tt.expectedWarning != ""
if wantWarning && len(warnings) == 0 {
t.Errorf("Validate() expected warning %q but got none", tt.expectedWarning)
}
if wantWarning && len(warnings) > 0 && !strings.Contains(warnings[0], tt.expectedWarning) {
t.Errorf("Validate() warning %q does not contain %q", warnings[0], tt.expectedWarning)
}
}) })
} }
} }
func TestDynamoGraphDeploymentRequestValidator_ValidateUpdate(t *testing.T) { func TestDynamoGraphDeploymentRequestValidator_ValidateUpdate(t *testing.T) {
validConfig := `{"engine": {"backend": "vllm"}}`
tests := []struct { tests := []struct {
name string name string
oldRequest *nvidiacomv1alpha1.DynamoGraphDeploymentRequest oldRequest *nvidiacomv1beta1.DynamoGraphDeploymentRequest
newRequest *nvidiacomv1alpha1.DynamoGraphDeploymentRequest newRequest *nvidiacomv1beta1.DynamoGraphDeploymentRequest
wantErr bool wantErr bool
errMsg string
wantWarnings bool wantWarnings bool
}{ }{
{ {
name: "no changes", name: "no changes",
oldRequest: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Backend: nvidiacomv1beta1.BackendTypeVllm,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "profiler:latest",
ProfilerImage: "profiler:latest", },
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
}, },
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
}, },
}, },
wantErr: false,
}, },
newRequest: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ {
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ name: "changing model name is allowed when not in immutable phase",
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Backend: nvidiacomv1beta1.BackendTypeVllm,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "profiler:latest",
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
}, },
}, },
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-70b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
}, },
}, },
wantErr: false, wantErr: false,
}, },
{ {
name: "changing model name is allowed", name: "spec change rejected during Profiling phase",
oldRequest: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b", Model: "llama-3-8b",
Backend: "vllm", Backend: nvidiacomv1beta1.BackendTypeVllm,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "profiler:latest",
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
}, },
Status: nvidiacomv1beta1.DynamoGraphDeploymentRequestStatus{
Phase: nvidiacomv1beta1.DGDRPhaseProfiling,
}, },
}, },
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-70b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
wantErr: true,
errMsg: "spec updates are forbidden while the resource is in phase",
},
{
name: "spec change rejected during Deploying phase",
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
Status: nvidiacomv1beta1.DynamoGraphDeploymentRequestStatus{
Phase: nvidiacomv1beta1.DGDRPhaseDeploying,
}, },
newRequest: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-70b", Model: "llama-3-70b",
Backend: "vllm", Backend: nvidiacomv1beta1.BackendTypeVllm,
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ Image: "profiler:latest",
ProfilerImage: "profiler:latest", },
Config: &apiextensionsv1.JSON{ },
Raw: []byte(validConfig), wantErr: true,
errMsg: "spec updates are forbidden while the resource is in phase",
}, },
{
name: "spec change rejected during Deployed phase",
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
}, },
Status: nvidiacomv1beta1.DynamoGraphDeploymentRequestStatus{
Phase: nvidiacomv1beta1.DGDRPhaseDeployed,
},
},
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-70b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
wantErr: true,
errMsg: "spec updates are forbidden while the resource is in phase",
},
{
name: "no spec change during immutable phase is allowed",
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
Status: nvidiacomv1beta1.DynamoGraphDeploymentRequestStatus{
Phase: nvidiacomv1beta1.DGDRPhaseProfiling,
},
},
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
wantErr: false,
},
{
name: "spec change allowed during Failed phase",
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
Status: nvidiacomv1beta1.DynamoGraphDeploymentRequestStatus{
Phase: nvidiacomv1beta1.DGDRPhaseFailed,
},
},
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-70b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
}, },
}, },
wantErr: false, wantErr: false,
...@@ -376,12 +371,21 @@ func TestDynamoGraphDeploymentRequestValidator_ValidateUpdate(t *testing.T) { ...@@ -376,12 +371,21 @@ func TestDynamoGraphDeploymentRequestValidator_ValidateUpdate(t *testing.T) {
warnings, err := validator.ValidateUpdate(tt.oldRequest) warnings, err := validator.ValidateUpdate(tt.oldRequest)
if (err != nil) != tt.wantErr { if (err != nil) != tt.wantErr {
t.Errorf("DynamoGraphDeploymentRequestValidator.ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr) t.Errorf("ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr)
return return
} }
if tt.wantErr && tt.errMsg != "" {
if !strings.Contains(err.Error(), tt.errMsg) {
t.Errorf("ValidateUpdate() error %q does not contain %q", err.Error(), tt.errMsg)
}
}
if tt.wantWarnings && len(warnings) == 0 { if tt.wantWarnings && len(warnings) == 0 {
t.Errorf("DynamoGraphDeploymentRequestValidator.ValidateUpdate() expected warnings but got none") t.Errorf("ValidateUpdate() expected warnings but got none")
}
if !tt.wantWarnings && len(warnings) > 0 {
t.Errorf("ValidateUpdate() unexpected warnings: %v", warnings)
} }
}) })
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment