Unverified commit d100c6e2, authored by Jonathan Tong, committed by GitHub
Browse files

feat: use v1beta1 DGDR in controller (#6498)


Signed-off-by: Jont828 <jt572@cornell.edu>
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Hannah Zhang <hannahz@nvidia.com>
parent 5a319aed
......@@ -461,7 +461,7 @@ spec:
type: object
type: object
served: true
storage: true
storage: false
subresources:
status: {}
- additionalPrinterColumns:
......@@ -9247,6 +9247,6 @@ spec:
type: object
type: object
served: true
storage: false
storage: true
subresources:
status: {}
......@@ -145,7 +145,7 @@ webhooks:
service:
name: {{ include "dynamo-operator.fullname" . }}-webhook-service
namespace: {{ .Release.Namespace }}
path: /validate-nvidia-com-v1alpha1-dynamographdeploymentrequest
path: /validate-nvidia-com-v1beta1-dynamographdeploymentrequest
failurePolicy: {{ .Values.webhook.failurePolicy }}
name: vdynamographdeploymentrequest.kb.io
{{- if .Values.webhook.namespaceSelector }}
......@@ -161,6 +161,7 @@ webhooks:
- nvidia.com
apiVersions:
- v1alpha1
- v1beta1
operations:
- CREATE
- UPDATE
......
......@@ -279,7 +279,6 @@ type DynamoGraphDeploymentRequestStatus struct {
//
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:storageversion
// +kubebuilder:resource:shortName=dgdr
// +kubebuilder:deprecatedversion:warning="nvidia.com/v1alpha1 DynamoGraphDeploymentRequest is deprecated; use nvidia.com/v1beta1 DynamoGraphDeploymentRequest"
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`
......
......@@ -496,6 +496,7 @@ type DynamoGraphDeploymentRequestStatus struct {
//
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:storageversion
// +kubebuilder:resource:shortName=dgdr
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`
// +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend`
......
......@@ -461,7 +461,7 @@ spec:
type: object
type: object
served: true
storage: true
storage: false
subresources:
status: {}
- additionalPrinterColumns:
......@@ -9247,6 +9247,6 @@ spec:
type: object
type: object
served: true
storage: false
storage: true
subresources:
status: {}
......@@ -20,6 +20,8 @@ package controller
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"text/template"
......@@ -45,7 +47,8 @@ import (
sigsyaml "sigs.k8s.io/yaml"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
dgdv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
......@@ -53,37 +56,6 @@ import (
)
const (
// Condition types
ConditionTypeValidation = "Validation"
ConditionTypeProfiling = "Profiling"
ConditionTypeSpecGenerated = "SpecGenerated"
ConditionTypeDeploymentReady = "DeploymentReady"
// Event reasons
EventReasonInitialized = "Initialized"
EventReasonValidationFailed = "ValidationFailed"
EventReasonProfilingJobCreated = "ProfilingJobCreated"
EventReasonProfilingJobFailed = "ProfilingJobFailed"
EventReasonAIConfiguratorFailed = "AIConfiguratorFailed"
EventReasonSpecGenerated = "SpecGenerated"
EventReasonSpecChangeRejected = "SpecChangeRejected"
EventReasonDeploymentCreated = "DeploymentCreated"
EventReasonDeploymentReady = "DeploymentReady"
EventReasonDeploymentDegraded = "DeploymentDegraded"
EventReasonDeploymentDeleted = "DeploymentDeleted"
// Label keys
LabelApp = "app"
LabelDGDR = "dgdr"
LabelDGDRName = "dgdr.nvidia.com/name"
LabelDGDRNamespace = "dgdr.nvidia.com/namespace"
LabelManagedBy = "nvidia.com/managed-by"
// Label values
LabelValueDynamoProfiler = "dynamo-profiler"
LabelValueAICProfiler = "aic-profiler"
LabelValueDynamoOperator = "dynamo-operator"
// Job naming
JobNamePrefixOnline = "profile-online-"
JobNamePrefixAIC = "profile-aic-"
......@@ -101,6 +73,12 @@ const (
// Annotation keys
AnnotationAdditionalResources = "dgdr.nvidia.com/additional-resources"
// Annotation keys for v1alpha1 round-trip compatibility.
// The conversion layer stores v1alpha1 fields that have no v1beta1 spec equivalent
// as annotations so the controller can still honour them for converted resources.
AnnotationConfigMapRef = "nvidia.com/dgdr-config-map-ref"
AnnotationOutputPVC = "nvidia.com/dgdr-output-pvc"
// Size limits
MaxAnnotationSize = 250000 // ~250KB, below K8s 256KB limit
......@@ -108,16 +86,16 @@ const (
SidecarImage = "bitnami/kubectl:latest"
// Volume names
VolumeNameProfilingConfig = "profiling-config"
VolumeNameProfilingOutput = "profiling-output"
VolumeNameProfilingConfig = "profiling-config"
VolumeNameModelCache = "model-cache"
// Volume paths
ProfilingOutputPath = "/data"
ProfilingOutputFile = "config_with_planner.yaml"
ProfilingOutputFileMocker = "mocker_config_with_planner.yaml"
ProfilingConfigPath = "/config"
ProfilingConfigFile = "disagg.yaml"
ProfilingConfigMountPath = "/config"
ProfilingConfigDefaultKey = "disagg.yaml"
DefaultModelCacheMountPath = "/opt/model-cache"
// Command line arguments
......@@ -139,7 +117,7 @@ const (
MessageDeploymentDegraded = "DynamoGraphDeployment %s degraded from Ready to %s"
MessageDeploymentDeleted = "DGD %s was deleted. DGDR will not recreate it. Delete this DGDR and create a new one to redeploy."
MessageInvalidState = "Invalid state"
MessageSpecChangeRejected = "Cannot modify spec in state '%s'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead."
MessageSpecChangeRejected = "Cannot modify spec in phase '%s'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead."
MessageJobCreationFailed = "JobCreationFailed"
MessageDeploymentCreationFailed = "DeploymentCreationFailed"
MessageResultsRetrievalFailed = "ResultsRetrievalFailed"
......@@ -149,38 +127,6 @@ const (
MessageConfigMapNotFound = "ConfigMap %s not found in namespace %s"
MessageConfigMapKeyNotFound = "key %s not found in ConfigMap %s"
MessageModelCachePVCNotFound = "model cache PVC %s not found in namespace %s"
// Validation messages
ValidationErrorModelRequired = "model is required"
ValidationErrorITLPositive = "sla.itl must be positive"
ValidationErrorTTFTPositive = "sla.ttft must be positive"
ValidationErrorInvalidBackend = "invalid backend: %s (must be vllm, sglang, or trtllm)"
// Valid backend values
BackendVLLM = "vllm"
BackendSGLang = "sglang"
BackendTRTLLM = "trtllm"
// Profiling config field names for v1alpha1; note: will be removed in v1beta1
ConfigKeyDeployment = "deployment"
ConfigKeyModelCache = "modelCache"
ConfigKeyPVCName = "pvcName"
ConfigKeyPVCPath = "pvcPath"
ConfigKeyMountPath = "mountPath"
ConfigKeyHardware = "hardware"
ConfigKeyEngine = "engine"
ConfigKeyOutputDir = "output_dir"
ConfigKeyNumGpusPerNode = "numGpusPerNode"
ConfigKeyGPUModel = "gpuModel"
ConfigKeyGPUVramMib = "gpuVramMib"
ConfigKeySystem = "system"
ConfigKeyMinNumGpusPerEng = "minNumGpusPerEngine"
ConfigKeyMaxNumGpusPerEng = "maxNumGpusPerEngine"
ConfigKeyBackend = "backend"
ConfigKeyConfig = "config"
ConfigKeyNamespace = "namespace"
ConfigKeyModel = "model"
ConfigKeyDGDImage = "dgd_image"
)
// shell script template for the output copier sidecar
......@@ -318,7 +264,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecor
}
// FinalizeResource implements commonController.Finalizer interface
func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx)
logger.Info("DGDR finalized successfully", "name", dgdr.Name)
......@@ -342,7 +288,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context,
logger.Info("Reconciling DynamoGraphDeploymentRequest", "name", req.Name, "namespace", req.Namespace)
// Fetch the DGDR instance
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}
if err := r.Get(ctx, req.NamespacedName, dgdr); err != nil {
if apierrors.IsNotFound(err) {
logger.Info("DGDR resource not found, ignoring since object must be deleted")
......@@ -365,99 +311,98 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context,
// Check for spec changes (immutability enforcement)
if dgdr.Status.ObservedGeneration > 0 && dgdr.Status.ObservedGeneration != dgdr.Generation {
// Spec changed after initial processing
if dgdr.Status.State == nvidiacomv1alpha1.DGDRStateProfiling || dgdr.Status.State == nvidiacomv1alpha1.DGDRStateDeploying ||
dgdr.Status.State == nvidiacomv1alpha1.DGDRStateReady || dgdr.Status.State == nvidiacomv1alpha1.DGDRStateDeploymentDeleted {
logger.Info("Spec change detected in immutable state",
"state", dgdr.Status.State,
if dgdr.Status.Phase == nvidiacomv1beta1.DGDRPhaseProfiling || dgdr.Status.Phase == nvidiacomv1beta1.DGDRPhaseDeploying ||
dgdr.Status.Phase == nvidiacomv1beta1.DGDRPhaseReady || dgdr.Status.Phase == nvidiacomv1beta1.DGDRPhaseDeployed {
logger.Info("Spec change detected in immutable phase",
"phase", dgdr.Status.Phase,
"observedGeneration", dgdr.Status.ObservedGeneration,
"currentGeneration", dgdr.Generation)
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonSpecChangeRejected,
fmt.Sprintf(MessageSpecChangeRejected, dgdr.Status.State))
r.Recorder.Event(dgdr, corev1.EventTypeWarning, nvidiacomv1beta1.EventReasonSpecChangeRejected,
fmt.Sprintf(MessageSpecChangeRejected, dgdr.Status.Phase))
// Keep the old observedGeneration to continue rejecting changes
// No state transition - stay in current state with old spec
// No phase transition - stay in current phase with old spec
return ctrl.Result{}, nil
}
}
// State machine: handle different states
switch dgdr.Status.State {
case nvidiacomv1alpha1.DGDRStateInitializing, "":
return r.handleInitialState(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStatePending:
return r.handlePendingState(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStateProfiling:
return r.handleProfilingState(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStateDeploying:
return r.handleDeployingState(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStateReady:
return r.handleReadyState(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStateDeploymentDeleted:
return r.handleDeploymentDeletedState(ctx, dgdr)
case nvidiacomv1alpha1.DGDRStateFailed:
return r.handleFailedState(ctx, dgdr)
// Phase machine: handle different phases
switch dgdr.Status.Phase {
case nvidiacomv1beta1.DGDRPhasePending, "":
return r.handlePendingPhase(ctx, dgdr)
case nvidiacomv1beta1.DGDRPhaseProfiling:
return r.handleProfilingPhase(ctx, dgdr)
case nvidiacomv1beta1.DGDRPhaseDeploying:
return r.handleDeployingPhase(ctx, dgdr)
case nvidiacomv1beta1.DGDRPhaseReady:
return r.handleReadyPhase(ctx, dgdr)
case nvidiacomv1beta1.DGDRPhaseDeployed:
return r.handleDeployedPhase(ctx, dgdr)
case nvidiacomv1beta1.DGDRPhaseFailed:
return r.handleFailedPhase(ctx, dgdr)
default:
logger.Info("Unknown state", "state", dgdr.Status.State)
return r.updateStateAndRequeue(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, MessageInvalidState)
logger.Info("Unknown phase", "phase", dgdr.Status.Phase)
return r.updatePhaseAndRequeue(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, MessageInvalidState)
}
}
// handleInitialState processes newly created DGDR resources
func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
// handlePendingPhase processes newly created or pending DGDR resources.
// When ObservedGeneration == 0, performs initial validation (merged from v1alpha1 Initializing state).
// Otherwise, starts the profiling process.
func (r *DynamoGraphDeploymentRequestReconciler) handlePendingPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx)
logger.Info("Handling initial state", "name", dgdr.Name)
// First-time processing: validate spec (merged from handleInitialState)
if dgdr.Status.ObservedGeneration == 0 {
logger.Info("Handling initial validation", "name", dgdr.Name)
// Validate the spec
if err := r.validateSpec(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonValidationFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeValidation, metav1.ConditionFalse, EventReasonValidationFailed, err.Error())
r.Recorder.Event(dgdr, corev1.EventTypeWarning, nvidiacomv1beta1.EventReasonValidationFailed, err.Error())
return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, nvidiacomv1beta1.ConditionTypeValidation, metav1.ConditionFalse, nvidiacomv1beta1.EventReasonValidationFailed, err.Error())
}
// Set observedGeneration to track the spec we're processing
dgdr.Status.ObservedGeneration = dgdr.Generation
// Populate backend in status from spec for display in kubectl output
dgdr.Status.Backend = dgdr.Spec.Backend
// Initialize status
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonInitialized, MessageInitialized)
return r.updateStateAndRequeue(ctx, dgdr, nvidiacomv1alpha1.DGDRStatePending, MessageInitialized)
}
r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonInitialized, MessageInitialized)
return r.updatePhaseAndRequeue(ctx, dgdr, nvidiacomv1beta1.DGDRPhasePending, MessageInitialized)
}
// handlePendingState starts the profiling process
func (r *DynamoGraphDeploymentRequestReconciler) handlePendingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx)
logger.Info("Handling pending state", "name", dgdr.Name)
logger.Info("Handling pending phase", "name", dgdr.Name)
// Create profiling job (online or AIC)
if err := r.createProfilingJob(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonProfilingJobFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, MessageJobCreationFailed, err.Error())
r.Recorder.Event(dgdr, corev1.EventTypeWarning, nvidiacomv1beta1.EventReasonProfilingJobFailed, err.Error())
return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, nvidiacomv1beta1.ConditionTypeProfiling, metav1.ConditionFalse, MessageJobCreationFailed, err.Error())
}
// Record event with appropriate message
if isOnlineProfiling(dgdr) {
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonProfilingJobCreated, MessageProfilingJobCreated)
r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonProfilingJobCreated, MessageProfilingJobCreated)
} else {
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonProfilingJobCreated, MessageAICProfilingJobCreated)
r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonProfilingJobCreated, MessageAICProfilingJobCreated)
}
// Update to Profiling state with Running status
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateProfiling, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingRunning", MessageProfilingInProgress)
// Update to Profiling phase with Running status
dgdr.SetProfilingPhase(nvidiacomv1beta1.ProfilingPhaseInitializing)
return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseProfiling, nvidiacomv1beta1.ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingRunning", MessageProfilingInProgress)
}
// handleProfilingState monitors profiling progress and generates spec when complete
func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
// handleProfilingPhase monitors profiling progress and generates spec when complete
func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx)
logger.Info("Handling profiling state", "name", dgdr.Name)
logger.Info("Handling profiling phase", "name", dgdr.Name)
// Check profiling job status (both online and offline/AIC run as Jobs)
// Note: We watch the Job via Owns(), so we'll be triggered automatically on Job changes
completed, err := r.checkProfilingJobStatus(ctx, dgdr)
if err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageProfilingCheckFailed, err.Error())
// Job failed - transition to Failed state
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingFailed", err.Error())
// Job failed - clear profiling sub-phase and transition to Failed
dgdr.ClearProfilingPhase()
return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, nvidiacomv1beta1.ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingFailed", err.Error())
}
if !completed {
......@@ -466,9 +411,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
return ctrl.Result{}, nil
}
// Profiling complete — clear the profiling sub-phase
dgdr.ClearProfilingPhase()
// Mark profiling as completed successfully
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeProfiling,
Type: nvidiacomv1beta1.ConditionTypeProfiling,
Status: metav1.ConditionTrue,
ObservedGeneration: dgdr.Generation,
Reason: "ProfilingCompleted",
......@@ -478,18 +426,16 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
// Retrieve profiling results and generate spec
if err := r.generateDGDSpec(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageGenerationFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, err.Error())
return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, nvidiacomv1beta1.ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, err.Error())
}
// Record spec generation event
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonSpecGenerated, MessageSpecGenerated)
r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonSpecGenerated, MessageSpecGenerated)
// Create additional resources (ConfigMaps) immediately after profiling
// This ensures that the `planner-profile-data` ConfigMap is available for both auto and manual deployment
// v1beta1 uses the DGDR namespace for additional resources.
targetNamespace := dgdr.Namespace
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.Namespace != "" {
targetNamespace = dgdr.Spec.DeploymentOverrides.Namespace
}
if err := r.createAdditionalResources(ctx, dgdr, targetNamespace); err != nil {
logger.Error(err, "Failed to create additional resources after profiling")
// Don't fail the DGDR, just log the error - ConfigMaps can be created manually
......@@ -497,31 +443,48 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
fmt.Sprintf("Failed to create ConfigMaps from profiling output: %v", err))
}
// If autoApply is enabled, transition to Deploying state
// If autoApply is enabled, transition to Deploying phase
if dgdr.Spec.AutoApply {
logger.Info("AutoApply enabled, transitioning to Deploying state")
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateDeploying, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecGenerated)
logger.Info("AutoApply enabled, transitioning to Deploying phase")
return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseDeploying, nvidiacomv1beta1.ConditionTypeSpecGenerated, metav1.ConditionTrue, nvidiacomv1beta1.EventReasonSpecGenerated, MessageSpecGenerated)
}
// Otherwise, transition to Ready state
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateReady, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecAvailable)
// Otherwise, transition to Ready phase
return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseReady, nvidiacomv1beta1.ConditionTypeSpecGenerated, metav1.ConditionTrue, nvidiacomv1beta1.EventReasonSpecGenerated, MessageSpecAvailable)
}
// handleReadyState handles DGDR in Ready state
func (r *DynamoGraphDeploymentRequestReconciler) handleReadyState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
// handleReadyPhase handles DGDR in Ready phase (profiling complete, spec available)
func (r *DynamoGraphDeploymentRequestReconciler) handleReadyPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx)
logger.Info("DGDR is ready", "name", dgdr.Name)
// If autoApply is not enabled, nothing to monitor
if !dgdr.Spec.AutoApply {
// Nothing to monitor in Ready phase - spec is available for manual application
return ctrl.Result{}, nil
}
// handleDeployingPhase handles DGD creation and monitors deployment
func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx)
logger.Info("Handling deploying phase", "name", dgdr.Name)
if !dgdr.Spec.AutoApply {
// Shouldn't be in this phase without autoApply
logger.Info("AutoApply not enabled, transitioning to Ready")
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseReady
setSucceededCondition(dgdr, nvidiacomv1beta1.DGDRPhaseReady)
return ctrl.Result{}, r.Status().Update(ctx, dgdr)
}
// Check if DGD still exists and monitor its status
dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
// Check if we need to create DGD
if dgdr.Status.DGDName == "" {
return r.createDGD(ctx, dgdr)
}
// DGD was already created, check its status
dgd := &dgdv1alpha1.DynamoGraphDeployment{}
err := r.Get(ctx, types.NamespacedName{
Name: dgdr.Status.Deployment.Name,
Namespace: dgdr.Status.Deployment.Namespace,
Name: dgdr.Status.DGDName,
Namespace: dgdr.Namespace,
}, dgd)
if apierrors.IsNotFound(err) {
......@@ -533,52 +496,50 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleReadyState(ctx context.Co
return ctrl.Result{}, err
}
// Update deployment status
dgdr.Status.Deployment.State = dgd.Status.State
// Check if DGD is Ready
var condStatus metav1.ConditionStatus
var condReason, condMessage string
// Check if DGD degraded from Ready
if dgd.Status.State != nvidiacomv1alpha1.DGDStateSuccessful {
logger.Info("DGD degraded, transitioning back to Deploying",
"dgdState", dgd.Status.State)
if dgd.Status.State == dgdv1alpha1.DGDStateSuccessful {
logger.Info("DGD is Ready, transitioning to Deployed phase")
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseDeployed
setSucceededCondition(dgdr, nvidiacomv1beta1.DGDRPhaseDeployed)
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateDeploying
r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonDeploymentReady,
fmt.Sprintf(MessageDeploymentReady, dgd.Name))
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDegraded,
fmt.Sprintf(MessageDeploymentDegraded, dgd.Name, string(dgd.Status.State)))
condStatus = metav1.ConditionTrue
condReason = nvidiacomv1beta1.EventReasonDeploymentReady
condMessage = fmt.Sprintf(MessageDeploymentReady, dgd.Name)
} else {
logger.Info("DGD not yet ready", "name", dgd.Name, "state", dgd.Status.State)
condStatus = metav1.ConditionFalse
condReason = "DeploymentInProgress"
condMessage = fmt.Sprintf("DGD %s is in %s state", dgd.Name, string(dgd.Status.State))
}
updateDeploymentInfo(dgdr, dgd)
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeDeploymentReady,
Status: metav1.ConditionFalse,
Reason: EventReasonDeploymentDegraded,
Message: fmt.Sprintf("Deployment degraded to %s", string(dgd.Status.State)),
Type: nvidiacomv1beta1.ConditionTypeDeploymentReady,
Status: condStatus,
Reason: condReason,
Message: condMessage,
})
}
return ctrl.Result{}, r.Status().Update(ctx, dgdr)
}
// handleDeployingState handles DGD creation and monitors deployment
func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
// handleDeployedPhase monitors a healthy DGD and detects degradation or deletion
func (r *DynamoGraphDeploymentRequestReconciler) handleDeployedPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx)
logger.Info("Handling deploying state", "name", dgdr.Name)
logger.Info("DGDR is deployed", "name", dgdr.Name)
if !dgdr.Spec.AutoApply {
// Shouldn't be in this state without autoApply
logger.Info("AutoApply not enabled, transitioning to Ready")
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady
return ctrl.Result{}, r.Status().Update(ctx, dgdr)
}
// Check if we need to create DGD
if dgdr.Status.Deployment == nil || !dgdr.Status.Deployment.Created {
return r.createDGD(ctx, dgdr)
}
// DGD was already created, check its status
dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
// Check if DGD still exists and monitor its status
dgd := &dgdv1alpha1.DynamoGraphDeployment{}
err := r.Get(ctx, types.NamespacedName{
Name: dgdr.Status.Deployment.Name,
Namespace: dgdr.Status.Deployment.Namespace,
Name: dgdr.Status.DGDName,
Namespace: dgdr.Namespace,
}, dgd)
if apierrors.IsNotFound(err) {
......@@ -590,51 +551,54 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx contex
return ctrl.Result{}, err
}
// Update deployment status
dgdr.Status.Deployment.State = dgd.Status.State
// Check if DGD degraded from Ready
if dgd.Status.State != dgdv1alpha1.DGDStateSuccessful {
logger.Info("DGD degraded, transitioning back to Deploying",
"dgdState", dgd.Status.State)
// Check if DGD is Ready
if dgd.Status.State == nvidiacomv1alpha1.DGDStateSuccessful {
logger.Info("DGD is Ready, transitioning to Ready state")
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseDeploying
setSucceededCondition(dgdr, nvidiacomv1beta1.DGDRPhaseDeploying)
updateDeploymentInfo(dgdr, dgd)
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonDeploymentReady,
fmt.Sprintf(MessageDeploymentReady, dgd.Name))
r.Recorder.Event(dgdr, corev1.EventTypeWarning, nvidiacomv1beta1.EventReasonDeploymentDegraded,
fmt.Sprintf(MessageDeploymentDegraded, dgd.Name, string(dgd.Status.State)))
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeDeploymentReady,
Status: metav1.ConditionTrue,
Reason: EventReasonDeploymentReady,
Message: fmt.Sprintf(MessageDeploymentReady, dgd.Name),
Type: nvidiacomv1beta1.ConditionTypeDeploymentReady,
Status: metav1.ConditionFalse,
Reason: nvidiacomv1beta1.EventReasonDeploymentDegraded,
Message: fmt.Sprintf("Deployment degraded to %s", string(dgd.Status.State)),
})
} else {
// DGD is healthy — update replica info only if changed
if !updateDeploymentInfo(dgdr, dgd) {
// Nothing changed, skip the status write
return ctrl.Result{}, nil
}
}
return ctrl.Result{}, r.Status().Update(ctx, dgdr)
}
// handleDeploymentDeletedState handles a DGDR whose auto-created DGD has been
// deleted. This is a terminal state: the handler performs no work and ignores
// both of its arguments.
//
// It always returns an empty ctrl.Result and a nil error.
func (r *DynamoGraphDeploymentRequestReconciler) handleDeploymentDeletedState(_ context.Context, _ *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
	// Terminal state - nothing to do.
	// The user must delete this DGDR and create a new one to redeploy.
	return ctrl.Result{}, nil
}
// handleDGDDeleted handles the case when auto-created DGD is deleted by user
func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
// handleDGDDeleted handles the case when auto-created DGD is deleted by user.
// In v1beta1, this transitions to Failed (DeploymentDeleted phase was removed).
func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx)
logger.Info("DGD was deleted by user, transitioning to DeploymentDeleted state")
logger.Info("DGD was deleted by user, transitioning to Failed phase")
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateDeploymentDeleted
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseFailed
setSucceededCondition(dgdr, nvidiacomv1beta1.DGDRPhaseFailed)
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDeleted,
fmt.Sprintf(MessageDeploymentDeleted, dgdr.Status.Deployment.Name))
r.Recorder.Event(dgdr, corev1.EventTypeWarning, nvidiacomv1beta1.EventReasonDeploymentDeleted,
fmt.Sprintf(MessageDeploymentDeleted, dgdr.Status.DGDName))
dgdr.Status.Deployment = nil
dgdr.Status.DGDName = ""
dgdr.Status.DeploymentInfo = nil
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeDeploymentReady,
Type: nvidiacomv1beta1.ConditionTypeDeploymentReady,
Status: metav1.ConditionFalse,
Reason: EventReasonDeploymentDeleted,
Reason: nvidiacomv1beta1.EventReasonDeploymentDeleted,
Message: "Deployment was deleted by user. Create a new DGDR to redeploy.",
})
......@@ -642,45 +606,24 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Co
}
// createDGD creates a DynamoGraphDeployment with the generated spec
func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx)
// Extract DGD from RawExtension
if dgdr.Status.GeneratedDeployment == nil {
return ctrl.Result{}, fmt.Errorf("generatedDeployment is not set")
// Extract DGD spec from annotation (stored by generateDGDSpec)
dgdSpecYAML, ok := dgdr.Annotations["nvidia.com/generated-dgd-spec"]
if !ok || dgdSpecYAML == "" {
return ctrl.Result{}, fmt.Errorf("generated DGD spec not found in annotation nvidia.com/generated-dgd-spec")
}
generatedDGD := &nvidiacomv1alpha1.DynamoGraphDeployment{}
// RawExtension can have either Object (already decoded) or Raw (JSON bytes)
if dgdr.Status.GeneratedDeployment.Object != nil {
var ok bool
generatedDGD, ok = dgdr.Status.GeneratedDeployment.Object.(*nvidiacomv1alpha1.DynamoGraphDeployment)
if !ok {
return ctrl.Result{}, fmt.Errorf("generatedDeployment.Object is not a DynamoGraphDeployment")
}
} else if dgdr.Status.GeneratedDeployment.Raw != nil {
if err := yaml.Unmarshal(dgdr.Status.GeneratedDeployment.Raw, generatedDGD); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to unmarshal generated deployment: %w", err)
}
} else {
return ctrl.Result{}, fmt.Errorf("generatedDeployment has neither Object nor Raw set")
generatedDGD := &dgdv1alpha1.DynamoGraphDeployment{}
if err := yaml.Unmarshal([]byte(dgdSpecYAML), generatedDGD); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to unmarshal generated deployment from annotation: %w", err)
}
// Determine DGD name and namespace
// Determine DGD name and namespace from generated deployment
dgdName := generatedDGD.Name
dgdNamespace := dgdr.Namespace
// Apply deployment overrides
if dgdr.Spec.DeploymentOverrides != nil {
if dgdr.Spec.DeploymentOverrides.Name != "" {
dgdName = dgdr.Spec.DeploymentOverrides.Name
}
if dgdr.Spec.DeploymentOverrides.Namespace != "" {
dgdNamespace = dgdr.Spec.DeploymentOverrides.Namespace
}
}
// Build labels (start with generated DGD's labels)
labels := make(map[string]string)
if generatedDGD.Labels != nil {
......@@ -689,16 +632,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
}
}
// Add/override with managed labels
labels[LabelDGDRName] = dgdr.Name
labels[LabelDGDRNamespace] = dgdr.Namespace
labels[LabelManagedBy] = LabelValueDynamoOperator
// Merge custom labels from overrides
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.Labels != nil {
for k, v := range dgdr.Spec.DeploymentOverrides.Labels {
labels[k] = v
}
}
labels[nvidiacomv1beta1.LabelDGDRName] = dgdr.Name
labels[nvidiacomv1beta1.LabelDGDRNamespace] = dgdr.Namespace
labels[nvidiacomv1beta1.LabelManagedBy] = nvidiacomv1beta1.LabelValueDynamoOperator
// Build annotations (start with generated DGD's annotations)
annotations := make(map[string]string)
......@@ -707,15 +643,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
annotations[k] = v
}
}
// Merge custom annotations from overrides
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.Annotations != nil {
for k, v := range dgdr.Spec.DeploymentOverrides.Annotations {
annotations[k] = v
}
}
// Create DGD from generated deployment
dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{
dgd := &dgdv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: dgdName,
Namespace: dgdNamespace,
......@@ -735,12 +665,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
if apierrors.IsAlreadyExists(err) {
// DGD already exists, just update status
logger.Info("DGD already exists, updating status")
dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{
Name: dgdName,
Namespace: dgdNamespace,
State: nvidiacomv1alpha1.DGDStatePending,
Created: true,
}
dgdr.Status.DGDName = dgdName
return ctrl.Result{}, r.Status().Update(ctx, dgdr)
}
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageDeploymentCreationFailed, err.Error())
......@@ -748,20 +673,15 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
}
// Update status
dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{
Name: dgdName,
Namespace: dgdNamespace,
State: nvidiacomv1alpha1.DGDStatePending,
Created: true,
}
dgdr.Status.DGDName = dgdName
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonDeploymentCreated,
r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonDeploymentCreated,
fmt.Sprintf(MessageDeploymentCreated, dgdName))
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeDeploymentReady,
Type: nvidiacomv1beta1.ConditionTypeDeploymentReady,
Status: metav1.ConditionFalse,
Reason: EventReasonDeploymentCreated,
Reason: nvidiacomv1beta1.EventReasonDeploymentCreated,
Message: fmt.Sprintf("DGD %s created, waiting for Ready", dgdName),
})
......@@ -771,7 +691,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
}
// createAdditionalResources creates ConfigMaps from the profiling output that should be deployed alongside the DGD
func (r *DynamoGraphDeploymentRequestReconciler) createAdditionalResources(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, targetNamespace string) error {
func (r *DynamoGraphDeploymentRequestReconciler) createAdditionalResources(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, targetNamespace string) error {
logger := log.FromContext(ctx)
// Check if there are additional resources stored in annotations
......@@ -821,9 +741,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createAdditionalResources(ctx c
if cm.Labels == nil {
cm.Labels = make(map[string]string)
}
cm.Labels[LabelDGDRName] = dgdr.Name
cm.Labels[LabelDGDRNamespace] = dgdr.Namespace
cm.Labels[LabelManagedBy] = LabelValueDynamoOperator
cm.Labels[nvidiacomv1beta1.LabelDGDRName] = dgdr.Name
cm.Labels[nvidiacomv1beta1.LabelDGDRNamespace] = dgdr.Namespace
cm.Labels[nvidiacomv1beta1.LabelManagedBy] = nvidiacomv1beta1.LabelValueDynamoOperator
// Create the ConfigMap
if err := r.Create(ctx, cm); err != nil {
......@@ -844,168 +764,89 @@ func (r *DynamoGraphDeploymentRequestReconciler) createAdditionalResources(ctx c
return nil
}
// handleFailedState handles DGDR in Failed state
func (r *DynamoGraphDeploymentRequestReconciler) handleFailedState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
// handleFailedPhase handles DGDR in Failed phase
func (r *DynamoGraphDeploymentRequestReconciler) handleFailedPhase(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
logger := log.FromContext(ctx)
logger.Info("DGDR is in failed state", "name", dgdr.Name)
logger.Info("DGDR is in failed phase", "name", dgdr.Name)
// Could implement retry logic here if desired
return ctrl.Result{}, nil
}
// getProfilingJobName returns the job name for a DGDR
func getProfilingJobName(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) string {
func getProfilingJobName(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) string {
// Use "profile-" prefix for all profiling jobs
return fmt.Sprintf("profile-%s", dgdr.Name)
}
// getOutputConfigMapName returns the ConfigMap name for profiling output
func getOutputConfigMapName(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) string {
func getOutputConfigMapName(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) string {
return fmt.Sprintf("%s%s", ConfigMapOutputPrefix, dgdr.Name)
}
// isOnlineProfiling determines whether online profiling or AI Configurator is being used
// based on the sweep.use_ai_configurator config value
func isOnlineProfiling(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) bool {
if dgdr.Spec.ProfilingConfig.Config == nil {
return true
}
var config map[string]interface{}
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
return true // Default to online on parse error
}
if sweep, ok := config["sweep"].(map[string]interface{}); ok {
// Check camelCase first (preferred), then snake_case (backwards compat)
if useAIC, exists := sweep["useAiConfigurator"].(bool); exists {
return !useAIC
}
if useAIC, exists := sweep["use_ai_configurator"].(bool); exists {
return !useAIC
}
}
// Default to online profiling if not specified
// isOnlineProfiling always reports true; its argument is ignored.
// In v1beta1, the profiler decides online vs AIC mode internally based on its
// config, so the controller always uses the same label.
func isOnlineProfiling(_ *nvidiacomv1beta1.DynamoGraphDeploymentRequest) bool {
	return true
}
// validateSpec validates the DGDR spec
func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
// Validate ConfigMap if provided (for the DGD base config)
// This requires cluster access and cannot be done in the stateless validator
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
cm := &corev1.ConfigMap{}
err := r.Get(ctx, types.NamespacedName{
Name: dgdr.Spec.ProfilingConfig.ConfigMapRef.Name,
Namespace: dgdr.Namespace,
}, cm)
if err != nil {
if apierrors.IsNotFound(err) {
return fmt.Errorf(MessageConfigMapNotFound,
dgdr.Spec.ProfilingConfig.ConfigMapRef.Name, dgdr.Namespace)
}
return err
}
// Validate key exists
key := dgdr.Spec.ProfilingConfig.ConfigMapRef.Key
if key == "" {
key = "disagg.yaml"
}
if _, exists := cm.Data[key]; !exists {
return fmt.Errorf(MessageConfigMapKeyNotFound, key, cm.Name)
}
func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
var errs []error
// Validate image is specified (required for the profiling job container).
// Mirrors the webhook admission check so controller-side writes cannot bypass it.
if dgdr.Spec.Image == "" {
errs = append(errs, fmt.Errorf("spec.image is required"))
}
// Disallow searchStrategy: thorough with backend: auto.
// Mirrors the webhook admission check so controller-side writes cannot bypass it.
if dgdr.Spec.SearchStrategy == nvidiacomv1beta1.SearchStrategyThorough &&
dgdr.Spec.Backend == nvidiacomv1beta1.BackendTypeAuto {
errs = append(errs, fmt.Errorf(
"spec.searchStrategy %q is incompatible with spec.backend %q: set spec.backend to a specific backend (sglang, trtllm, or vllm)",
nvidiacomv1beta1.SearchStrategyThorough,
nvidiacomv1beta1.BackendTypeAuto,
))
}
// Validate model cache PVC if provided
modelCachePVC, _ := extractModelCachePVCConfig(dgdr)
if modelCachePVC != "" {
if dgdr.Spec.ModelCache != nil && dgdr.Spec.ModelCache.PVCName != "" {
pvc := &corev1.PersistentVolumeClaim{}
err := r.Get(ctx, types.NamespacedName{
Name: modelCachePVC,
Name: dgdr.Spec.ModelCache.PVCName,
Namespace: dgdr.Namespace,
}, pvc)
if err != nil {
if apierrors.IsNotFound(err) {
return fmt.Errorf(MessageModelCachePVCNotFound, modelCachePVC, dgdr.Namespace)
}
errs = append(errs, fmt.Errorf(MessageModelCachePVCNotFound, dgdr.Spec.ModelCache.PVCName, dgdr.Namespace))
} else {
return err
}
}
}
if err := r.validateGPUHardwareInfo(ctx, dgdr); err != nil {
return err
errs = append(errs, err)
}
// The profiler will validate the rest of the configuration
return nil
}
// toFloat64 converts a numeric value (int or float64) to float64.
// Returns 0 if the value is neither int nor float64.
func toFloat64(val interface{}) float64 {
switch v := val.(type) {
case float64:
return v
case int:
return float64(v)
default:
return 0
}
return errors.Join(errs...)
}
// validateGPUHardwareInfo ensures GPU hardware information is available when required for profiling
func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx)
// Check for hardware info and GPU ranges
// TODO: will be cleaner once we swap to new DGDR schema (#6130)
var config map[string]interface{}
if dgdr.Spec.ProfilingConfig.Config != nil {
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught later, skip validation here
return nil
}
} else {
config = make(map[string]interface{})
}
hardwareVal, hasHardware := config[ConfigKeyHardware]
var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil {
if hardwareConfig, ok := hardwareVal.(map[string]interface{}); ok {
_, hasGPUModel := hardwareConfig[ConfigKeyGPUModel]
_, hasGPUVram := hardwareConfig[ConfigKeyGPUVramMib]
_, hasNumGPUs := hardwareConfig[ConfigKeyNumGpusPerNode]
hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
}
}
var hasExplicitGPURanges bool
if engineVal, hasEngine := config[ConfigKeyEngine]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]interface{}); ok {
minGPUs, hasMin := engineConfig[ConfigKeyMinNumGpusPerEng]
maxGPUs, hasMax := engineConfig[ConfigKeyMaxNumGpusPerEng]
if hasMin && hasMax {
minVal := toFloat64(minGPUs)
maxVal := toFloat64(maxGPUs)
// Check if user provided hardware info in the typed spec
hasManualConfig := dgdr.Spec.Hardware != nil && (dgdr.Spec.Hardware.GPUSKU != "" ||
dgdr.Spec.Hardware.VRAMMB != nil ||
dgdr.Spec.Hardware.NumGPUsPerNode != nil)
// Validate that min <= max
if minVal > maxVal {
return fmt.Errorf("invalid GPU range: %s (%v) cannot be greater than %s (%v)",
ConfigKeyMinNumGpusPerEng, minVal, ConfigKeyMaxNumGpusPerEng, maxVal)
}
hasExplicitGPURanges = minVal > 0 && maxVal > 0
}
}
}
// If manual config or explicit ranges are provided, validation passes
if hasManualHardwareConfig || hasExplicitGPURanges {
// If manual config is provided, validation passes
if hasManualConfig {
return nil
}
......@@ -1019,37 +860,22 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
isNamespaceScoped := r.Config.Namespace.Restricted != ""
if isNamespaceScoped {
tmpl := template.Must(template.New("nsGPUErr").Parse(
`GPU hardware info required but cannot be auto-discovered.` +
return fmt.Errorf(
"GPU hardware info required but cannot be auto-discovered." +
"\n\nOptions to resolve:" +
"\n\n1. Re-enable GPU discovery (if it was disabled during Helm install):" +
"\n helm upgrade ... --set dynamo-operator.gpuDiscovery.enabled=true" +
"\n\n2. Add hardware config to profilingConfig.config.{{.Hardware}}:" +
"\n {{.NumGPUs}}: 8" +
"\n {{.GPUModel}}: \"H100-SXM5-80GB\"" +
"\n {{.GPUVram}}: 81920" +
"\n\n3. Or specify {{.Engine}}.{{.MinGPUs}} and {{.Engine}}.{{.MaxGPUs}} for explicit GPU search ranges.",
))
var buf bytes.Buffer
_ = tmpl.Execute(&buf, map[string]string{
"Hardware": ConfigKeyHardware,
"NumGPUs": ConfigKeyNumGpusPerNode,
"GPUModel": ConfigKeyGPUModel,
"GPUVram": ConfigKeyGPUVramMib,
"Engine": ConfigKeyEngine,
"MinGPUs": ConfigKeyMinNumGpusPerEng,
"MaxGPUs": ConfigKeyMaxNumGpusPerEng,
})
return fmt.Errorf("%s", buf.String())
"\n\n2. Add hardware config to spec.hardware:" +
"\n numGpusPerNode: 8" +
"\n gpuSku: \"H100-SXM5-80GB\"" +
"\n vramMb: 81920")
}
return fmt.Errorf("GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s",
ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib,
ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng)
return fmt.Errorf("GPU hardware info required but auto-discovery failed. Add spec.hardware.gpuSku, spec.hardware.vramMb, spec.hardware.numGpusPerNode")
}
// createProfilingJob creates a Kubernetes Job for profiling using SyncResource
func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx)
// Delete any existing output ConfigMap to ensure fresh profiling results
......@@ -1087,21 +913,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}
}
// Run GPU discovery before creating job (cluster-wide and namespace-restricted operators if they have node read permissions)
var gpuInfo *gpu.GPUInfo
logger.Info("Attempting GPU discovery for profiling job")
discoveredInfo, err := gpu.DiscoverGPUs(ctx, r.Client)
if err != nil {
// This path is expected for namespace-restricted operators without node read permissions
logger.Info("GPU discovery not available, using manual hardware configuration from profiling config",
"reason", err.Error())
} else {
gpuInfo = discoveredInfo
logger.Info("GPU discovery completed successfully",
"gpusPerNode", gpuInfo.GPUsPerNode,
"model", gpuInfo.Model,
"vramMiB", gpuInfo.VRAMPerGPU,
"system", gpuInfo.System)
// Enrich hardware from GPU discovery before marshalling the spec.
// This fills in gpuSku, vramMb, numGpusPerNode if the user didn't set them.
if err := r.enrichHardwareFromDiscovery(ctx, dgdr); err != nil {
logger.Info("GPU discovery not available, proceeding without enrichment", "reason", err.Error())
}
// Use SyncResource to create/update the job
......@@ -1109,7 +924,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
jobName := getProfilingJobName(dgdr)
outputConfigMapName := getOutputConfigMapName(dgdr)
configYAML, err := r.prepareProfilingConfig(dgdr, gpuInfo)
// Marshal the DGDR spec to JSON — the profiler receives the spec verbatim
specJSON, err := marshalDGDRSpec(dgdr)
if err != nil {
return nil, false, err
}
......@@ -1158,16 +974,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
},
}
// Add ConfigMap volume mount if provided
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
volumeMounts = append(volumeMounts, corev1.VolumeMount{
Name: VolumeNameProfilingConfig,
MountPath: ProfilingConfigPath,
ReadOnly: true,
})
}
// Add model cache PVC mount if configured in profilingConfig.config.deployment
// Add model cache PVC mount if configured
modelCachePVC, modelCacheMountPath := extractModelCachePVCConfig(dgdr)
if modelCachePVC != "" {
logger.Info("Mounting model cache PVC to profiler pod", "pvc", modelCachePVC, "mountPath", modelCacheMountPath)
......@@ -1178,29 +985,32 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
})
}
// Profiler args: pass the config as an inline YAML string via --profile-config
profilerArgs := []string{
"--profile-config", string(configYAML),
// v1alpha1 round-trip: mount ConfigMap if referenced via annotation
cmRef := configMapRefFromAnnotation(dgdr)
if cmRef != nil {
volumeMounts = append(volumeMounts, corev1.VolumeMount{
Name: VolumeNameProfilingConfig,
MountPath: ProfilingConfigMountPath,
ReadOnly: true,
})
}
// Use profiler image from profilingConfig
imageName := dgdr.Spec.ProfilingConfig.ProfilerImage
// Profiler args: pass the DGDR spec as JSON via --config
profilerArgs := []string{"--config", specJSON}
// Use image from spec
imageName := dgdr.Spec.Image
logger.Info("Using profiler image", "image", imageName)
profilerContainer := corev1.Container{
Name: ContainerNameProfiler,
Image: imageName,
Command: []string{"python", "-m", "dynamo.profiler.profile_sla"},
Command: []string{"python", "-m", "dynamo.profiler"},
Args: profilerArgs,
Env: profilerEnv,
VolumeMounts: volumeMounts,
}
// Apply resource requirements if specified in the DGDR
if dgdr.Spec.ProfilingConfig.Resources != nil {
profilerContainer.Resources = *dgdr.Spec.ProfilingConfig.Resources
}
// Generate sidecar script from template
tmpl, err := template.New("sidecar").Parse(sidecarScriptTemplate)
if err != nil {
......@@ -1232,15 +1042,16 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}},
}
// Use PVC if specified, otherwise use emptyDir for profiling output
// Use PVC for profiling output if round-tripped v1alpha1 annotation is present,
// otherwise use emptyDir (v1beta1 default).
var profilingOutputVolume corev1.Volume
if dgdr.Spec.ProfilingConfig.OutputPVC != "" {
logger.Info("Using PVC for profiling output", "pvc", dgdr.Spec.ProfilingConfig.OutputPVC)
if outputPVC := outputPVCFromAnnotation(dgdr); outputPVC != "" {
logger.Info("Using PVC for profiling output (from v1alpha1 annotation)", "pvc", outputPVC)
profilingOutputVolume = corev1.Volume{
Name: VolumeNameProfilingOutput,
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: dgdr.Spec.ProfilingConfig.OutputPVC,
ClaimName: outputPVC,
},
},
}
......@@ -1254,59 +1065,52 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}
volumes := []corev1.Volume{profilingOutputVolume}
// Add ConfigMap volume if provided
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
key := dgdr.Spec.ProfilingConfig.ConfigMapRef.Key
if key == "" {
key = ProfilingConfigFile
// Add model cache PVC volume if configured
if modelCachePVC != "" {
volumes = append(volumes, corev1.Volume{
Name: VolumeNameModelCache,
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: modelCachePVC,
ReadOnly: true,
},
},
})
}
// v1alpha1 round-trip: add ConfigMap volume if referenced via annotation
if cmRef != nil {
cmKey := cmRef.Key
if cmKey == "" {
cmKey = ProfilingConfigDefaultKey
}
volumes = append(volumes, corev1.Volume{
Name: VolumeNameProfilingConfig,
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: dgdr.Spec.ProfilingConfig.ConfigMapRef.Name,
Name: cmRef.Name,
},
Items: []corev1.KeyToPath{{
Key: key,
Path: ProfilingConfigFile,
Key: cmKey,
Path: ProfilingConfigDefaultKey,
}},
},
},
})
}
// Add model cache PVC volume if configured
if modelCachePVC != "" {
volumes = append(volumes, corev1.Volume{
Name: VolumeNameModelCache,
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: modelCachePVC,
ReadOnly: true,
},
},
})
}
// Limit retries to prevent infinite loop
backoffLimit := int32(3)
// Determine label based on whether AI Configurator is used
labelValue := LabelValueDynamoProfiler
if !isOnlineProfiling(dgdr) {
labelValue = LabelValueAICProfiler
}
podSpec := corev1.PodSpec{
ServiceAccountName: ServiceAccountProfilingJob,
RestartPolicy: corev1.RestartPolicyNever,
SecurityContext: &corev1.PodSecurityContext{
RunAsNonRoot: ptr.To(true), // Enforces that container cannot run as root
RunAsUser: ptr.To[int64](1000), // Run as UID 1000 (non-privileged user)
RunAsGroup: ptr.To[int64](1000), // Run with GID 1000 (non-privileged group)
FSGroup: ptr.To[int64](1000), // Volume files owned by GID 1000
RunAsNonRoot: ptr.To(true),
RunAsUser: ptr.To[int64](1000),
RunAsGroup: ptr.To[int64](1000),
FSGroup: ptr.To[int64](1000),
},
Containers: []corev1.Container{profilerContainer, sidecarContainer},
Volumes: volumes,
......@@ -1315,24 +1119,14 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
},
}
// Apply tolerations if specified in the DGDR
if len(dgdr.Spec.ProfilingConfig.Tolerations) > 0 {
podSpec.Tolerations = dgdr.Spec.ProfilingConfig.Tolerations
}
// Apply nodeSelector if specified in the DGDR
if len(dgdr.Spec.ProfilingConfig.NodeSelector) > 0 {
podSpec.NodeSelector = dgdr.Spec.ProfilingConfig.NodeSelector
}
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: jobName,
Namespace: dgdr.Namespace,
Labels: map[string]string{
LabelApp: labelValue,
LabelDGDR: dgdr.Name,
LabelManagedBy: LabelValueDynamoOperator,
nvidiacomv1beta1.LabelApp: nvidiacomv1beta1.LabelValueDynamoProfiler,
nvidiacomv1beta1.LabelDGDR: dgdr.Name,
nvidiacomv1beta1.LabelManagedBy: nvidiacomv1beta1.LabelValueDynamoOperator,
},
},
Spec: batchv1.JobSpec{
......@@ -1343,6 +1137,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
},
}
// Apply overrides from spec.overrides.profilingJob if provided
applyProfilingJobOverrides(job, dgdr)
return job, false, nil
})
......@@ -1354,144 +1151,149 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
logger.Info("Profiling job created/updated", "job", job.Name)
}
// Store the job name in status for observability
dgdr.Status.ProfilingJobName = job.Name
return nil
}
// prepareProfilingConfig parses and modifies the profiling config
func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, gpuInfo *gpu.GPUInfo) ([]byte, error) {
// Parse the profiling config from JSON
var config map[string]interface{}
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
return nil, fmt.Errorf("failed to parse profiling config: %w", err)
// applyProfilingJobOverrides applies user-specified overrides from
// spec.overrides.profilingJob to both the pod spec and job spec.
func applyProfilingJobOverrides(job *batchv1.Job, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) {
if dgdr.Spec.Overrides == nil || dgdr.Spec.Overrides.ProfilingJob == nil {
return
}
// Set deployment.namespace if not already set
deploymentVal, hasDeployment := config[ConfigKeyDeployment]
var deploymentConfig map[string]interface{}
if !hasDeployment || deploymentVal == nil {
deploymentConfig = make(map[string]interface{})
config[ConfigKeyDeployment] = deploymentConfig
} else {
var ok bool
deploymentConfig, ok = deploymentVal.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("profilingConfig.config.%s must be an object, got %T", ConfigKeyDeployment, deploymentVal)
overrides := dgdr.Spec.Overrides.ProfilingJob
podSpec := &job.Spec.Template.Spec
// Apply pod-level overrides
overridePS := overrides.Template.Spec
if len(overridePS.Containers) > 0 {
podSpec.Containers[0].Resources = overridePS.Containers[0].Resources
}
if len(overridePS.Tolerations) > 0 {
podSpec.Tolerations = overridePS.Tolerations
}
if _, hasNamespace := deploymentConfig[ConfigKeyNamespace]; !hasNamespace {
deploymentConfig[ConfigKeyNamespace] = dgdr.Namespace
if len(overridePS.NodeSelector) > 0 {
podSpec.NodeSelector = overridePS.NodeSelector
}
// Set deployment.model from spec.model
deploymentConfig[ConfigKeyModel] = dgdr.Spec.Model
// Set deployment.dgd_image from deploymentOverrides.workersImage if provided
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.WorkersImage != "" {
deploymentConfig[ConfigKeyDGDImage] = dgdr.Spec.DeploymentOverrides.WorkersImage
if len(overridePS.ImagePullSecrets) > 0 {
// Merge override secrets with existing ones (deduplicate by name)
seen := make(map[string]bool)
for _, s := range podSpec.ImagePullSecrets {
seen[s.Name] = true
}
// Set output_dir if not already set
if _, hasOutputDir := config[ConfigKeyOutputDir]; !hasOutputDir {
config[ConfigKeyOutputDir] = ProfilingOutputPath
for _, s := range overridePS.ImagePullSecrets {
if !seen[s.Name] {
podSpec.ImagePullSecrets = append(podSpec.ImagePullSecrets, s)
seen[s.Name] = true
}
// Set engine.backend from spec.backend
engineVal, hasEngine := config[ConfigKeyEngine]
var engineConfig map[string]interface{}
if !hasEngine || engineVal == nil {
engineConfig = make(map[string]interface{})
config[ConfigKeyEngine] = engineConfig
} else {
var ok bool
engineConfig, ok = engineVal.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("profilingConfig.config.%s must be an object, got %T", ConfigKeyEngine, engineVal)
}
}
engineConfig[ConfigKeyBackend] = dgdr.Spec.Backend
// If ConfigMapRef is provided, set engine.config path
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
engineConfig[ConfigKeyConfig] = fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile)
if overridePS.ServiceAccountName != "" {
podSpec.ServiceAccountName = overridePS.ServiceAccountName
}
// User-specified values take precedence over auto-discovered values
if gpuInfo != nil {
hardwareVal, hasHardware := config["hardware"]
var hardwareConfig map[string]interface{}
if !hasHardware || hardwareVal == nil {
hardwareConfig = make(map[string]interface{})
config["hardware"] = hardwareConfig
} else {
var ok bool
hardwareConfig, ok = hardwareVal.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("profilingConfig.config.hardware must be an object, got %T", hardwareVal)
}
// Apply job-level overrides
if overrides.BackoffLimit != nil {
job.Spec.BackoffLimit = overrides.BackoffLimit
}
}
if _, hasNumGpus := hardwareConfig[ConfigKeyNumGpusPerNode]; !hasNumGpus {
hardwareConfig[ConfigKeyNumGpusPerNode] = gpuInfo.GPUsPerNode
}
if _, hasGpuModel := hardwareConfig[ConfigKeyGPUModel]; !hasGpuModel {
hardwareConfig[ConfigKeyGPUModel] = gpuInfo.Model
}
if _, hasGpuVram := hardwareConfig[ConfigKeyGPUVramMib]; !hasGpuVram {
hardwareConfig[ConfigKeyGPUVramMib] = gpuInfo.VRAMPerGPU
}
if gpuInfo.System != "" {
if _, hasSystem := hardwareConfig[ConfigKeySystem]; !hasSystem {
hardwareConfig[ConfigKeySystem] = gpuInfo.System
// marshalDGDRSpec JSON-encodes dgdr.Spec and returns the encoding as a string.
// The result is the value handed to the profiler through its --config argument;
// the spec is serialized as-is, with no key remapping.
//
// On encoding failure it returns an empty string and a wrapped error.
func marshalDGDRSpec(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (string, error) {
	encoded, marshalErr := json.Marshal(dgdr.Spec)
	if marshalErr != nil {
		return "", fmt.Errorf("failed to marshal DGDR spec to JSON: %w", marshalErr)
	}

	return string(encoded), nil
}
// enrichHardwareFromDiscovery fills in hardware fields that the user didn't set.
// Called before marshalDGDRSpec(). Mutates dgdr.Spec.Hardware in-place (memory only, not persisted).
func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
if dgdr.Spec.Hardware == nil {
dgdr.Spec.Hardware = &nvidiacomv1beta1.HardwareSpec{}
}
hw := dgdr.Spec.Hardware
if hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil {
return nil // all fields already set by user
}
// Serialize config to YAML for passing to profiler
configYAML, err := sigsyaml.Marshal(config)
gpuInfo, err := gpu.DiscoverGPUs(ctx, r.Client)
if err != nil {
return nil, fmt.Errorf("failed to marshal profiling config to YAML: %w", err)
return err
}
return configYAML, nil
}
logger := log.FromContext(ctx)
logger.Info("GPU discovery completed successfully",
"gpusPerNode", gpuInfo.GPUsPerNode,
"model", gpuInfo.Model,
"vramMiB", gpuInfo.VRAMPerGPU)
// extractModelCachePVCConfig extracts model cache PVC settings from the profiling config.
// Returns (pvcName, mountPath) - both empty if not configured.
func extractModelCachePVCConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (string, string) {
if dgdr.Spec.ProfilingConfig.Config == nil {
return "", ""
if hw.GPUSKU == "" {
hw.GPUSKU = gpuInfo.Model
}
var config map[string]interface{}
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
return "", ""
if hw.VRAMMB == nil {
vram := float64(gpuInfo.VRAMPerGPU)
hw.VRAMMB = &vram
}
deployment, ok := config[ConfigKeyDeployment].(map[string]interface{})
if !ok {
return "", ""
}
modelCache, ok := deployment[ConfigKeyModelCache].(map[string]interface{})
if !ok {
return "", ""
if hw.NumGPUsPerNode == nil {
n := int32(gpuInfo.GPUsPerNode)
hw.NumGPUsPerNode = &n
}
return nil
}
pvcName, _ := modelCache[ConfigKeyPVCName].(string)
if pvcName == "" {
// extractModelCachePVCConfig reads model cache PVC settings from the typed v1beta1 spec.
// Returns (pvcName, mountPath) — both empty if not configured.
func extractModelCachePVCConfig(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (string, string) {
if dgdr.Spec.ModelCache == nil || dgdr.Spec.ModelCache.PVCName == "" {
return "", ""
}
mountPath, _ := modelCache[ConfigKeyMountPath].(string)
mountPath := dgdr.Spec.ModelCache.PVCMountPath
if mountPath == "" {
mountPath = DefaultModelCacheMountPath
}
return dgdr.Spec.ModelCache.PVCName, mountPath
}
return pvcName, mountPath
// configMapKeySelector mirrors v1alpha1.ConfigMapKeySelector for annotation deserialization.
// It is the JSON shape decoded from the ConfigMap-reference round-trip annotation.
type configMapKeySelector struct {
	// Name is the name of the referenced ConfigMap.
	Name string `json:"name"`
	// Key is the entry within the ConfigMap; it is optional and omitted from
	// JSON when empty.
	Key string `json:"key,omitempty"`
}
// configMapRefFromAnnotation decodes the JSON-encoded ConfigMap reference stored
// under the AnnotationConfigMapRef round-trip annotation.
//
// It returns nil when the annotation is missing, empty, or cannot be parsed as
// JSON. Native v1beta1 resources carry no such annotation, so they yield nil.
func configMapRefFromAnnotation(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) *configMapKeySelector {
	// Indexing a nil map yields the zero value, so a DGDR without any
	// annotations is covered by the same emptiness check.
	encoded := dgdr.Annotations[AnnotationConfigMapRef]
	if encoded == "" {
		return nil
	}

	selector := new(configMapKeySelector)
	if decodeErr := json.Unmarshal([]byte(encoded), selector); decodeErr != nil {
		// A malformed annotation is treated the same as an absent one.
		return nil
	}
	return selector
}
// outputPVCFromAnnotation returns the output PVC name stored under the
// AnnotationOutputPVC round-trip annotation, or "" when the annotation (or the
// whole annotations map) is absent. Native v1beta1 resources have no such
// annotation and therefore always get "" (emptyDir is used for output).
func outputPVCFromAnnotation(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) string {
	// A lookup on a nil map is safe and simply reports the key as missing.
	if pvcName, present := dgdr.Annotations[AnnotationOutputPVC]; present {
		return pvcName
	}
	return ""
}
// checkProfilingJobStatus checks if the profiling job has completed
func (r *DynamoGraphDeploymentRequestReconciler) checkProfilingJobStatus(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (bool, error) {
func (r *DynamoGraphDeploymentRequestReconciler) checkProfilingJobStatus(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (bool, error) {
logger := log.FromContext(ctx)
jobName := getProfilingJobName(dgdr)
......@@ -1520,7 +1322,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) checkProfilingJobStatus(ctx con
}
// getProfilingJobErrorDetails retrieves detailed error information from failed profiling job pods
func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, job *batchv1.Job) string {
func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, job *batchv1.Job) string {
logger := log.FromContext(ctx)
// List pods owned by this job
......@@ -1570,7 +1372,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx
}
// generateDGDSpec generates DGD spec from profiling results (online or offline/AIC)
func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx)
logger.Info("Generating DGD spec from profiling results", "name", dgdr.Name, "backend", dgdr.Spec.Backend)
......@@ -1589,10 +1391,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
return fmt.Errorf("failed to get output ConfigMap: %w", err)
}
// Select the right config file based on useMocker flag
// Select the right config file based on mocker feature flag
// Profiler always generates both real and mocker configs
var outputFile string
if dgdr.Spec.UseMocker {
if dgdr.Spec.Features != nil && dgdr.Spec.Features.Mocker != nil && dgdr.Spec.Features.Mocker.Enabled {
outputFile = ProfilingOutputFileMocker
logger.Info("Using mocker deployment config")
} else {
......@@ -1627,18 +1429,46 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
}
}
// Store the generated DGD in status
dgdr.Status.GeneratedDeployment = &runtime.RawExtension{
Object: dgd,
// Store the generated DGD name in status and cache the spec in an annotation for createDGD
dgdr.Status.DGDName = dgd.Name
// Store the generated DGD in ProfilingResults.SelectedConfig for status visibility
dgdJSON, err := json.Marshal(dgd)
if err != nil {
return fmt.Errorf("failed to marshal generated DGD to JSON: %w", err)
}
if dgdr.Status.ProfilingResults == nil {
dgdr.Status.ProfilingResults = &nvidiacomv1beta1.ProfilingResultsStatus{}
}
dgdr.Status.ProfilingResults.SelectedConfig = &runtime.RawExtension{Raw: dgdJSON}
// Serialize the DGD spec to an annotation so createDGD can retrieve it
dgdBytes, err := sigsyaml.Marshal(dgd)
if err != nil {
return fmt.Errorf("failed to marshal generated DGD: %w", err)
}
if dgdr.Annotations == nil {
dgdr.Annotations = make(map[string]string)
}
dgdr.Annotations["nvidia.com/generated-dgd-spec"] = string(dgdBytes)
// Update the object (annotations are on the object, not status)
if err := r.Update(ctx, dgdr); err != nil {
return fmt.Errorf("failed to update DGDR with generated DGD annotation: %w", err)
}
// Refetch the DGDR after the annotation update to get the latest resourceVersion
// and avoid conflicts with concurrent modifications before updating status.
if err := r.Get(ctx, types.NamespacedName{Name: dgdr.Name, Namespace: dgdr.Namespace}, dgdr); err != nil {
return fmt.Errorf("failed to refetch DGDR after annotation update: %w", err)
}
dgdr.Status.ProfilingResults = fmt.Sprintf("configmap/%s", outputConfigMapName)
return r.Status().Update(ctx, dgdr)
}
// storeAdditionalResources marshals additional resources to YAML and stores them in DGDR annotations.
// Validates annotation size and fails gracefully if too large.
func (r *DynamoGraphDeploymentRequestReconciler) storeAdditionalResources(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, resources []*unstructured.Unstructured) error {
func (r *DynamoGraphDeploymentRequestReconciler) storeAdditionalResources(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, resources []*unstructured.Unstructured) error {
if len(resources) == 0 {
return nil
}
......@@ -1673,10 +1503,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) storeAdditionalResources(ctx co
// extractResourcesFromYAML parses multi-document YAML from profiling output,
// extracting the DynamoGraphDeployment and any ConfigMaps that should be deployed with it.
func (r *DynamoGraphDeploymentRequestReconciler) extractResourcesFromYAML(yamlContent []byte) (*nvidiacomv1alpha1.DynamoGraphDeployment, []*unstructured.Unstructured, error) {
func (r *DynamoGraphDeploymentRequestReconciler) extractResourcesFromYAML(yamlContent []byte) (*dgdv1alpha1.DynamoGraphDeployment, []*unstructured.Unstructured, error) {
decoder := yaml.NewYAMLOrJSONDecoder(bytes.NewReader(yamlContent), 4096)
var dgd *nvidiacomv1alpha1.DynamoGraphDeployment
var dgd *dgdv1alpha1.DynamoGraphDeployment
var additionalResources []*unstructured.Unstructured
for {
......@@ -1695,7 +1525,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) extractResourcesFromYAML(yamlCo
}
if obj.GetKind() == "DynamoGraphDeployment" {
dgd = &nvidiacomv1alpha1.DynamoGraphDeployment{}
dgd = &dgdv1alpha1.DynamoGraphDeployment{}
if err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, dgd); err != nil {
return nil, nil, fmt.Errorf("failed to convert to DynamoGraphDeployment: %w", err)
}
......@@ -1713,31 +1543,90 @@ func (r *DynamoGraphDeploymentRequestReconciler) extractResourcesFromYAML(yamlCo
}
// extractDGDFromYAML is a convenience wrapper that extracts only the DGD (used by tests)
func (r *DynamoGraphDeploymentRequestReconciler) extractDGDFromYAML(yamlContent []byte) (*nvidiacomv1alpha1.DynamoGraphDeployment, error) {
func (r *DynamoGraphDeploymentRequestReconciler) extractDGDFromYAML(yamlContent []byte) (*dgdv1alpha1.DynamoGraphDeployment, error) {
dgd, _, err := r.extractResourcesFromYAML(yamlContent)
return dgd, err
}
// updateStateAndRequeue updates the DGDR state and requeues
func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, state nvidiacomv1alpha1.DGDRState, _ string) (ctrl.Result, error) {
dgdr.Status.State = state
// updateDeploymentInfo refreshes status.deploymentInfo on the DGDR with replica
// totals summed over every service listed in the DGD status.
//
// Services whose AvailableReplicas pointer is nil contribute nothing to the
// available total. The function reports whether it wrote a new DeploymentInfo:
// it returns false, leaving the status untouched, when the existing entry
// already holds both totals, and true otherwise.
func updateDeploymentInfo(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, dgd *dgdv1alpha1.DynamoGraphDeployment) bool {
	var desired, available int32
	for _, service := range dgd.Status.Services {
		desired += service.Replicas
		if service.AvailableReplicas == nil {
			continue
		}
		available += *service.AvailableReplicas
	}

	// Skip the write when the recorded totals already match the fresh ones.
	if existing := dgdr.Status.DeploymentInfo; existing != nil {
		sameReplicas := existing.Replicas != nil && *existing.Replicas == desired
		sameAvailable := existing.AvailableReplicas != nil && *existing.AvailableReplicas == available
		if sameReplicas && sameAvailable {
			return false
		}
	}

	info := &nvidiacomv1beta1.DeploymentInfoStatus{}
	info.Replicas = &desired
	info.AvailableReplicas = &available
	dgdr.Status.DeploymentInfo = info
	return true
}
// setSucceededCondition writes the aggregate Succeeded condition that
// corresponds to the given phase into dgdr.Status.Conditions.
//
// The condition is True only for the Ready and Deployed phases; every other
// phase, including the empty phase and unrecognised values, yields False. The
// condition's ObservedGeneration is taken from dgdr.Generation.
func setSucceededCondition(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, phase nvidiacomv1beta1.DGDRPhase) {
	// Start from False and flip to True only for the two successful phases.
	condStatus := metav1.ConditionFalse
	var condReason, condMessage string

	switch phase {
	case "", nvidiacomv1beta1.DGDRPhasePending:
		condReason, condMessage = "Pending", "DGDR is pending"
	case nvidiacomv1beta1.DGDRPhaseProfiling:
		condReason, condMessage = "Profiling", "Profiling is in progress"
	case nvidiacomv1beta1.DGDRPhaseReady:
		condStatus = metav1.ConditionTrue
		condReason, condMessage = "SpecGenerated", "Profiling complete, spec available"
	case nvidiacomv1beta1.DGDRPhaseDeploying:
		condReason, condMessage = "Deploying", "Deployment is in progress"
	case nvidiacomv1beta1.DGDRPhaseDeployed:
		condStatus = metav1.ConditionTrue
		condReason, condMessage = "Deployed", "Deployment is healthy"
	case nvidiacomv1beta1.DGDRPhaseFailed:
		condReason, condMessage = "Failed", "DGDR has failed"
	default:
		condReason, condMessage = "Unknown", "Unknown phase"
	}

	succeeded := metav1.Condition{
		Type:               nvidiacomv1beta1.ConditionTypeSucceeded,
		Status:             condStatus,
		ObservedGeneration: dgdr.Generation,
		Reason:             condReason,
		Message:            condMessage,
	}
	meta.SetStatusCondition(&dgdr.Status.Conditions, succeeded)
}
// updatePhaseAndRequeue records the given phase on the DGDR status, refreshes
// the aggregate Succeeded condition to match it, persists the status through
// the status client and requests a requeue.
//
// The message argument is only written to the log. If the status update fails,
// the error is returned unchanged together with an empty result.
func (r *DynamoGraphDeploymentRequestReconciler) updatePhaseAndRequeue(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest, phase nvidiacomv1beta1.DGDRPhase, message string) (ctrl.Result, error) {
	log.FromContext(ctx).Info("Updating DGDR phase", "name", dgdr.Name, "phase", phase, "message", message)

	dgdr.Status.Phase = phase
	setSucceededCondition(dgdr, phase)

	err := r.Status().Update(ctx, dgdr)
	if err != nil {
		return ctrl.Result{}, err
	}
	return ctrl.Result{Requeue: true}, nil
}
// updateStateWithCondition updates state and adds/updates a condition
func (r *DynamoGraphDeploymentRequestReconciler) updateStateWithCondition(
// updatePhaseWithCondition updates phase and adds/updates a condition
func (r *DynamoGraphDeploymentRequestReconciler) updatePhaseWithCondition(
ctx context.Context,
dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest,
state nvidiacomv1alpha1.DGDRState,
dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest,
phase nvidiacomv1beta1.DGDRPhase,
conditionType string,
status metav1.ConditionStatus,
reason string,
message string,
) (ctrl.Result, error) {
dgdr.Status.State = state
dgdr.Status.Phase = phase
setSucceededCondition(dgdr, phase)
condition := metav1.Condition{
Type: conditionType,
......@@ -1760,7 +1649,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) updateStateWithCondition(
// SetupWithManager sets up the controller with the Manager
func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
For(&nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}).
For(&nvidiacomv1beta1.DynamoGraphDeploymentRequest{}).
Named(consts.ResourceTypeDynamoGraphDeploymentRequest).
Owns(&batchv1.Job{}, builder.WithPredicates(predicate.Funcs{
// ignore creation cause we don't want to be called again after we create the job
......@@ -1770,12 +1659,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag
GenericFunc: func(ge event.GenericEvent) bool { return true },
})). // Watch Jobs created by this controller (via ownerReference)
Watches(
&nvidiacomv1alpha1.DynamoGraphDeployment{},
&dgdv1alpha1.DynamoGraphDeployment{},
handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []ctrl.Request {
// Find DGDR by label instead of owner reference
dgd := obj.(*nvidiacomv1alpha1.DynamoGraphDeployment)
dgdrName, hasName := dgd.Labels[LabelDGDRName]
dgdrNamespace, hasNamespace := dgd.Labels[LabelDGDRNamespace]
dgd := obj.(*dgdv1alpha1.DynamoGraphDeployment)
dgdrName, hasName := dgd.Labels[nvidiacomv1beta1.LabelDGDRName]
dgdrNamespace, hasNamespace := dgd.Labels[nvidiacomv1beta1.LabelDGDRNamespace]
if !hasName || !hasNamespace {
return nil
}
......
......@@ -19,23 +19,22 @@ package controller
import (
"context"
"encoding/json"
"time"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
dgdv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/yaml"
)
const (
......@@ -54,23 +53,6 @@ func (m *MockRBACManager) EnsureServiceAccountWithRBAC(ctx context.Context, targ
return nil
}
// createTestConfig serializes a profiling config map into the raw JSON wrapper
// used by tests.
//
// When the map has no "hardware" entry, a default hardware section is inserted
// into the caller's map (the argument is modified in place) so that the
// resulting config satisfies validation. The function panics if the map cannot
// be marshalled to JSON.
func createTestConfig(config map[string]interface{}) *apiextensionsv1.JSON {
	defaultHardware := map[string]interface{}{
		"numGpusPerNode": 8,
		"gpuModel":       "H100-SXM5-80GB",
		"gpuVramMib":     81920,
	}
	// Only fill in the defaults when the caller did not supply a hardware section.
	if _, present := config["hardware"]; !present {
		config["hardware"] = defaultHardware
	}

	raw, marshalErr := json.Marshal(config)
	if marshalErr != nil {
		panic(marshalErr)
	}
	return &apiextensionsv1.JSON{Raw: raw}
}
var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
const (
timeout = time.Second * 10
......@@ -106,27 +88,23 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
dgdrName := "test-dgdr-initial"
namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"config": "/tmp/test-config.yaml",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -144,14 +122,14 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(err).NotTo(HaveOccurred())
// Check status
Eventually(func() nvidiacomv1alpha1.DGDRState {
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
Eventually(func() nvidiacomv1beta1.DGDRPhase {
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
return updated.Status.State
}, timeout, interval).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
return updated.Status.Phase
}, timeout, interval).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
// Verify observedGeneration is set
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.ObservedGeneration).Should(Equal(updated.Generation))
})
......@@ -161,22 +139,23 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
dgdrName := "test-dgdr-minimal"
namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -194,11 +173,11 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(err).NotTo(HaveOccurred())
// Check status transitions to Pending (not Failed)
Eventually(func() nvidiacomv1alpha1.DGDRState {
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
Eventually(func() nvidiacomv1beta1.DGDRPhase {
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
return updated.Status.State
}, timeout, interval).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
return updated.Status.Phase
}, timeout, interval).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
})
})
......@@ -231,31 +210,26 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, sa) }()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
Annotations: map[string]string{
"nvidia.com/dgdr-config-map-ref": `{"name":"test-config","key":"disagg.yaml"}`,
},
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"profiler_image": "test-profiler:latest",
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
},
}),
ConfigMapRef: &nvidiacomv1alpha1.ConfigMapKeySelector{
Name: "test-config",
Key: "disagg.yaml",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -287,8 +261,8 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
jobName := getProfilingJobName(dgdr)
job := &batchv1.Job{}
_ = k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)
Expect(job.Labels[LabelApp]).Should(Equal(LabelValueDynamoProfiler))
Expect(job.Labels[LabelDGDR]).Should(Equal(dgdrName))
Expect(job.Labels[nvidiacomv1beta1.LabelApp]).Should(Equal(nvidiacomv1beta1.LabelValueDynamoProfiler))
Expect(job.Labels[nvidiacomv1beta1.LabelDGDR]).Should(Equal(dgdrName))
// Verify job has profiler container
Expect(job.Spec.Template.Spec.Containers).Should(HaveLen(2))
......@@ -324,34 +298,24 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, sa) }()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"config": "/tmp/test-config.yaml",
"profiler_image": "test-profiler:latest",
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
},
"sweep": map[string]interface{}{
"use_ai_configurator": true,
"aic_system": "h200_sxm",
"aic_hf_id": "Qwen/Qwen3-32B",
"aic_backend_version": "0.20.0",
Image: "test-profiler:latest",
SearchStrategy: "rapid",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -377,8 +341,8 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job); err != nil {
return ""
}
return job.Labels[LabelApp]
}, timeout, interval).Should(Equal(LabelValueAICProfiler))
return job.Labels[nvidiacomv1beta1.LabelApp]
}, timeout, interval).Should(Equal(nvidiacomv1beta1.LabelValueDynamoProfiler))
// Clean up
jobName := getProfilingJobName(dgdr)
......@@ -395,27 +359,23 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
dgdrName := "test-dgdr-profiling-complete"
namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"config": "/tmp/test-config.yaml",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -424,7 +384,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Profiling using Status subresource
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create completed profiling job
......@@ -492,14 +452,14 @@ spec:
Expect(err).NotTo(HaveOccurred())
// Get the updated DGDR
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
// Check that DGD spec was generated
Expect(updated.Status.GeneratedDeployment).NotTo(BeNil())
// Check that DGD spec was generated (stored in annotation)
Expect(updated.Annotations["nvidia.com/generated-dgd-spec"]).NotTo(BeEmpty())
// Verify state transitioned to Ready (since autoApply is false by default)
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateReady))
// autoApply defaults to true in v1beta1, so after profiling the DGDR transitions to Deploying
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseDeploying))
})
})
......@@ -509,27 +469,23 @@ spec:
dgdrName := "test-dgdr-autoapply"
namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"config": "/tmp/test-config.yaml",
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
AutoApply: true,
},
......@@ -539,7 +495,7 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Profiling using Status subresource
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create completed profiling job
......@@ -607,9 +563,9 @@ spec:
Expect(err).NotTo(HaveOccurred())
// Get updated DGDR and check state is Deploying
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateDeploying))
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseDeploying))
// Reconcile again to create DGD
_, err = reconciler.Reconcile(ctx, reconcile.Request{
......@@ -618,14 +574,12 @@ spec:
Expect(err).NotTo(HaveOccurred())
// Verify DGD was created
dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
dgd := &dgdv1alpha1.DynamoGraphDeployment{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed())
// Get final DGDR status
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.Deployment).NotTo(BeNil())
Expect(updated.Status.Deployment.Created).Should(BeTrue())
Expect(updated.Status.Deployment.Name).Should(Equal("test-dgd-auto"))
Expect(updated.Status.DGDName).Should(Equal("test-dgd-auto"))
// Clean up DGD
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed())
......@@ -639,27 +593,23 @@ spec:
dgdrName := "test-dgdr-immutable"
namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"config": "/tmp/test-config.yaml",
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -674,22 +624,18 @@ spec:
Expect(err).NotTo(HaveOccurred())
// Get current generation
var current nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var current nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed())
initialGeneration := current.Generation
observedGeneration := current.Status.ObservedGeneration
// Manually set state to Profiling to simulate in-progress profiling
current.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
current.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
Expect(k8sClient.Status().Update(ctx, &current)).Should(Succeed())
// Try to modify spec
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed())
// Unmarshal config, modify it, and marshal back
var config map[string]interface{}
Expect(yaml.Unmarshal(current.Spec.ProfilingConfig.Config.Raw, &config)).Should(Succeed())
config["sla"].(map[string]interface{})["ttft"] = 200.0
current.Spec.ProfilingConfig.Config = createTestConfig(config)
current.Spec.Model = "modified-model"
Expect(k8sClient.Update(ctx, &current)).Should(Succeed())
// Reconcile
......@@ -702,13 +648,13 @@ spec:
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed())
Expect(current.Generation).Should(BeNumerically(">", initialGeneration))
Expect(current.Status.ObservedGeneration).Should(Equal(observedGeneration))
Expect(current.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateProfiling)) // State unchanged
Expect(current.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseProfiling)) // State unchanged
// Verify event was recorded
Eventually(func() bool {
select {
case event := <-recorder.Events:
return event == "Warning SpecChangeRejected Cannot modify spec in state 'Profiling'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead."
return event == "Warning SpecChangeRejected Cannot modify spec in phase 'Profiling'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead."
default:
return false
}
......@@ -717,32 +663,28 @@ spec:
})
Context("When handling DGD deletion", func() {
It("Should transition to DeploymentDeleted state", func() {
It("Should transition to Failed phase when DGD is deleted", func() {
ctx := context.Background()
dgdrName := "test-dgdr-dgd-deleted"
namespace := defaultNamespace
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"config": "/tmp/test-config.yaml",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
AutoApply: true,
},
......@@ -751,14 +693,9 @@ spec:
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Ready with Deployment info using Status subresource
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady
dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{
Name: "test-dgd-to-delete",
Namespace: namespace,
Created: true,
State: nvidiacomv1alpha1.DGDStateSuccessful,
}
// Update status to Deployed with Deployment info using Status subresource
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseDeployed
dgdr.Status.DGDName = "test-dgd-to-delete"
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Reconcile when DGD doesn't exist (simulating deletion)
......@@ -767,10 +704,10 @@ spec:
})
Expect(err).NotTo(HaveOccurred())
// Get updated DGDR and check state transitioned to DeploymentDeleted
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
// Get updated DGDR and check phase transitioned to Failed
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateDeploymentDeleted))
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseFailed))
})
})
})
......@@ -778,7 +715,7 @@ spec:
var _ = Describe("DGDR Helper Functions", func() {
Context("getProfilingJobName", func() {
It("Should return correct job name", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
},
......@@ -789,7 +726,7 @@ var _ = Describe("DGDR Helper Functions", func() {
Context("getOutputConfigMapName", func() {
It("Should return correct ConfigMap name", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
},
......@@ -799,91 +736,42 @@ var _ = Describe("DGDR Helper Functions", func() {
})
Context("isOnlineProfiling", func() {
It("Should return true for online profiling (use_ai_configurator=false)", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"sweep": map[string]interface{}{
"use_ai_configurator": false,
},
}),
},
It("Should always return true regardless of spec", func() {
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
},
}
Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
})
It("Should return false for AI Configurator profiling (use_ai_configurator=true)", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"sweep": map[string]interface{}{
"use_ai_configurator": true,
},
}),
},
},
}
Expect(isOnlineProfiling(dgdr)).Should(BeFalse())
})
It("Should return true by default when sweep section is missing", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
},
}),
},
It("Should return true with search strategy rapid", func() {
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "trtllm",
SearchStrategy: "rapid",
},
}
Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
})
It("Should return true by default when use_ai_configurator is not specified", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"sweep": map[string]interface{}{
"prefill_interpolation_granularity": 16,
},
}),
},
It("Should return true with search strategy thorough", func() {
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
SearchStrategy: "thorough",
},
}
Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
})
It("Should return false for AI Configurator profiling (useAiConfigurator=true camelCase)", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"sweep": map[string]interface{}{
"useAiConfigurator": true,
},
}),
},
},
}
Expect(isOnlineProfiling(dgdr)).Should(BeFalse())
})
It("Should return true for online profiling (useAiConfigurator=false camelCase)", func() {
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"sweep": map[string]interface{}{
"useAiConfigurator": false,
},
}),
},
It("Should return true with nil spec fields", func() {
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
},
}
Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
......@@ -903,23 +791,19 @@ var _ = Describe("DGDR Validation", func() {
Context("validateSpec", func() {
It("Should pass validation for valid spec", func() {
ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"config": "/tmp/test-config.yaml",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -930,18 +814,19 @@ var _ = Describe("DGDR Validation", func() {
It("Should pass validation with minimal config", func() {
ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -971,7 +856,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
})
Context("When creating profiling job with inline config", func() {
It("Should pass config as --profile-config argument for online profiling", func() {
It("Should pass config as --config argument for online profiling", func() {
ctx := context.Background()
namespace := "default"
dgdrName := "test-args-online"
......@@ -986,36 +871,23 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, sa) }()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"config": "/tmp/test-config.yaml",
"profiler_image": "test-profiler:latest",
},
"sla": map[string]interface{}{
"ttft": 50.0,
"itl": 10.0,
"isl": 3000,
"osl": 500,
},
"hardware": map[string]interface{}{
"gpu_type": "h200_sxm",
"min_num_gpus_per_engine": 2,
"max_num_gpus_per_engine": 4,
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "H200-SXM",
NumGPUsPerNode: ptr.To[int32](8),
VRAMMB: ptr.To(81920.0),
},
"sweep": map[string]interface{}{
"use_ai_configurator": false,
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(50.0),
ITL: ptr.To(10.0),
},
},
}
......@@ -1024,7 +896,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Re-fetch DGDR to get proper metadata from API server
var fetchedDGDR nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var fetchedDGDR nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed())
// Create profiling job with properly initialized DGDR
......@@ -1036,12 +908,12 @@ var _ = Describe("DGDR Profiler Arguments", func() {
job := &batchv1.Job{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed())
// Verify profiler container has --profile-config argument
// Verify profiler container has --config argument
profilerContainer := job.Spec.Template.Spec.Containers[0]
args := profilerContainer.Args
// Check that --profile-config argument is present
Expect(args).Should(ContainElement("--profile-config"))
// Check that --config argument is present
Expect(args).Should(ContainElement("--config"))
// Clean up
_ = k8sClient.Delete(ctx, job)
......@@ -1062,39 +934,24 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, sa) }()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"config": "/tmp/test-config.yaml",
"profiler_image": "test-profiler:latest",
},
"sla": map[string]interface{}{
"ttft": 50.0,
"itl": 10.0,
"isl": 3000,
"osl": 500,
Image: "test-profiler:latest",
SearchStrategy: "rapid",
Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "H200-SXM",
NumGPUsPerNode: ptr.To[int32](8),
VRAMMB: ptr.To(81920.0),
},
"hardware": map[string]interface{}{
"gpu_type": "h200_sxm",
"min_num_gpus_per_engine": 1,
"max_num_gpus_per_engine": 8,
},
"sweep": map[string]interface{}{
"use_ai_configurator": true,
"aic_system": "h200_sxm",
"aic_hf_id": "Qwen/Qwen3-32B",
"aic_backend_version": "0.20.0",
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(50.0),
ITL: ptr.To(10.0),
},
},
}
......@@ -1103,7 +960,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Re-fetch DGDR to get proper metadata from API server
var fetchedDGDR nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var fetchedDGDR nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed())
// Create profiling job with properly initialized DGDR
......@@ -1115,12 +972,12 @@ var _ = Describe("DGDR Profiler Arguments", func() {
job := &batchv1.Job{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed())
// Verify profiler container has --profile-config argument
// Verify profiler container has --config argument
profilerContainer := job.Spec.Template.Spec.Containers[0]
args := profilerContainer.Args
// Check that --profile-config argument is present
Expect(args).Should(ContainElement("--profile-config"))
// Check that --config argument is present
Expect(args).Should(ContainElement("--config"))
// Clean up
_ = k8sClient.Delete(ctx, job)
......@@ -1141,24 +998,23 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, sa) }()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"sla": map[string]interface{}{
"ttft": 50.0,
"itl": 10.0,
"isl": 3000,
"osl": 500,
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(50.0),
ITL: ptr.To(10.0),
},
},
}
......@@ -1167,7 +1023,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Re-fetch DGDR to get proper metadata from API server
var fetchedDGDR nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var fetchedDGDR nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed())
// Create profiling job with properly initialized DGDR
......@@ -1222,27 +1078,23 @@ var _ = Describe("DGDR Error Handling", func() {
namespace := defaultNamespace
dgdrName := "test-error-capture"
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"config": "/tmp/test-config.yaml",
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
"isl": 3000,
"osl": 5,
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
}),
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -1251,7 +1103,7 @@ var _ = Describe("DGDR Error Handling", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Set status to Profiling
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create failed job
......@@ -1331,12 +1183,12 @@ var _ = Describe("DGDR Error Handling", func() {
Expect(err).NotTo(HaveOccurred())
// Verify DGDR transitioned to Failed state
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateFailed))
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseFailed))
// Verify error condition contains detailed error
condition := meta.FindStatusCondition(updated.Status.Conditions, ConditionTypeProfiling)
condition := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeProfiling)
Expect(condition).NotTo(BeNil())
Expect(condition.Status).Should(Equal(metav1.ConditionFalse))
Expect(condition.Message).Should(ContainSubstring("profiling job failed"))
......@@ -1535,22 +1387,18 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, gpuNode) }()
// Create DGDR WITHOUT hardware config (should use GPU discovery)
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {"minNumGpusPerEngine": 1, "maxNumGpusPerEngine": 8}
}`),
},
Image: "test-profiler:latest",
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -1568,9 +1416,9 @@ spec:
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (validation passed)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
})
It("Should respect manual hardware config over GPU discovery", func() {
......@@ -1593,27 +1441,23 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, gpuNode) }()
// Create DGDR WITH manual hardware config (A100, not H100)
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"hardware": {
"numGpusPerNode": 4,
"gpuModel": "A100-SXM4-40GB",
"gpuVramMib": 40960,
"system": "a100_sxm"
}
}`),
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](4),
GPUSKU: "A100-SXM4-40GB",
VRAMMB: ptr.To(40960.0),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -1631,9 +1475,9 @@ spec:
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (validation passed with manual config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
})
It("Should succeed with GPU discovery when cluster has GPU nodes", func() {
......@@ -1656,21 +1500,18 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, node) }()
// Create DGDR WITHOUT hardware config - should use GPU discovery
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0}
}`),
},
Image: "test-profiler:latest",
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -1688,9 +1529,9 @@ spec:
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
})
It("Should pass validation with explicit GPU ranges without GPU discovery", func() {
......@@ -1700,28 +1541,21 @@ spec:
// Intentionally don't create GPU nodes to test that explicit ranges work without GPU discovery
// Create DGDR with explicit hardware settings (v1beta1: spec.hardware.numGPUsPerNode; previously minNumGpusPerEngine/maxNumGpusPerEngine in the profiling config blob)
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {
"minNumGpusPerEngine": 2,
"maxNumGpusPerEngine": 4
},
"hardware": {
"numGpusPerNode": 8
}
}`),
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -1739,9 +1573,9 @@ spec:
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
})
It("Should use GPU discovery with heterogeneous nodes (picks best)", func() {
......@@ -1778,22 +1612,18 @@ spec:
}()
// Create DGDR without hardware config
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {"minNumGpusPerEngine": 1, "maxNumGpusPerEngine": 8}
}`),
},
Image: "test-profiler:latest",
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
......@@ -1811,9 +1641,393 @@ spec:
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (using H100 config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
})
})
Context("v1beta1-specific behavior", func() {
It("Should transition to Deployed when DGD reaches Ready", func() {
	// Scenario: a DGDR that is already in the Deploying phase and points at an
	// existing DGD whose status reports DGDStateSuccessful must be moved to the
	// Deployed phase by a single reconcile.
	const (
		requestName    = "test-dgdr-deployed-phase"
		deploymentName = "test-dgd-deployed"
	)
	ctx := context.Background()
	namespace := defaultNamespace
	requestKey := types.NamespacedName{Name: requestName, Namespace: namespace}

	request := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{Name: requestName, Namespace: namespace},
		Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
			Model:   "test-model",
			Backend: "vllm",
			Image:   "test-profiler:latest",
			Hardware: &nvidiacomv1beta1.HardwareSpec{
				NumGPUsPerNode: ptr.To[int32](8),
				GPUSKU:         "H100-SXM5-80GB",
				VRAMMB:         ptr.To(81920.0),
			},
			SLA: &nvidiacomv1beta1.SLASpec{
				TTFT: ptr.To(100.0),
				ITL:  ptr.To(1500.0),
			},
		},
	}
	Expect(k8sClient.Create(ctx, request)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, request) }()

	// Put the request into Deploying and record which DGD it is waiting on.
	request.Status.Phase = nvidiacomv1beta1.DGDRPhaseDeploying
	request.Status.DGDName = deploymentName
	Expect(k8sClient.Status().Update(ctx, request)).Should(Succeed())

	// The DGD carries the labels that tie it back to the request.
	deployment := &dgdv1alpha1.DynamoGraphDeployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      deploymentName,
			Namespace: namespace,
			Labels: map[string]string{
				nvidiacomv1beta1.LabelDGDRName:      requestName,
				nvidiacomv1beta1.LabelDGDRNamespace: namespace,
			},
		},
		Spec: dgdv1alpha1.DynamoGraphDeploymentSpec{},
	}
	Expect(k8sClient.Create(ctx, deployment)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, deployment) }()

	// Status is a subresource, so the successful state is written separately.
	deployment.Status.State = dgdv1alpha1.DGDStateSuccessful
	Expect(k8sClient.Status().Update(ctx, deployment)).Should(Succeed())

	// One reconcile should observe the successful DGD and advance the request.
	_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: requestKey})
	Expect(err).NotTo(HaveOccurred())

	got := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}
	Expect(k8sClient.Get(ctx, requestKey, got)).Should(Succeed())
	Expect(got.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseDeployed))
})
It("Should set Succeeded condition at each phase transition", func() {
ctx := context.Background()
dgdrName := "test-dgdr-succeeded-cond"
namespace := defaultNamespace
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// First reconcile: initial validation → Pending
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
// Check that Succeeded condition exists with reason matching the phase
succeededCond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeSucceeded)
Expect(succeededCond).NotTo(BeNil())
Expect(succeededCond.Reason).Should(Equal(string(nvidiacomv1beta1.DGDRPhasePending)))
})
It("Should set ProfilingPhase on entry to Profiling and clear on exit", func() {
	// Drives a DGDR through validation and into Profiling, checks that
	// ProfilingPhase is Initializing on entry, then marks the profiling Job
	// complete and verifies the request leaves the Profiling phase.
	ctx := context.Background()
	dgdrName := "test-dgdr-profiling-phase"
	namespace := defaultNamespace
	dgdrKey := types.NamespacedName{Name: dgdrName, Namespace: namespace}

	dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{
			Name:      dgdrName,
			Namespace: namespace,
		},
		Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
			Model:   "test-model",
			Backend: "vllm",
			Image:   "test-profiler:latest",
			Hardware: &nvidiacomv1beta1.HardwareSpec{
				NumGPUsPerNode: ptr.To[int32](8),
				GPUSKU:         "H100-SXM5-80GB",
				VRAMMB:         ptr.To(81920.0),
			},
			SLA: &nvidiacomv1beta1.SLASpec{
				TTFT: ptr.To(100.0),
				ITL:  ptr.To(1500.0),
			},
		},
	}
	Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, dgdr) }()

	// Transition through initial validation to Pending
	_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: dgdrKey})
	Expect(err).NotTo(HaveOccurred())

	// Reconcile again to start profiling (creates job, transitions to Profiling)
	_, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: dgdrKey})
	Expect(err).NotTo(HaveOccurred())

	// Check ProfilingPhase is set
	var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
	Expect(k8sClient.Get(ctx, dgdrKey, &updated)).Should(Succeed())
	Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseProfiling))
	Expect(updated.Status.ProfilingPhase).Should(Equal(nvidiacomv1beta1.ProfilingPhaseInitializing))

	// Simulate profiling completion
	jobName := getProfilingJobName(&updated)
	job := &batchv1.Job{}
	Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed())
	// The Job was created by the reconciler, not by this test, so nothing else
	// removes it. Delete it on exit (best effort) so it does not linger in the
	// shared namespace after the test finishes or fails.
	defer func() { _ = k8sClient.Delete(ctx, job) }()
	job.Status.Conditions = []batchv1.JobCondition{{
		Type:   batchv1.JobComplete,
		Status: corev1.ConditionTrue,
	}}
	Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())

	// Profiling output the reconciler reads from the output ConfigMap.
	// NOTE(review): the YAML below is reproduced exactly as it appears in this
	// view, which has lost leading whitespace — confirm nesting against the file.
	dgdYAML := `apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: test-dgd-profphase
spec:
services:
Frontend:
replicas: 1`
	cm := &corev1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{
			Name:      getOutputConfigMapName(&updated),
			Namespace: namespace,
		},
		Data: map[string]string{
			ProfilingOutputFile: dgdYAML,
		},
	}
	Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, cm) }()

	// Reconcile to complete profiling
	_, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: dgdrKey})
	Expect(err).NotTo(HaveOccurred())

	// ProfilingPhase should be cleared after profiling completes
	// Note: Due to the r.Update/r.Status().Update ordering in generateDGDSpec,
	// ProfilingPhase may not be cleared in a single reconcile. Verify the phase
	// transition happened correctly.
	Expect(k8sClient.Get(ctx, dgdrKey, &updated)).Should(Succeed())
	Expect(updated.Status.Phase).ShouldNot(Equal(nvidiacomv1beta1.DGDRPhaseProfiling))
})
It("Should use spec.features.mocker.enabled to select mocker output", func() {
ctx := context.Background()
dgdrName := "test-dgdr-mocker"
namespace := defaultNamespace
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
ITL: ptr.To(1500.0),
},
Features: &nvidiacomv1beta1.FeaturesSpec{
Mocker: &nvidiacomv1beta1.MockerSpec{Enabled: true},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Transition to Profiling
dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
dgdr.Status.ObservedGeneration = dgdr.Generation
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create completed job
jobName := getProfilingJobName(dgdr)
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{Name: jobName, Namespace: namespace},
Spec: batchv1.JobSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{{Name: "test", Image: "test"}},
RestartPolicy: corev1.RestartPolicyNever,
},
},
},
}
Expect(k8sClient.Create(ctx, job)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, job) }()
job.Status.Conditions = []batchv1.JobCondition{{
Type: batchv1.JobComplete, Status: corev1.ConditionTrue,
}}
Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
// Create output ConfigMap with mocker output file
dgdYAML := `apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: test-dgd-mocker
spec:
services:
Frontend:
replicas: 1`
cm := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: getOutputConfigMapName(dgdr),
Namespace: namespace,
},
Data: map[string]string{
ProfilingOutputFileMocker: dgdYAML,
},
}
Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, cm) }()
// Reconcile — should read from mocker output file
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// Verify the generated spec came from the mocker file
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Annotations["nvidia.com/generated-dgd-spec"]).Should(ContainSubstring("test-dgd-mocker"))
})
It("Should populate profilingJobName in status", func() {
	// Reconciles a fresh DGDR twice (validation, then profiling-job creation)
	// and checks that status.profilingJobName matches getProfilingJobName.
	ctx := context.Background()
	dgdrName := "test-dgdr-jobname"
	namespace := defaultNamespace
	dgdrKey := types.NamespacedName{Name: dgdrName, Namespace: namespace}

	dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{
			Name:      dgdrName,
			Namespace: namespace,
		},
		Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
			Model:   "test-model",
			Backend: "vllm",
			Image:   "test-profiler:latest",
			Hardware: &nvidiacomv1beta1.HardwareSpec{
				NumGPUsPerNode: ptr.To[int32](8),
				GPUSKU:         "H100-SXM5-80GB",
				VRAMMB:         ptr.To(81920.0),
			},
			SLA: &nvidiacomv1beta1.SLASpec{
				TTFT: ptr.To(100.0),
				ITL:  ptr.To(1500.0),
			},
		},
	}
	Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, dgdr) }()

	// Clean up the profiling Job on exit. This is deferred so the Job is also
	// removed when an assertion below fails, and it looks the Job up by the
	// name getProfilingJobName derives, so it does not depend on the status
	// field under test. Deletion is best effort and skipped if no Job exists.
	defer func() {
		job := &batchv1.Job{}
		jobKey := types.NamespacedName{Name: getProfilingJobName(dgdr), Namespace: namespace}
		if err := k8sClient.Get(ctx, jobKey, job); err != nil {
			return
		}
		_ = k8sClient.Delete(ctx, job)
	}()

	// Reconcile through initial validation
	_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: dgdrKey})
	Expect(err).NotTo(HaveOccurred())

	// Reconcile to create profiling job
	_, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: dgdrKey})
	Expect(err).NotTo(HaveOccurred())

	// Check profilingJobName is set in status
	var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
	Expect(k8sClient.Get(ctx, dgdrKey, &updated)).Should(Succeed())
	Expect(updated.Status.ProfilingJobName).Should(Equal(getProfilingJobName(&updated)))
})
It("Should validate typed hardware fields without blob parsing", func() {
	ctx := context.Background()
	key := types.NamespacedName{Name: "test-dgdr-typed-hw", Namespace: defaultNamespace}

	// Only the GPU SKU is supplied; the other typed hardware fields are left unset.
	partialHardware := &nvidiacomv1beta1.HardwareSpec{GPUSKU: "A100-SXM4-40GB"}
	sla := &nvidiacomv1beta1.SLASpec{
		TTFT: ptr.To(100.0),
		ITL:  ptr.To(1500.0),
	}
	request := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{Name: key.Name, Namespace: key.Namespace},
		Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
			Model:    "test-model",
			Backend:  "vllm",
			Image:    "test-profiler:latest",
			Hardware: partialHardware,
			SLA:      sla,
		},
	}
	Expect(k8sClient.Create(ctx, request)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, request) }()

	// A single reconcile pass must succeed with the partial hardware spec.
	_, reconcileErr := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
	Expect(reconcileErr).NotTo(HaveOccurred())

	// The stored request is expected to report the Pending phase afterwards.
	var fetched nvidiacomv1beta1.DynamoGraphDeploymentRequest
	Expect(k8sClient.Get(ctx, key, &fetched)).Should(Succeed())
	Expect(fetched.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending))
})
})
})
......@@ -27,6 +27,7 @@ import (
"testing"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
......@@ -102,6 +103,8 @@ var _ = BeforeSuite(func() {
//+kubebuilder:scaffold:scheme
err = v1alpha1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred())
err = v1beta1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred())
err = corev1.AddToScheme(scheme)
Expect(err).NotTo(HaveOccurred())
err = autoscalingv2.AddToScheme(scheme)
......
......@@ -18,31 +18,18 @@
package validation
import (
"encoding/json"
"errors"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/util/yaml"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// toFloat64 converts a numeric value to float64.
//
// Every built-in integer and floating-point width is accepted. This matters
// for values decoded into a map[string]any: depending on the decoder, whole
// numbers can arrive as int64 rather than int or float64, and treating those
// as "not a number" would silently turn them into 0.
//
// Returns 0 if val is nil or is not one of the supported numeric types.
func toFloat64(val any) float64 {
	switch v := val.(type) {
	case float64:
		return v
	case float32:
		return float64(v)
	case int:
		return float64(v)
	case int8:
		return float64(v)
	case int16:
		return float64(v)
	case int32:
		return float64(v)
	case int64:
		return float64(v)
	case uint:
		return float64(v)
	case uint8:
		return float64(v)
	case uint16:
		return float64(v)
	case uint32:
		return float64(v)
	case uint64:
		return float64(v)
	default:
		return 0
	}
}
// DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
// This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentRequestValidator struct {
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
request *nvidiacomv1beta1.DynamoGraphDeploymentRequest
isClusterWideOperator bool
gpuDiscoveryEnabled bool
}
......@@ -50,7 +37,7 @@ type DynamoGraphDeploymentRequestValidator struct {
// NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest.
// isClusterWide indicates whether the operator has cluster-wide permissions.
// gpuDiscoveryEnabled indicates whether Helm provisioned node read access for the operator.
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1beta1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
return &DynamoGraphDeploymentRequestValidator{
request: request,
isClusterWideOperator: isClusterWide,
......@@ -61,105 +48,43 @@ func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoG
// Validate performs stateless validation on the DynamoGraphDeploymentRequest.
// Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) {
var warnings admission.Warnings
var err error
// Warn about deprecated enableGpuDiscovery field
if v.request.Spec.EnableGPUDiscovery != nil {
warnings = append(warnings, "spec.enableGpuDiscovery is deprecated and will be removed in v1beta1. GPU discovery is now always attempted automatically. This field has no effect.")
}
// Validate profiler image is specified
if v.request.Spec.ProfilingConfig.ProfilerImage == "" {
err = errors.Join(err, errors.New("spec.profilingConfig.profilerImage is required"))
}
// Validate that profilingConfig.config is provided
if v.request.Spec.ProfilingConfig.Config == nil || len(v.request.Spec.ProfilingConfig.Config.Raw) == 0 {
err = errors.Join(err, errors.New("spec.profilingConfig.config is required and must not be empty"))
// Validate image is specified (required for the profiling job container).
if v.request.Spec.Image == "" {
err = errors.Join(err, errors.New("spec.image is required"))
}
// Note: GPU discovery is now automatic for cluster-wide operators
// Namespace-restricted operators automatically skip GPU discovery and require manual hardware config
// Parse config to validate structure (only if config is present)
if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 {
var config map[string]any
if parseErr := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); parseErr != nil {
err = errors.Join(err, fmt.Errorf("failed to parse spec.profilingConfig.config: %w", parseErr))
} else {
// Warn if deployment.model or engine.backend are specified in config (they will be overwritten by spec fields)
if engineConfig, ok := config["engine"].(map[string]any); ok {
if backend, ok := engineConfig["backend"].(string); ok && backend != "" && backend != v.request.Spec.Backend {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)", backend, v.request.Spec.Backend))
}
}
if deployment, ok := config["deployment"].(map[string]any); ok {
if model, ok := deployment["model"].(string); ok && model != "" && model != v.request.Spec.Model {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)", model, v.request.Spec.Model))
}
}
}
// Disallow searchStrategy: thorough with backend: auto.
// "thorough" sweeps more configurations and requires a concrete backend to be selected;
// "auto" defers backend selection and is only compatible with the "rapid" search strategy.
if v.request.Spec.SearchStrategy == nvidiacomv1beta1.SearchStrategyThorough &&
v.request.Spec.Backend == nvidiacomv1beta1.BackendTypeAuto {
err = errors.Join(err, fmt.Errorf(
"spec.searchStrategy %q is incompatible with spec.backend %q: set spec.backend to a specific backend (sglang, trtllm, or vllm)",
nvidiacomv1beta1.SearchStrategyThorough,
nvidiacomv1beta1.BackendTypeAuto,
))
}
// Validate GPU hardware information is available (last, so other errors are collected first)
// Validate GPU hardware information is available (last, so other errors are collected first).
if gpuErr := v.validateGPUHardwareInfo(); gpuErr != nil {
err = errors.Join(err, gpuErr)
}
return warnings, err
return nil, err
}
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided.
func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
// Parse profiling config
var config map[string]any
if v.request.Spec.ProfilingConfig.Config != nil {
if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught by other validators
return nil
}
} else {
config = make(map[string]any)
}
// Check if manual hardware config is provided
hardwareVal, hasHardware := config["hardware"]
// Check if manual hardware config is provided via typed spec.hardware fields.
var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil {
if hardwareConfig, ok := hardwareVal.(map[string]any); ok {
// Check if essential hardware fields are provided
_, hasGPUModel := hardwareConfig["gpuModel"]
_, hasGPUVram := hardwareConfig["gpuVramMib"]
_, hasNumGPUs := hardwareConfig["numGpusPerNode"]
hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
}
}
// Check if explicit GPU ranges are provided
var hasExplicitGPURanges bool
if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]any); ok {
minGPUs, hasMin := engineConfig["minNumGpusPerEngine"]
maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"]
// Validate explicit GPU ranges
if hasMin && hasMax {
minVal := toFloat64(minGPUs)
maxVal := toFloat64(maxGPUs)
// Validate that min <= max
if minVal > maxVal {
return fmt.Errorf("invalid GPU range: minNumGpusPerEngine (%v) cannot be greater than maxNumGpusPerEngine (%v)",
minVal, maxVal)
}
hasExplicitGPURanges = minVal > 0 && maxVal > 0
}
}
if hw := v.request.Spec.Hardware; hw != nil {
hasManualHardwareConfig = hw.GPUSKU != "" || hw.VRAMMB != nil || hw.NumGPUsPerNode != nil
}
if hasManualHardwareConfig || hasExplicitGPURanges {
if hasManualHardwareConfig {
return nil
}
......@@ -169,13 +94,40 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error
return nil
}
return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.profilingConfig.config)")
return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.hardware)")
}
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
// Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
// TODO: Add update validation logic for DynamoGraphDeploymentRequest
// Placeholder for future immutability checks
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
	// The spec is frozen only while the previously stored object reports one of
	// the Profiling, Deploying, or Deployed phases; in any other phase the
	// update is accepted without comparing specs.
	currentPhase := old.Status.Phase
	switch currentPhase {
	case nvidiacomv1beta1.DGDRPhaseProfiling,
		nvidiacomv1beta1.DGDRPhaseDeploying,
		nvidiacomv1beta1.DGDRPhaseDeployed:
		// Frozen phase: fall out of the switch and compare the specs below.
	default:
		return nil, nil
	}

	// Updates that leave the spec untouched (for example metadata-only
	// changes) are still allowed in a frozen phase.
	if specEqual(old.Spec, v.request.Spec) {
		return nil, nil
	}
	return nil, fmt.Errorf("spec updates are forbidden while the resource is in phase %q; delete and recreate the resource to change its spec", currentPhase)
}
// specEqual reports whether two DynamoGraphDeploymentRequestSpec values
// serialize to byte-identical JSON. If either value cannot be marshaled the
// specs are reported as different.
func specEqual(a, b nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec) bool {
	encodedA, errA := json.Marshal(a)
	if errA != nil {
		return false
	}
	encodedB, errB := json.Marshal(b)
	if errB != nil {
		return false
	}
	return string(encodedA) == string(encodedB)
}
......@@ -21,7 +21,7 @@ import (
"context"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook"
......@@ -34,7 +34,7 @@ import (
const (
// DynamoGraphDeploymentRequestWebhookName is the name of the validating webhook handler for DynamoGraphDeploymentRequest.
DynamoGraphDeploymentRequestWebhookName = "dynamographdeploymentrequest-validating-webhook"
dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1alpha1-dynamographdeploymentrequest"
dynamoGraphDeploymentRequestWebhookPath = "/validate-nvidia-com-v1beta1-dynamographdeploymentrequest"
)
// DynamoGraphDeploymentRequestHandler is a handler for validating DynamoGraphDeploymentRequest resources.
......@@ -137,15 +137,15 @@ func (h *DynamoGraphDeploymentRequestHandler) RegisterWithManager(mgr manager.Ma
observedValidator := observability.NewObservedValidator(leaseAwareValidator, consts.ResourceTypeDynamoGraphDeploymentRequest)
webhook := admission.
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}, observedValidator).
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}, observedValidator).
WithRecoverPanic(true)
mgr.GetWebhookServer().Register(dynamoGraphDeploymentRequestWebhookPath, webhook)
return nil
}
// castToDynamoGraphDeploymentRequest attempts to cast a runtime.Object to a DynamoGraphDeploymentRequest.
func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1alpha1.DynamoGraphDeploymentRequest, error) {
request, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeploymentRequest)
func castToDynamoGraphDeploymentRequest(obj runtime.Object) (*nvidiacomv1beta1.DynamoGraphDeploymentRequest, error) {
request, ok := obj.(*nvidiacomv1beta1.DynamoGraphDeploymentRequest)
if !ok {
return nil, fmt.Errorf("expected DynamoGraphDeploymentRequest but got %T", obj)
}
......
......@@ -21,125 +21,125 @@ import (
"strings"
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}`
validConfigWithHardware := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}, "hardware": {"numGpusPerNode": 8, "gpuModel": "H100-SXM5-80GB", "gpuVramMib": 81920}}`
minimalConfig := `{"sla": {"ttft": 200.0}}`
configWithDifferentBackend := `{"engine": {"backend": "sglang"}}`
configWithDifferentModel := `{"deployment": {"model": "different-model"}}`
invalidYAML := `{invalid yaml`
vram := float64(81920)
gpuCount := int32(8)
// errMsg: if non-empty, an error is expected and each newline-separated substring must appear in it.
// expectedWarning: if non-empty, at least one warning must contain this substring.
tests := []struct {
name string
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
request *nvidiacomv1beta1.DynamoGraphDeploymentRequest
isClusterWide bool
gpuDiscoveryEnabled bool
errMsg string
expectedWarning string
}{
{
name: "valid request",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
isClusterWide: true,
},
{
name: "missing image",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "",
},
},
isClusterWide: true,
errMsg: "spec.image is required",
},
{
name: "missing profiler image",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
name: "thorough + auto is invalid",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
Image: "profiler:latest",
Backend: nvidiacomv1beta1.BackendTypeAuto,
SearchStrategy: nvidiacomv1beta1.SearchStrategyThorough,
},
},
isClusterWide: true,
errMsg: `spec.searchStrategy "thorough" is incompatible with spec.backend "auto"`,
},
{
name: "rapid + auto is valid (default combination)",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Image: "profiler:latest",
Backend: nvidiacomv1beta1.BackendTypeAuto,
SearchStrategy: nvidiacomv1beta1.SearchStrategyRapid,
},
},
isClusterWide: true,
errMsg: "spec.profilingConfig.profilerImage is required",
},
{
name: "missing profiling config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
name: "thorough + vllm is valid",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: nil,
},
Image: "profiler:latest",
Backend: nvidiacomv1beta1.BackendTypeVllm,
SearchStrategy: nvidiacomv1beta1.SearchStrategyThorough,
},
},
isClusterWide: true,
errMsg: "spec.profilingConfig.config is required and must not be empty",
},
{
name: "empty profiling config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
name: "thorough + trtllm is valid",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte{},
Image: "profiler:latest",
Backend: nvidiacomv1beta1.BackendTypeTrtllm,
SearchStrategy: nvidiacomv1beta1.SearchStrategyThorough,
},
},
isClusterWide: true,
},
{
name: "thorough + sglang is valid",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Image: "profiler:latest",
Backend: nvidiacomv1beta1.BackendTypeSglang,
SearchStrategy: nvidiacomv1beta1.SearchStrategyThorough,
},
},
isClusterWide: true,
errMsg: "spec.profilingConfig.config is required and must not be empty",
},
{
name: "namespace-scoped operator with manual hardware config (should pass)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfigWithHardware),
},
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "H100-SXM5-80GB",
VRAMMB: &vram,
NumGPUsPerNode: &gpuCount,
},
},
},
......@@ -148,20 +148,12 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
{
name: "namespace-scoped operator with GPU discovery enabled (should pass without manual config)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(minimalConfig),
},
},
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
isClusterWide: false,
......@@ -169,20 +161,12 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
{
name: "namespace-scoped operator with GPU discovery disabled and no hardware config (should error)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(minimalConfig),
},
},
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
isClusterWide: false,
......@@ -190,93 +174,25 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
errMsg: "GPU hardware configuration required: GPU discovery is disabled",
},
{
name: "invalid config YAML",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
name: "multiple errors (missing image and thorough+auto)",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(invalidYAML),
},
},
Backend: nvidiacomv1beta1.BackendTypeAuto,
SearchStrategy: nvidiacomv1beta1.SearchStrategyThorough,
Image: "",
},
},
isClusterWide: true,
errMsg: "failed to parse spec.profilingConfig.config",
},
{
name: "warning for different backend in config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(configWithDifferentBackend),
},
},
},
},
isClusterWide: true,
expectedWarning: "spec.profilingConfig.config.engine.backend (sglang) will be overwritten by spec.backend (vllm)",
},
{
name: "warning for different model in config",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(configWithDifferentModel),
},
},
},
},
isClusterWide: true,
expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)",
},
{
name: "multiple errors (missing profiler image and missing config)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "",
Config: nil,
},
},
},
isClusterWide: false,
errMsg: "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty",
errMsg: "spec.image is required\nspec.searchStrategy",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentRequestValidator(tt.request, tt.isClusterWide, tt.gpuDiscoveryEnabled)
warnings, err := validator.Validate()
_, err := validator.Validate()
wantErr := tt.errMsg != ""
if (err != nil) != wantErr {
......@@ -290,80 +206,159 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}
}
}
wantWarning := tt.expectedWarning != ""
if wantWarning && len(warnings) == 0 {
t.Errorf("Validate() expected warning %q but got none", tt.expectedWarning)
}
if wantWarning && len(warnings) > 0 && !strings.Contains(warnings[0], tt.expectedWarning) {
t.Errorf("Validate() warning %q does not contain %q", warnings[0], tt.expectedWarning)
}
})
}
}
func TestDynamoGraphDeploymentRequestValidator_ValidateUpdate(t *testing.T) {
validConfig := `{"engine": {"backend": "vllm"}}`
tests := []struct {
name string
oldRequest *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
newRequest *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
oldRequest *nvidiacomv1beta1.DynamoGraphDeploymentRequest
newRequest *nvidiacomv1beta1.DynamoGraphDeploymentRequest
wantErr bool
errMsg string
wantWarnings bool
}{
{
name: "no changes",
oldRequest: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
wantErr: false,
},
newRequest: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
{
name: "changing model name is allowed when not in immutable phase",
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-70b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
wantErr: false,
},
{
name: "changing model name is allowed",
oldRequest: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
name: "spec change rejected during Profiling phase",
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
Status: nvidiacomv1beta1.DynamoGraphDeploymentRequestStatus{
Phase: nvidiacomv1beta1.DGDRPhaseProfiling,
},
},
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-70b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
wantErr: true,
errMsg: "spec updates are forbidden while the resource is in phase",
},
{
name: "spec change rejected during Deploying phase",
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
Status: nvidiacomv1beta1.DynamoGraphDeploymentRequestStatus{
Phase: nvidiacomv1beta1.DGDRPhaseDeploying,
},
newRequest: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
},
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-70b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(validConfig),
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
wantErr: true,
errMsg: "spec updates are forbidden while the resource is in phase",
},
{
name: "spec change rejected during Deployed phase",
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
Status: nvidiacomv1beta1.DynamoGraphDeploymentRequestStatus{
Phase: nvidiacomv1beta1.DGDRPhaseDeployed,
},
},
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-70b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
wantErr: true,
errMsg: "spec updates are forbidden while the resource is in phase",
},
{
name: "no spec change during immutable phase is allowed",
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
Status: nvidiacomv1beta1.DynamoGraphDeploymentRequestStatus{
Phase: nvidiacomv1beta1.DGDRPhaseProfiling,
},
},
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
wantErr: false,
},
{
name: "spec change allowed during Failed phase",
oldRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
Status: nvidiacomv1beta1.DynamoGraphDeploymentRequestStatus{
Phase: nvidiacomv1beta1.DGDRPhaseFailed,
},
},
newRequest: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-70b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "profiler:latest",
},
},
wantErr: false,
......@@ -376,12 +371,21 @@ func TestDynamoGraphDeploymentRequestValidator_ValidateUpdate(t *testing.T) {
warnings, err := validator.ValidateUpdate(tt.oldRequest)
if (err != nil) != tt.wantErr {
t.Errorf("DynamoGraphDeploymentRequestValidator.ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr)
t.Errorf("ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr)
return
}
if tt.wantErr && tt.errMsg != "" {
if !strings.Contains(err.Error(), tt.errMsg) {
t.Errorf("ValidateUpdate() error %q does not contain %q", err.Error(), tt.errMsg)
}
}
if tt.wantWarnings && len(warnings) == 0 {
t.Errorf("DynamoGraphDeploymentRequestValidator.ValidateUpdate() expected warnings but got none")
t.Errorf("ValidateUpdate() expected warnings but got none")
}
if !tt.wantWarnings && len(warnings) > 0 {
t.Errorf("ValidateUpdate() unexpected warnings: %v", warnings)
}
})
}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment