Unverified commit 72579ee7, authored by Thomas Montfort and committed by GitHub
Browse files

feat: add DGDR Status .state enum (#6396)

parent 7fb95825
...@@ -57,7 +57,7 @@ spec: ...@@ -57,7 +57,7 @@ spec:
specific performance and resource constraints, enabling SLA-driven deployments. specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle: Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling 1. Initializing → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC) 2. Pending → Profiling: Creates and runs profiling job (online or AIC)
3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes 3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
4. Deploying → Ready: When autoApply=true, monitors DGD until Ready 4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
...@@ -451,11 +451,19 @@ spec: ...@@ -451,11 +451,19 @@ spec:
Format: "configmap/\<name\>" Format: "configmap/\<name\>"
type: string type: string
state: state:
description: |- default: Initializing
State is a high-level textual status of the deployment request lifecycle. description: State is a high-level textual status of the deployment request lifecycle.
Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed" enum:
Empty string ("") represents the initial state before initialization. - Initializing
- Pending
- Profiling
- Deploying
- Ready
- DeploymentDeleted
- Failed
type: string type: string
required:
- state
type: object type: object
type: object type: object
served: true served: true
......
...@@ -28,8 +28,6 @@ import ( ...@@ -28,8 +28,6 @@ import (
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime" runtime "k8s.io/apimachinery/pkg/runtime"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
) )
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
...@@ -96,6 +94,19 @@ type ProfilingConfigSpec struct { ...@@ -96,6 +94,19 @@ type ProfilingConfigSpec struct {
NodeSelector map[string]string `json:"nodeSelector,omitempty"` NodeSelector map[string]string `json:"nodeSelector,omitempty"`
} }
// DGDRState is the high-level lifecycle state of a DynamoGraphDeploymentRequest,
// stored in the status `state` field. The set of allowed values is restricted
// by the Enum marker below.
// +kubebuilder:validation:Enum=Initializing;Pending;Profiling;Deploying;Ready;DeploymentDeleted;Failed
type DGDRState string
const (
// DGDRStateInitializing is the default state of a DGDR; the reconciler
// validates the spec from this state and handles an empty state the same way.
DGDRStateInitializing DGDRState = "Initializing"
// DGDRStatePending is set once the spec has been validated; from here the
// reconciler creates the profiling job.
DGDRStatePending DGDRState = "Pending"
// DGDRStateProfiling is set after the profiling job has been created; the
// reconciler monitors it and generates the DGD spec when it completes.
DGDRStateProfiling DGDRState = "Profiling"
// DGDRStateDeploying is entered after spec generation when autoApply is
// enabled, or from Ready when the DGD is reported degraded; the DGD is
// monitored until it is ready.
DGDRStateDeploying DGDRState = "Deploying"
// DGDRStateReady is set when the spec has been generated without autoApply,
// or when the monitored DGD reports a successful state.
DGDRStateReady DGDRState = "Ready"
// DGDRStateDeploymentDeleted is set when the DGD created for this request is
// found to have been deleted.
DGDRStateDeploymentDeleted DGDRState = "DeploymentDeleted"
// DGDRStateFailed is set when validation, profiling job creation, profiling,
// or spec generation fails, or when the reconciler encounters an unknown state.
DGDRStateFailed DGDRState = "Failed"
)
// DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments. // DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments.
// When autoApply is enabled, these overrides are applied to the generated DGD resource. // When autoApply is enabled, these overrides are applied to the generated DGD resource.
type DeploymentOverridesSpec struct { type DeploymentOverridesSpec struct {
...@@ -209,9 +220,8 @@ type DeploymentStatus struct { ...@@ -209,9 +220,8 @@ type DeploymentStatus struct {
// The controller updates this status as the DGDR progresses through its lifecycle. // The controller updates this status as the DGDR progresses through its lifecycle.
type DynamoGraphDeploymentRequestStatus struct { type DynamoGraphDeploymentRequestStatus struct {
// State is a high-level textual status of the deployment request lifecycle. // State is a high-level textual status of the deployment request lifecycle.
// Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed" // +kubebuilder:default=Initializing
// Empty string ("") represents the initial state before initialization. State DGDRState `json:"state"`
State string `json:"state,omitempty"`
// Backend is extracted from profilingConfig.config.engine.backend for display purposes. // Backend is extracted from profilingConfig.config.engine.backend for display purposes.
// This field is populated by the controller and shown in kubectl output. // This field is populated by the controller and shown in kubectl output.
...@@ -253,7 +263,7 @@ type DynamoGraphDeploymentRequestStatus struct { ...@@ -253,7 +263,7 @@ type DynamoGraphDeploymentRequestStatus struct {
// specific performance and resource constraints, enabling SLA-driven deployments. // specific performance and resource constraints, enabling SLA-driven deployments.
// //
// Lifecycle: // Lifecycle:
// 1. Initial → Pending: Validates spec and prepares for profiling // 1. Initializing → Pending: Validates spec and prepares for profiling
// 2. Pending → Profiling: Creates and runs profiling job (online or AIC) // 2. Pending → Profiling: Creates and runs profiling job (online or AIC)
// 3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes // 3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
// 4. Deploying → Ready: When autoApply=true, monitors DGD until Ready // 4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
...@@ -283,16 +293,13 @@ type DynamoGraphDeploymentRequest struct { ...@@ -283,16 +293,13 @@ type DynamoGraphDeploymentRequest struct {
} }
// SetState updates the State field in the DGDR status. // SetState updates the State field in the DGDR status.
func (s *DynamoGraphDeploymentRequest) SetState(state string) { func (s *DynamoGraphDeploymentRequest) SetState(state DGDRState) {
s.Status.State = state s.Status.State = state
} }
// GetState returns the current lifecycle state // GetState returns the current lifecycle state
func (d *DynamoGraphDeploymentRequest) GetState() string { func (d *DynamoGraphDeploymentRequest) GetState() string {
if d.Status.State == "" { return string(d.Status.State)
return consts.ResourceStateUnknown
}
return d.Status.State
} }
// GetSpec returns the spec of this DGDR as a generic interface. // GetSpec returns the spec of this DGDR as a generic interface.
......
...@@ -57,7 +57,7 @@ spec: ...@@ -57,7 +57,7 @@ spec:
specific performance and resource constraints, enabling SLA-driven deployments. specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle: Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling 1. Initializing → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC) 2. Pending → Profiling: Creates and runs profiling job (online or AIC)
3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes 3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
4. Deploying → Ready: When autoApply=true, monitors DGD until Ready 4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
...@@ -451,11 +451,19 @@ spec: ...@@ -451,11 +451,19 @@ spec:
Format: "configmap/\<name\>" Format: "configmap/\<name\>"
type: string type: string
state: state:
description: |- default: Initializing
State is a high-level textual status of the deployment request lifecycle. description: State is a high-level textual status of the deployment request lifecycle.
Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed" enum:
Empty string ("") represents the initial state before initialization. - Initializing
- Pending
- Profiling
- Deploying
- Ready
- DeploymentDeleted
- Failed
type: string type: string
required:
- state
type: object type: object
type: object type: object
served: true served: true
......
...@@ -53,15 +53,6 @@ import ( ...@@ -53,15 +53,6 @@ import (
) )
const ( const (
// DGDR state constants
DGDRStateEmpty = ""
DGDRStatePending = "Pending"
DGDRStateProfiling = "Profiling"
DGDRStateDeploying = "Deploying"
DGDRStateReady = "Ready"
DGDRStateDeploymentDeleted = "DeploymentDeleted"
DGDRStateFailed = "Failed"
// Condition types // Condition types
ConditionTypeValidation = "Validation" ConditionTypeValidation = "Validation"
ConditionTypeProfiling = "Profiling" ConditionTypeProfiling = "Profiling"
...@@ -373,8 +364,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context, ...@@ -373,8 +364,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context,
// Check for spec changes (immutability enforcement) // Check for spec changes (immutability enforcement)
if dgdr.Status.ObservedGeneration > 0 && dgdr.Status.ObservedGeneration != dgdr.Generation { if dgdr.Status.ObservedGeneration > 0 && dgdr.Status.ObservedGeneration != dgdr.Generation {
// Spec changed after initial processing // Spec changed after initial processing
if dgdr.Status.State == DGDRStateProfiling || dgdr.Status.State == DGDRStateDeploying || if dgdr.Status.State == nvidiacomv1alpha1.DGDRStateProfiling || dgdr.Status.State == nvidiacomv1alpha1.DGDRStateDeploying ||
dgdr.Status.State == DGDRStateReady || dgdr.Status.State == DGDRStateDeploymentDeleted { dgdr.Status.State == nvidiacomv1alpha1.DGDRStateReady || dgdr.Status.State == nvidiacomv1alpha1.DGDRStateDeploymentDeleted {
logger.Info("Spec change detected in immutable state", logger.Info("Spec change detected in immutable state",
"state", dgdr.Status.State, "state", dgdr.Status.State,
"observedGeneration", dgdr.Status.ObservedGeneration, "observedGeneration", dgdr.Status.ObservedGeneration,
...@@ -390,23 +381,23 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context, ...@@ -390,23 +381,23 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context,
} }
// State machine: handle different states // State machine: handle different states
switch dgdr.Status.State { switch dgdr.Status.State {
case DGDRStateEmpty: case nvidiacomv1alpha1.DGDRStateInitializing, "":
return r.handleInitialState(ctx, dgdr) return r.handleInitialState(ctx, dgdr)
case DGDRStatePending: case nvidiacomv1alpha1.DGDRStatePending:
return r.handlePendingState(ctx, dgdr) return r.handlePendingState(ctx, dgdr)
case DGDRStateProfiling: case nvidiacomv1alpha1.DGDRStateProfiling:
return r.handleProfilingState(ctx, dgdr) return r.handleProfilingState(ctx, dgdr)
case DGDRStateDeploying: case nvidiacomv1alpha1.DGDRStateDeploying:
return r.handleDeployingState(ctx, dgdr) return r.handleDeployingState(ctx, dgdr)
case DGDRStateReady: case nvidiacomv1alpha1.DGDRStateReady:
return r.handleReadyState(ctx, dgdr) return r.handleReadyState(ctx, dgdr)
case DGDRStateDeploymentDeleted: case nvidiacomv1alpha1.DGDRStateDeploymentDeleted:
return r.handleDeploymentDeletedState(ctx, dgdr) return r.handleDeploymentDeletedState(ctx, dgdr)
case DGDRStateFailed: case nvidiacomv1alpha1.DGDRStateFailed:
return r.handleFailedState(ctx, dgdr) return r.handleFailedState(ctx, dgdr)
default: default:
logger.Info("Unknown state", "state", dgdr.Status.State) logger.Info("Unknown state", "state", dgdr.Status.State)
return r.updateStateAndRequeue(ctx, dgdr, DGDRStateFailed, MessageInvalidState) return r.updateStateAndRequeue(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, MessageInvalidState)
} }
} }
...@@ -418,7 +409,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context. ...@@ -418,7 +409,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context.
// Validate the spec // Validate the spec
if err := r.validateSpec(ctx, dgdr); err != nil { if err := r.validateSpec(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonValidationFailed, err.Error()) r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonValidationFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, DGDRStateFailed, ConditionTypeValidation, metav1.ConditionFalse, EventReasonValidationFailed, err.Error()) return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeValidation, metav1.ConditionFalse, EventReasonValidationFailed, err.Error())
} }
// Set observedGeneration to track the spec we're processing // Set observedGeneration to track the spec we're processing
...@@ -429,7 +420,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context. ...@@ -429,7 +420,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context.
// Initialize status // Initialize status
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonInitialized, MessageInitialized) r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonInitialized, MessageInitialized)
return r.updateStateAndRequeue(ctx, dgdr, DGDRStatePending, MessageInitialized) return r.updateStateAndRequeue(ctx, dgdr, nvidiacomv1alpha1.DGDRStatePending, MessageInitialized)
} }
// handlePendingState starts the profiling process // handlePendingState starts the profiling process
...@@ -440,7 +431,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingState(ctx context. ...@@ -440,7 +431,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingState(ctx context.
// Create profiling job (online or AIC) // Create profiling job (online or AIC)
if err := r.createProfilingJob(ctx, dgdr); err != nil { if err := r.createProfilingJob(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonProfilingJobFailed, err.Error()) r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonProfilingJobFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, MessageJobCreationFailed, err.Error()) return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, MessageJobCreationFailed, err.Error())
} }
// Record event with appropriate message // Record event with appropriate message
...@@ -451,7 +442,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingState(ctx context. ...@@ -451,7 +442,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingState(ctx context.
} }
// Update to Profiling state with Running status // Update to Profiling state with Running status
return r.updateStateWithCondition(ctx, dgdr, DGDRStateProfiling, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingRunning", MessageProfilingInProgress) return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateProfiling, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingRunning", MessageProfilingInProgress)
} }
// handleProfilingState monitors profiling progress and generates spec when complete // handleProfilingState monitors profiling progress and generates spec when complete
...@@ -465,7 +456,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex ...@@ -465,7 +456,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
if err != nil { if err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageProfilingCheckFailed, err.Error()) r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageProfilingCheckFailed, err.Error())
// Job failed - transition to Failed state // Job failed - transition to Failed state
return r.updateStateWithCondition(ctx, dgdr, DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingFailed", err.Error()) return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingFailed", err.Error())
} }
if !completed { if !completed {
...@@ -486,7 +477,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex ...@@ -486,7 +477,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
// Retrieve profiling results and generate spec // Retrieve profiling results and generate spec
if err := r.generateDGDSpec(ctx, dgdr); err != nil { if err := r.generateDGDSpec(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageGenerationFailed, err.Error()) r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageGenerationFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, DGDRStateFailed, ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, err.Error()) return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, err.Error())
} }
// Record spec generation event // Record spec generation event
...@@ -508,11 +499,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex ...@@ -508,11 +499,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
// If autoApply is enabled, transition to Deploying state // If autoApply is enabled, transition to Deploying state
if dgdr.Spec.AutoApply { if dgdr.Spec.AutoApply {
logger.Info("AutoApply enabled, transitioning to Deploying state") logger.Info("AutoApply enabled, transitioning to Deploying state")
return r.updateStateWithCondition(ctx, dgdr, DGDRStateDeploying, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecGenerated) return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateDeploying, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecGenerated)
} }
// Otherwise, transition to Ready state // Otherwise, transition to Ready state
return r.updateStateWithCondition(ctx, dgdr, DGDRStateReady, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecAvailable) return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateReady, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecAvailable)
} }
// handleReadyState handles DGDR in Ready state // handleReadyState handles DGDR in Ready state
...@@ -549,7 +540,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleReadyState(ctx context.Co ...@@ -549,7 +540,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleReadyState(ctx context.Co
logger.Info("DGD degraded, transitioning back to Deploying", logger.Info("DGD degraded, transitioning back to Deploying",
"dgdState", dgd.Status.State) "dgdState", dgd.Status.State)
dgdr.Status.State = DGDRStateDeploying dgdr.Status.State = nvidiacomv1alpha1.DGDRStateDeploying
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDegraded, r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDegraded,
fmt.Sprintf(MessageDeploymentDegraded, dgd.Name, string(dgd.Status.State))) fmt.Sprintf(MessageDeploymentDegraded, dgd.Name, string(dgd.Status.State)))
...@@ -573,7 +564,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx contex ...@@ -573,7 +564,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx contex
if !dgdr.Spec.AutoApply { if !dgdr.Spec.AutoApply {
// Shouldn't be in this state without autoApply // Shouldn't be in this state without autoApply
logger.Info("AutoApply not enabled, transitioning to Ready") logger.Info("AutoApply not enabled, transitioning to Ready")
dgdr.Status.State = DGDRStateReady dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady
return ctrl.Result{}, r.Status().Update(ctx, dgdr) return ctrl.Result{}, r.Status().Update(ctx, dgdr)
} }
...@@ -604,7 +595,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx contex ...@@ -604,7 +595,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx contex
// Check if DGD is Ready // Check if DGD is Ready
if dgd.Status.State == nvidiacomv1alpha1.DGDStateSuccessful { if dgd.Status.State == nvidiacomv1alpha1.DGDStateSuccessful {
logger.Info("DGD is Ready, transitioning to Ready state") logger.Info("DGD is Ready, transitioning to Ready state")
dgdr.Status.State = DGDRStateReady dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonDeploymentReady, r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonDeploymentReady,
fmt.Sprintf(MessageDeploymentReady, dgd.Name)) fmt.Sprintf(MessageDeploymentReady, dgd.Name))
...@@ -632,7 +623,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Co ...@@ -632,7 +623,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Co
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("DGD was deleted by user, transitioning to DeploymentDeleted state") logger.Info("DGD was deleted by user, transitioning to DeploymentDeleted state")
dgdr.Status.State = DGDRStateDeploymentDeleted dgdr.Status.State = nvidiacomv1alpha1.DGDRStateDeploymentDeleted
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDeleted, r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDeleted,
fmt.Sprintf(MessageDeploymentDeleted, dgdr.Status.Deployment.Name)) fmt.Sprintf(MessageDeploymentDeleted, dgdr.Status.Deployment.Name))
...@@ -1732,7 +1723,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) extractDGDFromYAML(yamlContent ...@@ -1732,7 +1723,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) extractDGDFromYAML(yamlContent
} }
// updateStateAndRequeue updates the DGDR state and requeues // updateStateAndRequeue updates the DGDR state and requeues
func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, state, _ string) (ctrl.Result, error) { func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, state nvidiacomv1alpha1.DGDRState, _ string) (ctrl.Result, error) {
dgdr.Status.State = state dgdr.Status.State = state
if err := r.Status().Update(ctx, dgdr); err != nil { if err := r.Status().Update(ctx, dgdr); err != nil {
return ctrl.Result{}, err return ctrl.Result{}, err
...@@ -1744,7 +1735,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx conte ...@@ -1744,7 +1735,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx conte
func (r *DynamoGraphDeploymentRequestReconciler) updateStateWithCondition( func (r *DynamoGraphDeploymentRequestReconciler) updateStateWithCondition(
ctx context.Context, ctx context.Context,
dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest,
state string, state nvidiacomv1alpha1.DGDRState,
conditionType string, conditionType string,
status metav1.ConditionStatus, status metav1.ConditionStatus,
reason string, reason string,
......
...@@ -140,11 +140,11 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -140,11 +140,11 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Check status // Check status
Eventually(func() string { Eventually(func() nvidiacomv1alpha1.DGDRState {
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
return updated.Status.State return updated.Status.State
}, timeout, interval).Should(Equal(DGDRStatePending)) }, timeout, interval).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
// Verify observedGeneration is set // Verify observedGeneration is set
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
...@@ -190,11 +190,11 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -190,11 +190,11 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Check status transitions to Pending (not Failed) // Check status transitions to Pending (not Failed)
Eventually(func() string { Eventually(func() nvidiacomv1alpha1.DGDRState {
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
return updated.Status.State return updated.Status.State
}, timeout, interval).Should(Equal(DGDRStatePending)) }, timeout, interval).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
}) })
}) })
...@@ -420,7 +420,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -420,7 +420,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }() defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Profiling using Status subresource // Update status to Profiling using Status subresource
dgdr.Status.State = DGDRStateProfiling dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create completed profiling job // Create completed profiling job
...@@ -495,7 +495,7 @@ spec: ...@@ -495,7 +495,7 @@ spec:
Expect(updated.Status.GeneratedDeployment).NotTo(BeNil()) Expect(updated.Status.GeneratedDeployment).NotTo(BeNil())
// Verify state transitioned to Ready (since autoApply is false by default) // Verify state transitioned to Ready (since autoApply is false by default)
Expect(updated.Status.State).Should(Equal(DGDRStateReady)) Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateReady))
}) })
}) })
...@@ -535,7 +535,7 @@ spec: ...@@ -535,7 +535,7 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, dgdr) }() defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Profiling using Status subresource // Update status to Profiling using Status subresource
dgdr.Status.State = DGDRStateProfiling dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create completed profiling job // Create completed profiling job
...@@ -605,7 +605,7 @@ spec: ...@@ -605,7 +605,7 @@ spec:
// Get updated DGDR and check state is Deploying // Get updated DGDR and check state is Deploying
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(DGDRStateDeploying)) Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateDeploying))
// Reconcile again to create DGD // Reconcile again to create DGD
_, err = reconciler.Reconcile(ctx, reconcile.Request{ _, err = reconciler.Reconcile(ctx, reconcile.Request{
...@@ -676,7 +676,7 @@ spec: ...@@ -676,7 +676,7 @@ spec:
observedGeneration := current.Status.ObservedGeneration observedGeneration := current.Status.ObservedGeneration
// Manually set state to Profiling to simulate in-progress profiling // Manually set state to Profiling to simulate in-progress profiling
current.Status.State = DGDRStateProfiling current.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
Expect(k8sClient.Status().Update(ctx, &current)).Should(Succeed()) Expect(k8sClient.Status().Update(ctx, &current)).Should(Succeed())
// Try to modify spec // Try to modify spec
...@@ -698,7 +698,7 @@ spec: ...@@ -698,7 +698,7 @@ spec:
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed())
Expect(current.Generation).Should(BeNumerically(">", initialGeneration)) Expect(current.Generation).Should(BeNumerically(">", initialGeneration))
Expect(current.Status.ObservedGeneration).Should(Equal(observedGeneration)) Expect(current.Status.ObservedGeneration).Should(Equal(observedGeneration))
Expect(current.Status.State).Should(Equal(DGDRStateProfiling)) // State unchanged Expect(current.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateProfiling)) // State unchanged
// Verify event was recorded // Verify event was recorded
Eventually(func() bool { Eventually(func() bool {
...@@ -748,7 +748,7 @@ spec: ...@@ -748,7 +748,7 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, dgdr) }() defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Ready with Deployment info using Status subresource // Update status to Ready with Deployment info using Status subresource
dgdr.Status.State = DGDRStateReady dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady
dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{ dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{
Name: "test-dgd-to-delete", Name: "test-dgd-to-delete",
Namespace: namespace, Namespace: namespace,
...@@ -766,7 +766,7 @@ spec: ...@@ -766,7 +766,7 @@ spec:
// Get updated DGDR and check state transitioned to DeploymentDeleted // Get updated DGDR and check state transitioned to DeploymentDeleted
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(DGDRStateDeploymentDeleted)) Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateDeploymentDeleted))
}) })
}) })
}) })
...@@ -1241,7 +1241,7 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1241,7 +1241,7 @@ var _ = Describe("DGDR Error Handling", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }() defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Set status to Profiling // Set status to Profiling
dgdr.Status.State = DGDRStateProfiling dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create failed job // Create failed job
...@@ -1323,7 +1323,7 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1323,7 +1323,7 @@ var _ = Describe("DGDR Error Handling", func() {
// Verify DGDR transitioned to Failed state // Verify DGDR transitioned to Failed state
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(DGDRStateFailed)) Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateFailed))
// Verify error condition contains detailed error // Verify error condition contains detailed error
condition := meta.FindStatusCondition(updated.Status.Conditions, ConditionTypeProfiling) condition := meta.FindStatusCondition(updated.Status.Conditions, ConditionTypeProfiling)
...@@ -1560,7 +1560,7 @@ spec: ...@@ -1560,7 +1560,7 @@ spec:
// Should transition to Pending (validation passed) // Should transition to Pending (validation passed)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending)) Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
}) })
It("Should respect manual hardware config over GPU discovery", func() { It("Should respect manual hardware config over GPU discovery", func() {
...@@ -1623,7 +1623,7 @@ spec: ...@@ -1623,7 +1623,7 @@ spec:
// Should transition to Pending (validation passed with manual config) // Should transition to Pending (validation passed with manual config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending)) Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
}) })
It("Should succeed with GPU discovery when cluster has GPU nodes", func() { It("Should succeed with GPU discovery when cluster has GPU nodes", func() {
...@@ -1680,7 +1680,7 @@ spec: ...@@ -1680,7 +1680,7 @@ spec:
// Should transition to Pending // Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending)) Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
}) })
It("Should pass validation with explicit GPU ranges without GPU discovery", func() { It("Should pass validation with explicit GPU ranges without GPU discovery", func() {
...@@ -1731,7 +1731,7 @@ spec: ...@@ -1731,7 +1731,7 @@ spec:
// Should transition to Pending // Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending)) Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
}) })
It("Should use GPU discovery with heterogeneous nodes (picks best)", func() { It("Should use GPU discovery with heterogeneous nodes (picks best)", func() {
...@@ -1803,7 +1803,7 @@ spec: ...@@ -1803,7 +1803,7 @@ spec:
// Should transition to Pending (using H100 config) // Should transition to Pending (using H100 config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending)) Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
}) })
}) })
}) })
...@@ -112,6 +112,29 @@ _Appears in:_ ...@@ -112,6 +112,29 @@ _Appears in:_
| `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | | | `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | |
#### DGDRState
_Underlying type:_ _string_
_Validation:_
- Enum: [Initializing Pending Profiling Deploying Ready DeploymentDeleted Failed]
_Appears in:_
- [DynamoGraphDeploymentRequestStatus](#dynamographdeploymentrequeststatus)
| Field | Description |
| --- | --- |
| `Initializing` | |
| `Pending` | |
| `Profiling` | |
| `Deploying` | |
| `Ready` | |
| `DeploymentDeleted` | |
| `Failed` | |
#### DGDState #### DGDState
_Underlying type:_ _string_ _Underlying type:_ _string_
...@@ -442,7 +465,7 @@ It serves as the primary interface for users to request model deployments with ...@@ -442,7 +465,7 @@ It serves as the primary interface for users to request model deployments with
specific performance and resource constraints, enabling SLA-driven deployments. specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle: Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling 1. Initializing → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC) 2. Pending → Profiling: Creates and runs profiling job (online or AIC)
3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes 3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
4. Deploying → Ready: When autoApply=true, monitors DGD until Ready 4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
...@@ -503,7 +526,7 @@ _Appears in:_ ...@@ -503,7 +526,7 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `state` _string_ | State is a high-level textual status of the deployment request lifecycle.<br />Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"<br />Empty string ("") represents the initial state before initialization. | | | | `state` _[DGDRState](#dgdrstate)_ | State is a high-level textual status of the deployment request lifecycle. | Initializing | Enum: [Initializing Pending Profiling Deploying Ready DeploymentDeleted Failed] <br /> |
| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. | | Optional: \{\} <br /> | | `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. | | Optional: \{\} <br /> |
| `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.<br />Used to detect spec changes and enforce immutability after profiling starts. | | | | `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.<br />Used to detect spec changes and enforce immutability after profiling starts. | | |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.<br />Conditions are merged by type on patch updates. | | | | `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.<br />Conditions are merged by type on patch updates. | | |
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment