Unverified Commit 72579ee7 authored by Thomas Montfort's avatar Thomas Montfort Committed by GitHub
Browse files

feat: add DGDR Status .state enum (#6396)

parent 7fb95825
......@@ -57,7 +57,7 @@ spec:
specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling
1. Initializing → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC)
3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
......@@ -451,11 +451,19 @@ spec:
Format: "configmap/\<name\>"
type: string
state:
description: |-
State is a high-level textual status of the deployment request lifecycle.
Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
Empty string ("") represents the initial state before initialization.
default: Initializing
description: State is a high-level textual status of the deployment request lifecycle.
enum:
- Initializing
- Pending
- Profiling
- Deploying
- Ready
- DeploymentDeleted
- Failed
type: string
required:
- state
type: object
type: object
served: true
......
......@@ -28,8 +28,6 @@ import (
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
)
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
......@@ -96,6 +94,19 @@ type ProfilingConfigSpec struct {
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
}
// DGDRState is the high-level lifecycle state reported in a
// DynamoGraphDeploymentRequest's status. It is a string-backed type whose
// permitted values are restricted by the Enum marker below to the constants
// declared in this block.
// +kubebuilder:validation:Enum=Initializing;Pending;Profiling;Deploying;Ready;DeploymentDeleted;Failed
type DGDRState string

// Lifecycle states of a DynamoGraphDeploymentRequest. The string values are
// part of the API surface (they appear in the CRD enum), so they must not be
// changed without a corresponding CRD update.
const (
// DGDRStateInitializing is the initial state (and the CRD default for
// status.state): the spec has not yet been validated.
DGDRStateInitializing DGDRState = "Initializing"
// DGDRStatePending means the request was initialized and is waiting for its
// profiling job to be created.
DGDRStatePending DGDRState = "Pending"
// DGDRStateProfiling means the profiling job (online or AIC) has been
// created and is being monitored.
DGDRStateProfiling DGDRState = "Profiling"
// DGDRStateDeploying means the DGD spec was generated and, with
// autoApply=true, the DGD is being monitored until it becomes Ready.
DGDRStateDeploying DGDRState = "Deploying"
// DGDRStateReady means the generated spec is available or, when autoApply
// is enabled, that the DGD has been observed as Ready.
DGDRStateReady DGDRState = "Ready"
// DGDRStateDeploymentDeleted means the DGD associated with this request was
// deleted.
DGDRStateDeploymentDeleted DGDRState = "DeploymentDeleted"
// DGDRStateFailed means a lifecycle step failed (for example validation,
// profiling job creation, profiling itself, or spec generation).
DGDRStateFailed DGDRState = "Failed"
)
// DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments.
// When autoApply is enabled, these overrides are applied to the generated DGD resource.
type DeploymentOverridesSpec struct {
......@@ -209,9 +220,8 @@ type DeploymentStatus struct {
// The controller updates this status as the DGDR progresses through its lifecycle.
type DynamoGraphDeploymentRequestStatus struct {
// State is a high-level textual status of the deployment request lifecycle.
// Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
// Empty string ("") represents the initial state before initialization.
State string `json:"state,omitempty"`
// +kubebuilder:default=Initializing
State DGDRState `json:"state"`
// Backend is extracted from profilingConfig.config.engine.backend for display purposes.
// This field is populated by the controller and shown in kubectl output.
......@@ -253,7 +263,7 @@ type DynamoGraphDeploymentRequestStatus struct {
// specific performance and resource constraints, enabling SLA-driven deployments.
//
// Lifecycle:
// 1. Initial → Pending: Validates spec and prepares for profiling
// 1. Initializing → Pending: Validates spec and prepares for profiling
// 2. Pending → Profiling: Creates and runs profiling job (online or AIC)
// 3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
// 4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
......@@ -283,16 +293,13 @@ type DynamoGraphDeploymentRequest struct {
}
// SetState updates the State field in the DGDR status.
func (s *DynamoGraphDeploymentRequest) SetState(state string) {
func (s *DynamoGraphDeploymentRequest) SetState(state DGDRState) {
s.Status.State = state
}
// GetState returns the current lifecycle state as a string.
func (d *DynamoGraphDeploymentRequest) GetState() string {
if d.Status.State == "" {
return consts.ResourceStateUnknown
}
return d.Status.State
return string(d.Status.State)
}
// GetSpec returns the spec of this DGDR as a generic interface.
......
......@@ -57,7 +57,7 @@ spec:
specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling
1. Initializing → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC)
3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
......@@ -451,11 +451,19 @@ spec:
Format: "configmap/\<name\>"
type: string
state:
description: |-
State is a high-level textual status of the deployment request lifecycle.
Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
Empty string ("") represents the initial state before initialization.
default: Initializing
description: State is a high-level textual status of the deployment request lifecycle.
enum:
- Initializing
- Pending
- Profiling
- Deploying
- Ready
- DeploymentDeleted
- Failed
type: string
required:
- state
type: object
type: object
served: true
......
......@@ -53,15 +53,6 @@ import (
)
const (
// DGDR state constants
DGDRStateEmpty = ""
DGDRStatePending = "Pending"
DGDRStateProfiling = "Profiling"
DGDRStateDeploying = "Deploying"
DGDRStateReady = "Ready"
DGDRStateDeploymentDeleted = "DeploymentDeleted"
DGDRStateFailed = "Failed"
// Condition types
ConditionTypeValidation = "Validation"
ConditionTypeProfiling = "Profiling"
......@@ -373,8 +364,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context,
// Check for spec changes (immutability enforcement)
if dgdr.Status.ObservedGeneration > 0 && dgdr.Status.ObservedGeneration != dgdr.Generation {
// Spec changed after initial processing
if dgdr.Status.State == DGDRStateProfiling || dgdr.Status.State == DGDRStateDeploying ||
dgdr.Status.State == DGDRStateReady || dgdr.Status.State == DGDRStateDeploymentDeleted {
if dgdr.Status.State == nvidiacomv1alpha1.DGDRStateProfiling || dgdr.Status.State == nvidiacomv1alpha1.DGDRStateDeploying ||
dgdr.Status.State == nvidiacomv1alpha1.DGDRStateReady || dgdr.Status.State == nvidiacomv1alpha1.DGDRStateDeploymentDeleted {
logger.Info("Spec change detected in immutable state",
"state", dgdr.Status.State,
"observedGeneration", dgdr.Status.ObservedGeneration,
......@@ -390,23 +381,23 @@ func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context,
}
// State machine: handle different states
switch dgdr.Status.State {
case DGDRStateEmpty:
case nvidiacomv1alpha1.DGDRStateInitializing, "":
return r.handleInitialState(ctx, dgdr)
case DGDRStatePending:
case nvidiacomv1alpha1.DGDRStatePending:
return r.handlePendingState(ctx, dgdr)
case DGDRStateProfiling:
case nvidiacomv1alpha1.DGDRStateProfiling:
return r.handleProfilingState(ctx, dgdr)
case DGDRStateDeploying:
case nvidiacomv1alpha1.DGDRStateDeploying:
return r.handleDeployingState(ctx, dgdr)
case DGDRStateReady:
case nvidiacomv1alpha1.DGDRStateReady:
return r.handleReadyState(ctx, dgdr)
case DGDRStateDeploymentDeleted:
case nvidiacomv1alpha1.DGDRStateDeploymentDeleted:
return r.handleDeploymentDeletedState(ctx, dgdr)
case DGDRStateFailed:
case nvidiacomv1alpha1.DGDRStateFailed:
return r.handleFailedState(ctx, dgdr)
default:
logger.Info("Unknown state", "state", dgdr.Status.State)
return r.updateStateAndRequeue(ctx, dgdr, DGDRStateFailed, MessageInvalidState)
return r.updateStateAndRequeue(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, MessageInvalidState)
}
}
......@@ -418,7 +409,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context.
// Validate the spec
if err := r.validateSpec(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonValidationFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, DGDRStateFailed, ConditionTypeValidation, metav1.ConditionFalse, EventReasonValidationFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeValidation, metav1.ConditionFalse, EventReasonValidationFailed, err.Error())
}
// Set observedGeneration to track the spec we're processing
......@@ -429,7 +420,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context.
// Initialize status
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonInitialized, MessageInitialized)
return r.updateStateAndRequeue(ctx, dgdr, DGDRStatePending, MessageInitialized)
return r.updateStateAndRequeue(ctx, dgdr, nvidiacomv1alpha1.DGDRStatePending, MessageInitialized)
}
// handlePendingState starts the profiling process
......@@ -440,7 +431,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingState(ctx context.
// Create profiling job (online or AIC)
if err := r.createProfilingJob(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonProfilingJobFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, MessageJobCreationFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, MessageJobCreationFailed, err.Error())
}
// Record event with appropriate message
......@@ -451,7 +442,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingState(ctx context.
}
// Update to Profiling state with Running status
return r.updateStateWithCondition(ctx, dgdr, DGDRStateProfiling, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingRunning", MessageProfilingInProgress)
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateProfiling, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingRunning", MessageProfilingInProgress)
}
// handleProfilingState monitors profiling progress and generates spec when complete
......@@ -465,7 +456,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
if err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageProfilingCheckFailed, err.Error())
// Job failed - transition to Failed state
return r.updateStateWithCondition(ctx, dgdr, DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingFailed", err.Error())
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingFailed", err.Error())
}
if !completed {
......@@ -486,7 +477,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
// Retrieve profiling results and generate spec
if err := r.generateDGDSpec(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageGenerationFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, DGDRStateFailed, ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, err.Error())
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateFailed, ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, err.Error())
}
// Record spec generation event
......@@ -508,11 +499,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
// If autoApply is enabled, transition to Deploying state
if dgdr.Spec.AutoApply {
logger.Info("AutoApply enabled, transitioning to Deploying state")
return r.updateStateWithCondition(ctx, dgdr, DGDRStateDeploying, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecGenerated)
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateDeploying, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecGenerated)
}
// Otherwise, transition to Ready state
return r.updateStateWithCondition(ctx, dgdr, DGDRStateReady, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecAvailable)
return r.updateStateWithCondition(ctx, dgdr, nvidiacomv1alpha1.DGDRStateReady, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecAvailable)
}
// handleReadyState handles DGDR in Ready state
......@@ -549,7 +540,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleReadyState(ctx context.Co
logger.Info("DGD degraded, transitioning back to Deploying",
"dgdState", dgd.Status.State)
dgdr.Status.State = DGDRStateDeploying
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateDeploying
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDegraded,
fmt.Sprintf(MessageDeploymentDegraded, dgd.Name, string(dgd.Status.State)))
......@@ -573,7 +564,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx contex
if !dgdr.Spec.AutoApply {
// Shouldn't be in this state without autoApply
logger.Info("AutoApply not enabled, transitioning to Ready")
dgdr.Status.State = DGDRStateReady
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady
return ctrl.Result{}, r.Status().Update(ctx, dgdr)
}
......@@ -604,7 +595,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx contex
// Check if DGD is Ready
if dgd.Status.State == nvidiacomv1alpha1.DGDStateSuccessful {
logger.Info("DGD is Ready, transitioning to Ready state")
dgdr.Status.State = DGDRStateReady
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonDeploymentReady,
fmt.Sprintf(MessageDeploymentReady, dgd.Name))
......@@ -632,7 +623,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Co
logger := log.FromContext(ctx)
logger.Info("DGD was deleted by user, transitioning to DeploymentDeleted state")
dgdr.Status.State = DGDRStateDeploymentDeleted
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateDeploymentDeleted
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDeleted,
fmt.Sprintf(MessageDeploymentDeleted, dgdr.Status.Deployment.Name))
......@@ -1732,7 +1723,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) extractDGDFromYAML(yamlContent
}
// updateStateAndRequeue updates the DGDR state and requeues
func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, state, _ string) (ctrl.Result, error) {
func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, state nvidiacomv1alpha1.DGDRState, _ string) (ctrl.Result, error) {
dgdr.Status.State = state
if err := r.Status().Update(ctx, dgdr); err != nil {
return ctrl.Result{}, err
......@@ -1744,7 +1735,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx conte
func (r *DynamoGraphDeploymentRequestReconciler) updateStateWithCondition(
ctx context.Context,
dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest,
state string,
state nvidiacomv1alpha1.DGDRState,
conditionType string,
status metav1.ConditionStatus,
reason string,
......
......@@ -140,11 +140,11 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(err).NotTo(HaveOccurred())
// Check status
Eventually(func() string {
Eventually(func() nvidiacomv1alpha1.DGDRState {
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
return updated.Status.State
}, timeout, interval).Should(Equal(DGDRStatePending))
}, timeout, interval).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
// Verify observedGeneration is set
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
......@@ -190,11 +190,11 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(err).NotTo(HaveOccurred())
// Check status transitions to Pending (not Failed)
Eventually(func() string {
Eventually(func() nvidiacomv1alpha1.DGDRState {
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
return updated.Status.State
}, timeout, interval).Should(Equal(DGDRStatePending))
}, timeout, interval).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
})
})
......@@ -420,7 +420,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Profiling using Status subresource
dgdr.Status.State = DGDRStateProfiling
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create completed profiling job
......@@ -495,7 +495,7 @@ spec:
Expect(updated.Status.GeneratedDeployment).NotTo(BeNil())
// Verify state transitioned to Ready (since autoApply is false by default)
Expect(updated.Status.State).Should(Equal(DGDRStateReady))
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateReady))
})
})
......@@ -535,7 +535,7 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Profiling using Status subresource
dgdr.Status.State = DGDRStateProfiling
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create completed profiling job
......@@ -605,7 +605,7 @@ spec:
// Get updated DGDR and check state is Deploying
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(DGDRStateDeploying))
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateDeploying))
// Reconcile again to create DGD
_, err = reconciler.Reconcile(ctx, reconcile.Request{
......@@ -676,7 +676,7 @@ spec:
observedGeneration := current.Status.ObservedGeneration
// Manually set state to Profiling to simulate in-progress profiling
current.Status.State = DGDRStateProfiling
current.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
Expect(k8sClient.Status().Update(ctx, &current)).Should(Succeed())
// Try to modify spec
......@@ -698,7 +698,7 @@ spec:
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &current)).Should(Succeed())
Expect(current.Generation).Should(BeNumerically(">", initialGeneration))
Expect(current.Status.ObservedGeneration).Should(Equal(observedGeneration))
Expect(current.Status.State).Should(Equal(DGDRStateProfiling)) // State unchanged
Expect(current.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateProfiling)) // State unchanged
// Verify event was recorded
Eventually(func() bool {
......@@ -748,7 +748,7 @@ spec:
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Update status to Ready with Deployment info using Status subresource
dgdr.Status.State = DGDRStateReady
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateReady
dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{
Name: "test-dgd-to-delete",
Namespace: namespace,
......@@ -766,7 +766,7 @@ spec:
// Get updated DGDR and check state transitioned to DeploymentDeleted
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(DGDRStateDeploymentDeleted))
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateDeploymentDeleted))
})
})
})
......@@ -1241,7 +1241,7 @@ var _ = Describe("DGDR Error Handling", func() {
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Set status to Profiling
dgdr.Status.State = DGDRStateProfiling
dgdr.Status.State = nvidiacomv1alpha1.DGDRStateProfiling
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
// Create failed job
......@@ -1323,7 +1323,7 @@ var _ = Describe("DGDR Error Handling", func() {
// Verify DGDR transitioned to Failed state
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.State).Should(Equal(DGDRStateFailed))
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStateFailed))
// Verify error condition contains detailed error
condition := meta.FindStatusCondition(updated.Status.Conditions, ConditionTypeProfiling)
......@@ -1560,7 +1560,7 @@ spec:
// Should transition to Pending (validation passed)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
})
It("Should respect manual hardware config over GPU discovery", func() {
......@@ -1623,7 +1623,7 @@ spec:
// Should transition to Pending (validation passed with manual config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
})
It("Should succeed with GPU discovery when cluster has GPU nodes", func() {
......@@ -1680,7 +1680,7 @@ spec:
// Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
})
It("Should pass validation with explicit GPU ranges without GPU discovery", func() {
......@@ -1731,7 +1731,7 @@ spec:
// Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
})
It("Should use GPU discovery with heterogeneous nodes (picks best)", func() {
......@@ -1803,7 +1803,7 @@ spec:
// Should transition to Pending (using H100 config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
Expect(updated.Status.State).Should(Equal(nvidiacomv1alpha1.DGDRStatePending))
})
})
})
......@@ -112,6 +112,29 @@ _Appears in:_
| `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | |
#### DGDRState
_Underlying type:_ _string_
_Validation:_
- Enum: [Initializing Pending Profiling Deploying Ready DeploymentDeleted Failed]
_Appears in:_
- [DynamoGraphDeploymentRequestStatus](#dynamographdeploymentrequeststatus)
| Field | Description |
| --- | --- |
| `Initializing` | |
| `Pending` | |
| `Profiling` | |
| `Deploying` | |
| `Ready` | |
| `DeploymentDeleted` | |
| `Failed` | |
#### DGDState
_Underlying type:_ _string_
......@@ -442,7 +465,7 @@ It serves as the primary interface for users to request model deployments with
specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling
1. Initializing → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC)
3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
......@@ -503,7 +526,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `state` _string_ | State is a high-level textual status of the deployment request lifecycle.<br />Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"<br />Empty string ("") represents the initial state before initialization. | | |
| `state` _[DGDRState](#dgdrstate)_ | State is a high-level textual status of the deployment request lifecycle. | Initializing | Enum: [Initializing Pending Profiling Deploying Ready DeploymentDeleted Failed] <br /> |
| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. | | Optional: \{\} <br /> |
| `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.<br />Used to detect spec changes and enforce immutability after profiling starts. | | |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.<br />Conditions are merged by type on patch updates. | | |
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment