Commit 82f721c7 (unverified), authored by Nikita, committed by GitHub
Browse files

feat: Add DGD .status.state enum (#6324)


Signed-off-by: Nikita Sukharev <kaonael@gmail.com>
parent af17da35
......@@ -416,10 +416,18 @@ spec:
description: Namespace is the namespace of the created DynamoGraphDeployment.
type: string
state:
default: initializing
description: |-
State is the current state of the DynamoGraphDeployment.
This value is mirrored from the DGD's status.state field.
enum:
- initializing
- pending
- successful
- failed
type: string
required:
- state
type: object
generatedDeployment:
description: |-
......
......@@ -11487,8 +11487,16 @@ spec:
The map key is the service name from spec.services.
type: object
state:
default: initializing
description: State is a high-level textual status of the graph deployment lifecycle.
enum:
- initializing
- pending
- successful
- failed
type: string
required:
- state
type: object
type: object
served: true
......
......@@ -44,6 +44,16 @@ const (
ComponentKindLeaderWorkerSet ComponentKind = "LeaderWorkerSet"
)
// DGDState is the string-typed lifecycle state of a DynamoGraphDeployment.
// It is the type of DynamoGraphDeploymentStatus.State and of
// DeploymentStatus.State; the marker below restricts it to the listed values
// in the generated CRD schema.
// +kubebuilder:validation:Enum=initializing;pending;successful;failed
type DGDState string
// Allowed DGDState values. Keep this list in sync with the Enum marker on DGDState.
const (
// DGDStateInitializing is the value declared as the kubebuilder default for the state fields.
// NOTE(review): no controller code visible here assigns it explicitly — confirm it is only ever set via defaulting.
DGDStateInitializing DGDState = "initializing"
// DGDStatePending is the state a reconcile starts from and the state reported while some resources are not ready.
DGDStatePending DGDState = "pending"
// DGDStateSuccessful is reported when all resources are ready; the reconciler sets the ready condition to true only for this state.
DGDStateSuccessful DGDState = "successful"
// DGDStateFailed is set when reconciliation returns an error or validation of the deployment fails.
DGDStateFailed DGDState = "failed"
)
// DynamoGraphDeploymentSpec defines the desired state of DynamoGraphDeployment.
type DynamoGraphDeploymentSpec struct {
// PVCs defines a list of persistent volume claims that can be referenced by components.
......@@ -101,7 +111,8 @@ const (
// DynamoGraphDeploymentStatus defines the observed state of DynamoGraphDeployment.
type DynamoGraphDeploymentStatus struct {
// State is a high-level textual status of the graph deployment lifecycle.
State string `json:"state,omitempty"`
// +kubebuilder:default=initializing
State DGDState `json:"state"`
// Conditions contains the latest observed conditions of the graph deployment.
// The slice is merged by type on patch updates.
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
......@@ -251,16 +262,13 @@ type DynamoGraphDeployment struct {
Status DynamoGraphDeploymentStatus `json:"status,omitempty"`
}
// SetState stores the given lifecycle state in the deployment's Status.State.
func (s *DynamoGraphDeployment) SetState(state string) {
func (s *DynamoGraphDeployment) SetState(state DGDState) {
s.Status.State = state
}
// GetState returns the current lifecycle state held in Status.State as a plain string.
// NOTE(review): this hunk drops the empty-state fallback to consts.ResourceStateUnknown,
// so an unset state would be returned as "" — confirm no caller relies on the old value.
func (d *DynamoGraphDeployment) GetState() string {
if d.Status.State == "" {
return consts.ResourceStateUnknown
}
return d.Status.State
return string(d.Status.State)
}
// +kubebuilder:object:root=true
......
......@@ -197,7 +197,8 @@ type DeploymentStatus struct {
// State is the current state of the DynamoGraphDeployment.
// This value is mirrored from the DGD's status.state field.
State string `json:"state,omitempty"`
// +kubebuilder:default=initializing
State DGDState `json:"state"`
// Created indicates whether the DGD has been successfully created.
// Used to prevent recreation if the DGD is manually deleted by users.
......
......@@ -416,10 +416,18 @@ spec:
description: Namespace is the namespace of the created DynamoGraphDeployment.
type: string
state:
default: initializing
description: |-
State is the current state of the DynamoGraphDeployment.
This value is mirrored from the DGD's status.state field.
enum:
- initializing
- pending
- successful
- failed
type: string
required:
- state
type: object
generatedDeployment:
description: |-
......
......@@ -11487,8 +11487,16 @@ spec:
The map key is the service name from spec.services.
type: object
state:
default: initializing
description: State is a high-level textual status of the graph deployment lifecycle.
enum:
- initializing
- pending
- successful
- failed
type: string
required:
- state
type: object
type: object
served: true
......
......@@ -58,16 +58,9 @@ import (
gaiev1 "sigs.k8s.io/gateway-api-inference-extension/api/v1"
)
type State string
// Reason is the string type of ReconcileResult.Reason (e.g. "all_resources_are_ready", "ValidationFailed").
type Reason string
// Message is the string type of ReconcileResult.Message, carrying the human-readable detail for a Reason.
type Message string
const (
DGDStateFailed State = "failed"
DGDStateReady State = "successful"
DGDStatePending State = "pending"
)
// rbacManager interface for managing RBAC resources
type rbacManager interface {
EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
......@@ -108,7 +101,7 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
reason := Reason("undefined")
message := Message("")
state := DGDStatePending
state := nvidiacomv1alpha1.DGDStatePending
// retrieve the CRD
dynamoDeployment := &nvidiacomv1alpha1.DynamoGraphDeployment{}
if err = r.Get(ctx, req.NamespacedName, dynamoDeployment); err != nil {
......@@ -123,14 +116,14 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
}
if err != nil {
state = DGDStateFailed
state = nvidiacomv1alpha1.DGDStateFailed
message = Message(err.Error())
logger.Error(err, "Reconciliation failed")
}
dynamoDeployment.SetState(string(state))
dynamoDeployment.SetState(state)
readyStatus := metav1.ConditionFalse
if state == DGDStateReady {
if state == nvidiacomv1alpha1.DGDStateSuccessful {
readyStatus = metav1.ConditionTrue
}
......@@ -172,7 +165,7 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
logger.Error(validationErr, "DynamoGraphDeployment validation failed, refusing to reconcile")
// Set validation error state and reason (defer will update status)
state = DGDStateFailed
state = nvidiacomv1alpha1.DGDStateFailed
reason = Reason("ValidationFailed")
message = Message(fmt.Sprintf("Validation failed: %v", validationErr))
......@@ -260,7 +253,7 @@ type Resource interface {
}
type ReconcileResult struct {
State State
State nvidiacomv1alpha1.DGDState
Reason Reason
Message Message
ServiceStatus map[string]nvidiacomv1alpha1.ServiceReplicaStatus
......@@ -1003,14 +996,14 @@ func (r *DynamoGraphDeploymentReconciler) checkResourcesReadiness(resources []Re
if len(notReadyResources) == 0 {
return ReconcileResult{
State: DGDStateReady,
State: nvidiacomv1alpha1.DGDStateSuccessful,
Reason: "all_resources_are_ready",
Message: Message("All resources are ready"),
ServiceStatus: serviceStatuses,
}
}
return ReconcileResult{
State: DGDStatePending,
State: nvidiacomv1alpha1.DGDStatePending,
Reason: "some_resources_are_not_ready",
Message: Message(fmt.Sprintf("Resources not ready: %s", strings.Join(notReadyReasons, "; "))),
ServiceStatus: serviceStatuses,
......
......@@ -404,7 +404,7 @@ func Test_reconcileGroveResources(t *testing.T) {
},
},
wantReconcileResult: ReconcileResult{
State: DGDStateReady,
State: v1alpha1.DGDStateSuccessful,
Reason: "all_resources_are_ready",
Message: "All resources are ready",
ServiceStatus: map[string]v1alpha1.ServiceReplicaStatus{
......@@ -467,7 +467,7 @@ func Test_reconcileGroveResources(t *testing.T) {
},
},
wantReconcileResult: ReconcileResult{
State: DGDStatePending,
State: v1alpha1.DGDStatePending,
Reason: "some_resources_are_not_ready",
Message: Message("Resources not ready: test-dgd: podclique/test-dgd-0-decode: desired=2, ready=1"),
ServiceStatus: map[string]v1alpha1.ServiceReplicaStatus{
......@@ -544,7 +544,7 @@ func Test_reconcileGroveResources(t *testing.T) {
},
},
wantReconcileResult: ReconcileResult{
State: DGDStateReady,
State: v1alpha1.DGDStateSuccessful,
Reason: "all_resources_are_ready",
Message: "All resources are ready",
ServiceStatus: map[string]v1alpha1.ServiceReplicaStatus{
......@@ -618,7 +618,7 @@ func Test_reconcileGroveResources(t *testing.T) {
},
},
wantReconcileResult: ReconcileResult{
State: DGDStatePending,
State: v1alpha1.DGDStatePending,
Reason: "some_resources_are_not_ready",
Message: Message("Resources not ready: test-dgd: pcsg/test-dgd-0-aggregated: desired=2, available=1"),
ServiceStatus: map[string]v1alpha1.ServiceReplicaStatus{
......@@ -1559,7 +1559,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
},
wantReconcileResult: ReconcileResult{
State: DGDStateReady,
State: v1alpha1.DGDStateSuccessful,
Reason: "all_resources_are_ready",
Message: "All resources are ready",
ServiceStatus: map[string]v1alpha1.ServiceReplicaStatus{
......@@ -1619,7 +1619,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
},
wantReconcileResult: ReconcileResult{
State: DGDStatePending,
State: v1alpha1.DGDStatePending,
Reason: "some_resources_are_not_ready",
Message: "Resources not ready: test-dgd-frontend: Component deployment not ready - Available condition not true",
ServiceStatus: map[string]v1alpha1.ServiceReplicaStatus{
......@@ -1749,7 +1749,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
},
wantReconcileResult: ReconcileResult{
State: DGDStateReady,
State: v1alpha1.DGDStateSuccessful,
Reason: "all_resources_are_ready",
Message: "All resources are ready",
ServiceStatus: map[string]v1alpha1.ServiceReplicaStatus{
......@@ -1895,7 +1895,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
},
wantReconcileResult: ReconcileResult{
State: DGDStatePending,
State: v1alpha1.DGDStatePending,
Reason: "some_resources_are_not_ready",
Message: "Resources not ready: test-dgd-decode-e1f2a6fe: Component deployment not ready - Available condition not true",
ServiceStatus: map[string]v1alpha1.ServiceReplicaStatus{
......@@ -2006,7 +2006,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
},
wantReconcileResult: ReconcileResult{
State: DGDStatePending,
State: v1alpha1.DGDStatePending,
Reason: "some_resources_are_not_ready",
Message: "Resources not ready: test-dgd-decode-5f3d46ba: Component deployment not ready - Available condition not true; test-dgd-frontend: Component deployment not ready - Available condition not true",
ServiceStatus: map[string]v1alpha1.ServiceReplicaStatus{
......
......@@ -545,20 +545,20 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleReadyState(ctx context.Co
dgdr.Status.Deployment.State = dgd.Status.State
// Check if DGD degraded from Ready
if dgd.Status.State != string(DGDStateReady) {
if dgd.Status.State != nvidiacomv1alpha1.DGDStateSuccessful {
logger.Info("DGD degraded, transitioning back to Deploying",
"dgdState", dgd.Status.State)
dgdr.Status.State = DGDRStateDeploying
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDegraded,
fmt.Sprintf(MessageDeploymentDegraded, dgd.Name, dgd.Status.State))
fmt.Sprintf(MessageDeploymentDegraded, dgd.Name, string(dgd.Status.State)))
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeDeploymentReady,
Status: metav1.ConditionFalse,
Reason: EventReasonDeploymentDegraded,
Message: fmt.Sprintf("Deployment degraded to %s", dgd.Status.State),
Message: fmt.Sprintf("Deployment degraded to %s", string(dgd.Status.State)),
})
}
......@@ -602,7 +602,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx contex
dgdr.Status.Deployment.State = dgd.Status.State
// Check if DGD is Ready
if dgd.Status.State == string(DGDStateReady) {
if dgd.Status.State == nvidiacomv1alpha1.DGDStateSuccessful {
logger.Info("DGD is Ready, transitioning to Ready state")
dgdr.Status.State = DGDRStateReady
......@@ -633,11 +633,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Co
logger.Info("DGD was deleted by user, transitioning to DeploymentDeleted state")
dgdr.Status.State = DGDRStateDeploymentDeleted
dgdr.Status.Deployment.State = ""
r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDeleted,
fmt.Sprintf(MessageDeploymentDeleted, dgdr.Status.Deployment.Name))
dgdr.Status.Deployment = nil
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: ConditionTypeDeploymentReady,
Status: metav1.ConditionFalse,
......@@ -745,7 +746,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{
Name: dgdName,
Namespace: dgdNamespace,
State: string(DGDStatePending),
State: nvidiacomv1alpha1.DGDStatePending,
Created: true,
}
return ctrl.Result{}, r.Status().Update(ctx, dgdr)
......@@ -758,7 +759,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{
Name: dgdName,
Namespace: dgdNamespace,
State: string(DGDStatePending),
State: nvidiacomv1alpha1.DGDStatePending,
Created: true,
}
......
......@@ -753,7 +753,7 @@ spec:
Name: "test-dgd-to-delete",
Namespace: namespace,
Created: true,
State: DGDRStateReady,
State: nvidiacomv1alpha1.DGDStateSuccessful,
}
Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
......
......@@ -112,6 +112,27 @@ _Appears in:_
| `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | |
#### DGDState
_Underlying type:_ _string_
_Validation:_
- Enum: [initializing pending successful failed]
_Appears in:_
- [DeploymentStatus](#deploymentstatus)
- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
| Field | Description |
| --- | --- |
| `initializing` | |
| `pending` | |
| `successful` | |
| `failed` | |
#### DeploymentOverridesSpec
......@@ -149,7 +170,7 @@ _Appears in:_
| --- | --- | --- | --- |
| `name` _string_ | Name is the name of the created DynamoGraphDeployment. | | |
| `namespace` _string_ | Namespace is the namespace of the created DynamoGraphDeployment. | | |
| `state` _string_ | State is the current state of the DynamoGraphDeployment.<br />This value is mirrored from the DGD's status.state field. | | |
| `state` _[DGDState](#dgdstate)_ | State is the current state of the DynamoGraphDeployment.<br />This value is mirrored from the DGD's status.state field. | initializing | Enum: [initializing pending successful failed] <br /> |
| `created` _boolean_ | Created indicates whether the DGD has been successfully created.<br />Used to prevent recreation if the DGD is manually deleted by users. | | |
......@@ -601,7 +622,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `state` _string_ | State is a high-level textual status of the graph deployment lifecycle. | | |
| `state` _[DGDState](#dgdstate)_ | State is a high-level textual status of the graph deployment lifecycle. | initializing | Enum: [initializing pending successful failed] <br /> |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the graph deployment.<br />The slice is merged by type on patch updates. | | |
| `services` _object (keys:string, values:[ServiceReplicaStatus](#servicereplicastatus))_ | Services contains per-service replica status information.<br />The map key is the service name from spec.services. | | Optional: \{\} <br /> |
| `restart` _[RestartStatus](#restartstatus)_ | Restart contains the status of the restart of the graph deployment. | | Optional: \{\} <br /> |
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment