Unverified Commit aa1bc3c5 authored by Thomas Montfort's avatar Thomas Montfort Committed by GitHub
Browse files

feat(operator): DGD service status replica information (#4863)

parent f790e921
......@@ -10400,6 +10400,59 @@ spec:
PodSelector contains the labels that can be used to select Pods belonging to
this component deployment.
type: object
service:
description: Service contains replica status information for this service.
properties:
availableReplicas:
description: |-
AvailableReplicas is the number of available replicas.
For Deployment: replicas ready for >= minReadySeconds.
For PodCliqueScalingGroup: replicas where all constituent PodCliques have >= MinAvailable ready pods.
Not available for PodClique or LeaderWorkerSet.
When nil, the field is omitted from the API response.
format: int32
minimum: 0
type: integer
componentKind:
description: ComponentKind is the underlying resource kind (e.g., "PodClique", "PodCliqueScalingGroup", "Deployment", "LeaderWorkerSet").
enum:
- PodClique
- PodCliqueScalingGroup
- Deployment
- LeaderWorkerSet
type: string
componentName:
description: ComponentName is the name of the underlying resource.
type: string
readyReplicas:
description: |-
ReadyReplicas is the number of ready replicas.
Populated for PodClique, Deployment, and LeaderWorkerSet.
Not available for PodCliqueScalingGroup.
When nil, the field is omitted from the API response.
format: int32
minimum: 0
type: integer
replicas:
description: |-
Replicas is the total number of non-terminated replicas.
Required for all component kinds.
format: int32
minimum: 0
type: integer
updatedReplicas:
description: |-
UpdatedReplicas is the number of replicas at the current/desired revision.
Required for all component kinds.
format: int32
minimum: 0
type: integer
required:
- componentKind
- componentName
- replicas
- updatedReplicas
type: object
required:
- conditions
type: object
......
......@@ -10530,6 +10530,64 @@ spec:
- type
type: object
type: array
services:
additionalProperties:
description: ServiceReplicaStatus contains replica information for a single service.
properties:
availableReplicas:
description: |-
AvailableReplicas is the number of available replicas.
For Deployment: replicas ready for >= minReadySeconds.
For PodCliqueScalingGroup: replicas where all constituent PodCliques have >= MinAvailable ready pods.
Not available for PodClique or LeaderWorkerSet.
When nil, the field is omitted from the API response.
format: int32
minimum: 0
type: integer
componentKind:
description: ComponentKind is the underlying resource kind (e.g., "PodClique", "PodCliqueScalingGroup", "Deployment", "LeaderWorkerSet").
enum:
- PodClique
- PodCliqueScalingGroup
- Deployment
- LeaderWorkerSet
type: string
componentName:
description: ComponentName is the name of the underlying resource.
type: string
readyReplicas:
description: |-
ReadyReplicas is the number of ready replicas.
Populated for PodClique, Deployment, and LeaderWorkerSet.
Not available for PodCliqueScalingGroup.
When nil, the field is omitted from the API response.
format: int32
minimum: 0
type: integer
replicas:
description: |-
Replicas is the total number of non-terminated replicas.
Required for all component kinds.
format: int32
minimum: 0
type: integer
updatedReplicas:
description: |-
UpdatedReplicas is the number of replicas at the current/desired revision.
Required for all component kinds.
format: int32
minimum: 0
type: integer
required:
- componentKind
- componentName
- replicas
- updatedReplicas
type: object
description: |-
Services contains per-service replica status information.
The map key is the service name from spec.services.
type: object
state:
description: State is a high-level textual status of the graph deployment lifecycle.
type: string
......
......@@ -179,6 +179,10 @@ type DynamoComponentDeploymentStatus struct {
// PodSelector contains the labels that can be used to select Pods belonging to
// this component deployment.
PodSelector map[string]string `json:"podSelector,omitempty"`
// Service contains replica status information for this service.
// +optional
Service ServiceReplicaStatus `json:"service,omitempty"`
}
// +genclient
......@@ -218,6 +222,10 @@ func (s *DynamoComponentDeployment) IsReady() (bool, string) {
return ready, reason
}
// GetServiceStatuses returns a single-entry map that associates this component's
// Spec.ServiceName with the replica status stored in Status.Service.
func (s *DynamoComponentDeployment) GetServiceStatuses() map[string]ServiceReplicaStatus {
	statuses := make(map[string]ServiceReplicaStatus, 1)
	statuses[s.Spec.ServiceName] = s.Status.Service
	return statuses
}
func (s *DynamoComponentDeploymentStatus) IsReady() (bool, string) {
for _, condition := range s.Conditions {
if condition.Type == DynamoGraphDeploymentConditionTypeAvailable && condition.Status == metav1.ConditionTrue {
......
......@@ -27,6 +27,21 @@ import (
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
// ComponentKind names the kind of Kubernetes resource that backs a service.
// +kubebuilder:validation:Enum=PodClique;PodCliqueScalingGroup;Deployment;LeaderWorkerSet
type ComponentKind string

// Supported ComponentKind values. Keep this list in sync with the Enum marker
// on ComponentKind.
const (
	// ComponentKindPodClique identifies a PodClique resource.
	ComponentKindPodClique = ComponentKind("PodClique")
	// ComponentKindPodCliqueScalingGroup identifies a PodCliqueScalingGroup resource.
	ComponentKindPodCliqueScalingGroup = ComponentKind("PodCliqueScalingGroup")
	// ComponentKindDeployment identifies a Deployment resource.
	ComponentKindDeployment = ComponentKind("Deployment")
	// ComponentKindLeaderWorkerSet identifies a LeaderWorkerSet resource.
	ComponentKindLeaderWorkerSet = ComponentKind("LeaderWorkerSet")
)
// DynamoGraphDeploymentSpec defines the desired state of DynamoGraphDeployment.
type DynamoGraphDeploymentSpec struct {
// PVCs defines a list of persistent volume claims that can be referenced by components.
......@@ -54,6 +69,45 @@ type DynamoGraphDeploymentStatus struct {
// Conditions contains the latest observed conditions of the graph deployment.
// The slice is merged by type on patch updates.
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
// Services contains per-service replica status information.
// The map key is the service name from spec.services.
// +optional
Services map[string]ServiceReplicaStatus `json:"services,omitempty"`
}
// NOTE(review): the doc comments on this type and its fields appear verbatim as
// `description` text in the CRD schema included in this change, so editing them
// presumably requires regenerating the manifests. ReadyReplicas and
// AvailableReplicas are pointers with `omitempty`, which lets "not reported"
// (nil, omitted from JSON) be told apart from a reported value of 0.

// ServiceReplicaStatus contains replica information for a single service.
type ServiceReplicaStatus struct {
// ComponentKind is the underlying resource kind (e.g., "PodClique", "PodCliqueScalingGroup", "Deployment", "LeaderWorkerSet").
ComponentKind ComponentKind `json:"componentKind"`
// ComponentName is the name of the underlying resource.
ComponentName string `json:"componentName"`
// Replicas is the total number of non-terminated replicas.
// Required for all component kinds.
// +kubebuilder:validation:Minimum=0
Replicas int32 `json:"replicas"`
// UpdatedReplicas is the number of replicas at the current/desired revision.
// Required for all component kinds.
// +kubebuilder:validation:Minimum=0
UpdatedReplicas int32 `json:"updatedReplicas"`
// ReadyReplicas is the number of ready replicas.
// Populated for PodClique, Deployment, and LeaderWorkerSet.
// Not available for PodCliqueScalingGroup.
// When nil, the field is omitted from the API response.
// +optional
// +kubebuilder:validation:Minimum=0
ReadyReplicas *int32 `json:"readyReplicas,omitempty"`
// AvailableReplicas is the number of available replicas.
// For Deployment: replicas ready for >= minReadySeconds.
// For PodCliqueScalingGroup: replicas where all constituent PodCliques have >= MinAvailable ready pods.
// Not available for PodClique or LeaderWorkerSet.
// When nil, the field is omitted from the API response.
// +optional
// +kubebuilder:validation:Minimum=0
AvailableReplicas *int32 `json:"availableReplicas,omitempty"`
}
// +kubebuilder:object:root=true
......
......@@ -421,6 +421,7 @@ func (in *DynamoComponentDeploymentStatus) DeepCopyInto(out *DynamoComponentDepl
(*out)[key] = val
}
}
in.Service.DeepCopyInto(&out.Service)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoComponentDeploymentStatus.
......@@ -768,6 +769,13 @@ func (in *DynamoGraphDeploymentStatus) DeepCopyInto(out *DynamoGraphDeploymentSt
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.Services != nil {
in, out := &in.Services, &out.Services
*out = make(map[string]ServiceReplicaStatus, len(*in))
for key, val := range *in {
(*out)[key] = *val.DeepCopy()
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentStatus.
......@@ -1214,6 +1222,31 @@ func (in *ScalingAdapter) DeepCopy() *ScalingAdapter {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The scalar fields are copied by plain assignment; the two optional counters are
// re-allocated so that out never shares a pointer with in.
func (in *ServiceReplicaStatus) DeepCopyInto(out *ServiceReplicaStatus) {
	*out = *in
	if src := in.ReadyReplicas; src != nil {
		ready := *src
		out.ReadyReplicas = &ready
	}
	if src := in.AvailableReplicas; src != nil {
		available := *src
		out.AvailableReplicas = &available
	}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceReplicaStatus.
// A nil receiver yields nil.
func (in *ServiceReplicaStatus) DeepCopy() *ServiceReplicaStatus {
	if in == nil {
		return nil
	}
	var clone ServiceReplicaStatus
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *SharedMemorySpec) DeepCopyInto(out *SharedMemorySpec) {
*out = *in
......
......@@ -10400,6 +10400,59 @@ spec:
PodSelector contains the labels that can be used to select Pods belonging to
this component deployment.
type: object
service:
description: Service contains replica status information for this service.
properties:
availableReplicas:
description: |-
AvailableReplicas is the number of available replicas.
For Deployment: replicas ready for >= minReadySeconds.
For PodCliqueScalingGroup: replicas where all constituent PodCliques have >= MinAvailable ready pods.
Not available for PodClique or LeaderWorkerSet.
When nil, the field is omitted from the API response.
format: int32
minimum: 0
type: integer
componentKind:
description: ComponentKind is the underlying resource kind (e.g., "PodClique", "PodCliqueScalingGroup", "Deployment", "LeaderWorkerSet").
enum:
- PodClique
- PodCliqueScalingGroup
- Deployment
- LeaderWorkerSet
type: string
componentName:
description: ComponentName is the name of the underlying resource.
type: string
readyReplicas:
description: |-
ReadyReplicas is the number of ready replicas.
Populated for PodClique, Deployment, and LeaderWorkerSet.
Not available for PodCliqueScalingGroup.
When nil, the field is omitted from the API response.
format: int32
minimum: 0
type: integer
replicas:
description: |-
Replicas is the total number of non-terminated replicas.
Required for all component kinds.
format: int32
minimum: 0
type: integer
updatedReplicas:
description: |-
UpdatedReplicas is the number of replicas at the current/desired revision.
Required for all component kinds.
format: int32
minimum: 0
type: integer
required:
- componentKind
- componentName
- replicas
- updatedReplicas
type: object
required:
- conditions
type: object
......
......@@ -10530,6 +10530,64 @@ spec:
- type
type: object
type: array
services:
additionalProperties:
description: ServiceReplicaStatus contains replica information for a single service.
properties:
availableReplicas:
description: |-
AvailableReplicas is the number of available replicas.
For Deployment: replicas ready for >= minReadySeconds.
For PodCliqueScalingGroup: replicas where all constituent PodCliques have >= MinAvailable ready pods.
Not available for PodClique or LeaderWorkerSet.
When nil, the field is omitted from the API response.
format: int32
minimum: 0
type: integer
componentKind:
description: ComponentKind is the underlying resource kind (e.g., "PodClique", "PodCliqueScalingGroup", "Deployment", "LeaderWorkerSet").
enum:
- PodClique
- PodCliqueScalingGroup
- Deployment
- LeaderWorkerSet
type: string
componentName:
description: ComponentName is the name of the underlying resource.
type: string
readyReplicas:
description: |-
ReadyReplicas is the number of ready replicas.
Populated for PodClique, Deployment, and LeaderWorkerSet.
Not available for PodCliqueScalingGroup.
When nil, the field is omitted from the API response.
format: int32
minimum: 0
type: integer
replicas:
description: |-
Replicas is the total number of non-terminated replicas.
Required for all component kinds.
format: int32
minimum: 0
type: integer
updatedReplicas:
description: |-
UpdatedReplicas is the number of replicas at the current/desired revision.
Required for all component kinds.
format: int32
minimum: 0
type: integer
required:
- componentKind
- componentName
- replicas
- updatedReplicas
type: object
description: |-
Services contains per-service replica status information.
The map key is the service name from spec.services.
type: object
state:
description: State is a high-level textual status of the graph deployment lifecycle.
type: string
......
......@@ -226,130 +226,24 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
}
}
modified := false
// Create the appropriate workload resource based on deployment type
var leaderWorkerSets []*leaderworkersetv1.LeaderWorkerSet
var deployment *appsv1.Deployment
var componentReconcileResult ComponentReconcileResult
if r.Config.LWS.Enabled && dynamoComponentDeployment.IsMultinode() {
desiredReplicas := int32(1)
if dynamoComponentDeployment.Spec.Replicas != nil {
desiredReplicas = *dynamoComponentDeployment.Spec.Replicas
}
anyModified := false
for i := range int(desiredReplicas) {
modified_, _, err := commonController.SyncResource(ctx, r, dynamoComponentDeployment, func(ctx context.Context) (*volcanov1beta1.PodGroup, bool, error) {
return r.generateVolcanoPodGroup(ctx, generateResourceOption{
dynamoComponentDeployment: dynamoComponentDeployment,
isStealingTrafficDebugModeEnabled: false,
containsStealingTrafficDebugModeEnabled: false,
instanceID: &i,
})
})
if err != nil {
return ctrl.Result{}, err
}
if modified_ {
anyModified = true
}
modified_, lwsObj, err := commonController.SyncResource(ctx, r, dynamoComponentDeployment, func(ctx context.Context) (*leaderworkersetv1.LeaderWorkerSet, bool, error) {
return r.generateLeaderWorkerSet(ctx, generateResourceOption{
dynamoComponentDeployment: dynamoComponentDeployment,
isStealingTrafficDebugModeEnabled: false,
containsStealingTrafficDebugModeEnabled: false,
instanceID: &i,
})
})
if err != nil {
return ctrl.Result{}, err
}
if modified_ {
anyModified = true
}
leaderWorkerSets = append(leaderWorkerSets, lwsObj)
}
// Clean up any excess LeaderWorkerSets (if replicas were decreased)
baseKubeName := r.getKubeName(dynamoComponentDeployment, false)
for i := int(desiredReplicas); ; i++ {
// Try to find a LeaderWorkerSet with the next index
nextLWSName := fmt.Sprintf("%s-%d", baseKubeName, i)
lwsToDelete := &leaderworkersetv1.LeaderWorkerSet{}
err := r.Get(ctx, types.NamespacedName{
Name: nextLWSName,
Namespace: dynamoComponentDeployment.Namespace,
}, lwsToDelete)
if err != nil {
if k8serrors.IsNotFound(err) {
break
}
return ctrl.Result{}, err
}
err = r.Delete(ctx, lwsToDelete)
if err != nil {
return ctrl.Result{}, err
}
podGroupName := nextLWSName
podGroupToDelete := &volcanov1beta1.PodGroup{}
err = r.Get(ctx, types.NamespacedName{
Name: podGroupName,
Namespace: dynamoComponentDeployment.Namespace,
}, podGroupToDelete)
if err != nil {
if !k8serrors.IsNotFound(err) {
logs.Error(err, "Failed to get PodGroup for deletion", "podGroupName", podGroupName)
}
} else {
err = r.Delete(ctx, podGroupToDelete)
if err != nil {
logs.Error(err, "Failed to delete PodGroup", "podGroupName", podGroupName)
}
}
anyModified = true
}
modified = anyModified
componentReconcileResult, err = r.reconcileLeaderWorkerSetResources(ctx, dynamoComponentDeployment)
} else {
modified_, obj, err := r.createOrUpdateOrDeleteDeployments(ctx, generateResourceOption{
dynamoComponentDeployment: dynamoComponentDeployment,
})
if err != nil {
return ctrl.Result{}, err
}
if modified_ {
modified = true
}
deployment = obj
componentReconcileResult, err = r.reconcileDeploymentResources(ctx, dynamoComponentDeployment)
}
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to reconcile the resources: %w", err)
}
modified := componentReconcileResult.modified
// create or update api-server service
modified_, err := r.createOrUpdateOrDeleteServices(ctx, generateResourceOption{
serviceModified, err := r.createOrUpdateOrDeleteServices(ctx, generateResourceOption{
dynamoComponentDeployment: dynamoComponentDeployment,
})
if err != nil {
return
}
if modified_ {
modified = true
return ctrl.Result{}, fmt.Errorf("failed to create or update the service: %w", err)
}
// create or update headless service for model endpoint discovery
......@@ -368,14 +262,14 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
}
// create or update api-server ingresses
modified_, err = r.createOrUpdateOrDeleteIngress(ctx, generateResourceOption{
ingressModified, err := r.createOrUpdateOrDeleteIngress(ctx, generateResourceOption{
dynamoComponentDeployment: dynamoComponentDeployment,
})
if err != nil {
return
return ctrl.Result{}, fmt.Errorf("failed to create or update the ingress: %w", err)
}
if modified_ {
if serviceModified || ingressModified {
modified = true
}
......@@ -386,50 +280,220 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
logs.Info("Finished reconciling.")
r.Recorder.Eventf(dynamoComponentDeployment, corev1.EventTypeNormal, "Update", "All resources updated!")
if dynamoComponentDeployment.IsMultinode() {
err = r.computeAvailableStatusConditionForLeaderWorkerSets(ctx, req, leaderWorkerSets)
} else {
err = r.computeAvailableStatusCondition(ctx, req, deployment)
err = r.setStatusConditionAndServiceReplicaStatus(ctx, dynamoComponentDeployment, componentReconcileResult)
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set status condition and service replica status: %w", err)
}
return
}
// computeAvailableStatusConditionForLeaderWorkerSet updates the status condition based on LeaderWorkerSet readiness
func (r *DynamoComponentDeploymentReconciler) computeAvailableStatusConditionForLeaderWorkerSets(ctx context.Context, req ctrl.Request, leaderWorkerSets []*leaderworkersetv1.LeaderWorkerSet) error {
logs := log.FromContext(ctx)
// ComponentReconcileResult carries the outcome of reconciling the workload
// resources (Deployment or LeaderWorkerSets) of one DynamoComponentDeployment.
type ComponentReconcileResult struct {
// modified reports whether any workload resource was created, updated or deleted.
modified bool
// status, reason and message are copied into the Available status condition.
status metav1.ConditionStatus
reason string
message string
// serviceReplicaStatus is written to the component's Status.Service.
serviceReplicaStatus v1alpha1.ServiceReplicaStatus
}
// reconcileDeploymentResources creates, updates or deletes the Deployment backing
// the component and derives the Available condition inputs and replica status
// from the returned Deployment.
//
// The replica status mirrors the Deployment's status counters; ReadyReplicas and
// AvailableReplicas point at the fields of the returned Deployment object.
// An error from the create/update/delete step is wrapped and returned with an
// empty result.
func (r *DynamoComponentDeploymentReconciler) reconcileDeploymentResources(ctx context.Context, dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment) (ComponentReconcileResult, error) {
	modified, deployment, err := r.createOrUpdateOrDeleteDeployments(ctx, generateResourceOption{
		dynamoComponentDeployment: dynamoComponentDeployment,
	})
	if err != nil {
		return ComponentReconcileResult{}, fmt.Errorf("failed to create or update the deployment: %w", err)
	}

	// Start from the "not ready" outcome and upgrade it below if the Deployment is ready.
	result := ComponentReconcileResult{
		modified: modified,
		status:   metav1.ConditionFalse,
		reason:   "DeploymentNotReady",
		message:  "Deployment is not ready",
		serviceReplicaStatus: v1alpha1.ServiceReplicaStatus{
			ComponentKind:     v1alpha1.ComponentKindDeployment,
			ComponentName:     deployment.Name,
			Replicas:          deployment.Status.Replicas,
			UpdatedReplicas:   deployment.Status.UpdatedReplicas,
			ReadyReplicas:     &deployment.Status.ReadyReplicas,
			AvailableReplicas: &deployment.Status.AvailableReplicas,
		},
	}

	if IsDeploymentReady(deployment) {
		log.FromContext(ctx).Info("Deployment is ready. Setting available status condition to true.")
		result.status = metav1.ConditionTrue
		result.reason = "DeploymentReady"
		result.message = "Deployment is ready"
	}
	return result, nil
}
// reconcileLeaderWorkerSetResources syncs one Volcano PodGroup and one
// LeaderWorkerSet per desired replica (Spec.Replicas, defaulting to 1), deletes
// LeaderWorkerSets and PodGroups whose index is at or beyond the desired count,
// and returns the readiness outcome together with the combined replica status.
//
// NOTE(review): this span appears to interleave lines of the previous
// implementation with the new one (see the note inside the `if allReady` block).
func (r *DynamoComponentDeploymentReconciler) reconcileLeaderWorkerSetResources(ctx context.Context, dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment) (ComponentReconcileResult, error) {
logger := log.FromContext(ctx)
// Default to a single replica when the spec leaves Replicas unset.
desiredReplicas := int32(1)
if dynamoComponentDeployment.Spec.Replicas != nil {
desiredReplicas = *dynamoComponentDeployment.Spec.Replicas
}
anyModified := false
leaderWorkerSets := make([]*leaderworkersetv1.LeaderWorkerSet, 0, desiredReplicas)
// One PodGroup + LeaderWorkerSet pair per replica index; the index is passed as instanceID.
for i := range int(desiredReplicas) {
volcanoPodGroupModified, _, err := commonController.SyncResource(ctx, r, dynamoComponentDeployment, func(ctx context.Context) (*volcanov1beta1.PodGroup, bool, error) {
return r.generateVolcanoPodGroup(ctx, generateResourceOption{
dynamoComponentDeployment: dynamoComponentDeployment,
isStealingTrafficDebugModeEnabled: false,
containsStealingTrafficDebugModeEnabled: false,
instanceID: &i,
})
})
if err != nil {
return ComponentReconcileResult{}, fmt.Errorf("failed to sync the PodGroup: %w", err)
}
leaderWorkerSetModified, lwsObj, err := commonController.SyncResource(ctx, r, dynamoComponentDeployment, func(ctx context.Context) (*leaderworkersetv1.LeaderWorkerSet, bool, error) {
return r.generateLeaderWorkerSet(ctx, generateResourceOption{
dynamoComponentDeployment: dynamoComponentDeployment,
isStealingTrafficDebugModeEnabled: false,
containsStealingTrafficDebugModeEnabled: false,
instanceID: &i,
})
})
if err != nil {
return ComponentReconcileResult{}, fmt.Errorf("failed to sync the LeaderWorkerSet: %w", err)
}
if leaderWorkerSetModified || volcanoPodGroupModified {
anyModified = true
}
leaderWorkerSets = append(leaderWorkerSets, lwsObj)
}
// Clean up any excess LeaderWorkerSets (if replicas were decreased)
// Names are probed as "<baseKubeName>-<i>" starting at desiredReplicas; the
// loop ends at the first index for which no LeaderWorkerSet exists.
baseKubeName := r.getKubeName(dynamoComponentDeployment, false)
for i := int(desiredReplicas); ; i++ {
// Try to find a LeaderWorkerSet with the next index
nextLWSName := fmt.Sprintf("%s-%d", baseKubeName, i)
lwsToDelete := &leaderworkersetv1.LeaderWorkerSet{}
err := r.Get(ctx, types.NamespacedName{
Name: nextLWSName,
Namespace: dynamoComponentDeployment.Namespace,
}, lwsToDelete)
if err != nil {
if k8serrors.IsNotFound(err) {
break
}
return ComponentReconcileResult{}, fmt.Errorf("failed to get the LeaderWorkerSet for deletion: %w", err)
}
err = r.Delete(ctx, lwsToDelete)
if err != nil {
return ComponentReconcileResult{}, fmt.Errorf("failed to delete the LeaderWorkerSet: %w", err)
}
// The PodGroup shares the LeaderWorkerSet's name. Failures to fetch or delete
// it are only logged, so they do not abort the reconcile.
podGroupName := nextLWSName
podGroupToDelete := &volcanov1beta1.PodGroup{}
err = r.Get(ctx, types.NamespacedName{
Name: podGroupName,
Namespace: dynamoComponentDeployment.Namespace,
}, podGroupToDelete)
if err != nil {
if !k8serrors.IsNotFound(err) {
logger.Error(err, "Failed to get PodGroup for deletion", "podGroupName", podGroupName)
}
} else {
err = r.Delete(ctx, podGroupToDelete)
if err != nil {
logger.Error(err, "Failed to delete PodGroup", "podGroupName", podGroupName)
}
}
anyModified = true
}
allReady := true
lwsReplicaStatuses := []v1alpha1.ServiceReplicaStatus{}
for _, leaderWorkerSet := range leaderWorkerSets {
if !IsLeaderWorkerSetReady(leaderWorkerSet) {
allReady = false
// NOTE(review): this break skips the append for this and every later
// LeaderWorkerSet, so the combined replica status below only covers the sets
// before the first not-ready one (and is built from an empty slice when the
// first set is not ready) — confirm this is intended.
break
}
lwsReplicaStatuses = append(lwsReplicaStatuses, getLeaderWorkerSetReplicasStatus(leaderWorkerSet))
}
if allReady {
// NOTE(review): the statements from here through the second `return err` use
// `logs` and `req`, which are not declared in this function, and return a
// single value; they look like lines of the replaced implementation — confirm.
logs.Info("All LeaderWorkerSets are ready. Setting available status condition to true.")
_, err := r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.DynamoGraphDeploymentConditionTypeAvailable,
Status: metav1.ConditionTrue,
Reason: "AllLeaderWorkerSetsReady",
Message: "All LeaderWorkerSets are ready",
},
)
return err
} else {
logs.Info("Not all LeaderWorkerSets are ready. Setting available status condition to false.")
_, err := r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.DynamoGraphDeploymentConditionTypeAvailable,
Status: metav1.ConditionFalse,
Reason: "LeaderWorkerSetsNotReady",
Message: "Not all LeaderWorkerSets are ready",
},
)
return err
return ComponentReconcileResult{
modified: anyModified,
status: metav1.ConditionTrue,
reason: "AllLeaderWorkerSetsReady",
message: "All LeaderWorkerSets are ready",
serviceReplicaStatus: combineLWSReplicaStatuses(lwsReplicaStatuses),
}, nil
}
return ComponentReconcileResult{
modified: anyModified,
status: metav1.ConditionFalse,
reason: "SomeLeaderWorkerSetsNotReady",
message: "Some LeaderWorkerSets are not ready",
serviceReplicaStatus: combineLWSReplicaStatuses(lwsReplicaStatuses),
}, nil
}
// setStatusConditionAndServiceReplicaStatus records the reconcile outcome on the
// component: it sets the Available condition from the result's status, reason
// and message, stores the result's replica status in Status.Service, and
// persists both with a single status update.
//
// Returns a wrapped error when the status update fails.
func (r *DynamoComponentDeploymentReconciler) setStatusConditionAndServiceReplicaStatus(ctx context.Context, dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment, componentReconcileResult ComponentReconcileResult) error {
	status := &dynamoComponentDeployment.Status

	meta.SetStatusCondition(&status.Conditions, metav1.Condition{
		Type:    v1alpha1.DynamoGraphDeploymentConditionTypeAvailable,
		Status:  componentReconcileResult.status,
		Reason:  componentReconcileResult.reason,
		Message: componentReconcileResult.message,
	})
	status.Service = componentReconcileResult.serviceReplicaStatus

	if err := r.Status().Update(ctx, dynamoComponentDeployment); err != nil {
		return fmt.Errorf("failed to update DynamoComponentDeployment status: %w", err)
	}
	return nil
}
// getLeaderWorkerSetReplicasStatus converts the status counters of a single
// LeaderWorkerSet into a ServiceReplicaStatus. AvailableReplicas is left nil;
// ReadyReplicas points at the LeaderWorkerSet's own status field.
func getLeaderWorkerSetReplicasStatus(leaderWorkerSet *leaderworkersetv1.LeaderWorkerSet) v1alpha1.ServiceReplicaStatus {
	lwsStatus := &leaderWorkerSet.Status

	var replicaStatus v1alpha1.ServiceReplicaStatus
	replicaStatus.ComponentKind = v1alpha1.ComponentKindLeaderWorkerSet
	replicaStatus.ComponentName = leaderWorkerSet.Name
	replicaStatus.Replicas = lwsStatus.Replicas
	replicaStatus.UpdatedReplicas = lwsStatus.UpdatedReplicas
	replicaStatus.ReadyReplicas = &lwsStatus.ReadyReplicas
	return replicaStatus
}
// combineLWSReplicaStatuses folds the replica statuses of the LeaderWorkerSets
// that make up one component into a single ServiceReplicaStatus.
//
// Replicas, UpdatedReplicas and ReadyReplicas are summed over all entries (a nil
// ReadyReplicas counts as 0). ComponentKind, ComponentName and AvailableReplicas
// are taken from the first entry. The returned ReadyReplicas is always non-nil
// and never aliases an input entry.
//
// For empty input the result is a LeaderWorkerSet-kind status with zero counts
// rather than the zero value: a zero-value ComponentKind ("") is not one of the
// values allowed by the ComponentKind enum, so a status carrying it could not be
// persisted.
func combineLWSReplicaStatuses(serviceReplicaStatuses []v1alpha1.ServiceReplicaStatus) v1alpha1.ServiceReplicaStatus {
	if len(serviceReplicaStatuses) == 0 {
		return v1alpha1.ServiceReplicaStatus{
			ComponentKind: v1alpha1.ComponentKindLeaderWorkerSet,
			ReadyReplicas: new(int32),
		}
	}

	// Copy the first entry for its identifying fields, then recompute the counters
	// over the whole slice so every entry is handled by the same code path.
	combined := serviceReplicaStatuses[0]
	combined.Replicas, combined.UpdatedReplicas = 0, 0

	var readyReplicas int32
	for i := range serviceReplicaStatuses {
		entry := &serviceReplicaStatuses[i]
		combined.Replicas += entry.Replicas
		combined.UpdatedReplicas += entry.UpdatedReplicas
		if entry.ReadyReplicas != nil {
			readyReplicas += *entry.ReadyReplicas
		}
	}
	combined.ReadyReplicas = &readyReplicas
	return combined
}
// IsLeaderWorkerSetReady determines if a LeaderWorkerSet is fully ready and available
......@@ -672,33 +736,6 @@ func (r *DynamoComponentDeploymentReconciler) FinalizeResource(ctx context.Conte
return nil
}
// computeAvailableStatusCondition sets the Available status condition according
// to whether the given Deployment is ready, and returns the error (if any) from
// persisting that condition.
func (r *DynamoComponentDeploymentReconciler) computeAvailableStatusCondition(ctx context.Context, req ctrl.Request, deployment *appsv1.Deployment) error {
	logs := log.FromContext(ctx)

	available := metav1.Condition{
		Type: v1alpha1.DynamoGraphDeploymentConditionTypeAvailable,
	}
	if IsDeploymentReady(deployment) {
		logs.Info("Deployment is ready. Setting available status condition to true.")
		available.Status = metav1.ConditionTrue
		available.Reason = "DeploymentReady"
		available.Message = "Deployment is ready"
	} else {
		logs.Info("Deployment is not ready. Setting available status condition to false.")
		available.Status = metav1.ConditionFalse
		available.Reason = "DeploymentNotReady"
		available.Message = "Deployment is not ready"
	}

	_, err := r.setStatusConditions(ctx, req, available)
	return err
}
// IsDeploymentReady determines if a Kubernetes Deployment is fully ready and available.
// It checks various status fields to ensure all replicas are available and the deployment
// configuration has been fully applied.
......
......@@ -515,21 +515,49 @@ func AppendUniqueImagePullSecrets(existing, additional []corev1.LocalObjectRefer
}
// Resource pairs a Kubernetes object with its readiness information.
//
// NOTE(review): the field list appears to interleave two versions of the struct
// (an embedded client.Object with a function-typed isReady, and an `object`
// field with a bool isReady). The constructors NewResource and
// NewResourceWithServiceStatuses populate object, isReady (bool), readyReason
// and serviceStatuses — confirm which fields are current.
type Resource struct {
client.Object
isReady func() (bool, string)
// object is the wrapped Kubernetes object.
object client.Object
// isReady and readyReason hold the readiness verdict and its explanation.
isReady bool
readyReason string
// serviceStatuses maps a service name to its replica status; it is only set by
// NewResourceWithServiceStatuses.
serviceStatuses map[string]v1alpha1.ServiceReplicaStatus
}
func WrapResource[T client.Object](resource T, isReady func() (bool, string)) *Resource {
func NewResource[T client.Object](resource T, isReady func() (bool, string)) (*Resource, error) {
v := reflect.ValueOf(resource)
// handles untyped nil and typed nil
if !v.IsValid() || v.IsNil() {
return nil, fmt.Errorf("resource is nil")
}
ready, reason := isReady()
return &Resource{
Object: resource,
isReady: isReady,
object: resource,
isReady: ready,
readyReason: reason,
}, nil
}
// NewResourceWithServiceStatuses wraps resource in a Resource and records the
// readiness verdict, readiness reason and per-service replica statuses returned
// by isReadyAndServiceStatuses. The callback is invoked exactly once, during
// construction.
//
// It returns an error when resource is nil (either an untyped nil or a typed nil
// pointer) or when isReadyAndServiceStatuses is nil.
func NewResourceWithServiceStatuses[T client.Object](resource T, isReadyAndServiceStatuses func() (bool, string, map[string]v1alpha1.ServiceReplicaStatus)) (*Resource, error) {
	v := reflect.ValueOf(resource)
	// An invalid Value means resource was an untyped nil interface.
	if !v.IsValid() {
		return nil, fmt.Errorf("resource is nil")
	}
	// reflect.Value.IsNil panics for kinds that cannot be nil (e.g. a struct value
	// that implements client.Object), so only ask nillable kinds.
	switch v.Kind() {
	case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Pointer, reflect.Slice:
		if v.IsNil() {
			return nil, fmt.Errorf("resource is nil")
		}
	}
	// Calling a nil func would panic; report it as an error instead.
	if isReadyAndServiceStatuses == nil {
		return nil, fmt.Errorf("readiness callback is nil")
	}

	ready, reason, serviceStatuses := isReadyAndServiceStatuses()
	return &Resource{
		object:          resource,
		isReady:         ready,
		readyReason:     reason,
		serviceStatuses: serviceStatuses,
	}, nil
}
// IsReady reports whether the wrapped resource is ready, together with a reason string.
//
// NOTE(review): two return statements appear back to back; the first calls
// isReady as a function while the second returns the stored isReady/readyReason
// values. The first looks like a line from the earlier function-typed field —
// confirm which one is current.
func (r *Resource) IsReady() (bool, string) {
return r.isReady()
return r.isReady, r.readyReason
}
// GetName returns the name of the wrapped Kubernetes object.
//
// NOTE(review): two return statements appear back to back, one through the
// embedded Object and one through the `object` field; the first looks like a
// line from the earlier embedded-field version — confirm which one is current.
func (r *Resource) GetName() string {
return r.Object.GetName()
return r.object.GetName()
}
// GetServiceStatuses returns the per-service replica statuses stored on the
// Resource. The stored map itself is returned, not a copy, and it is nil when
// none were recorded.
func (r *Resource) GetServiceStatuses() map[string]v1alpha1.ServiceReplicaStatus {
	statuses := r.serviceStatuses
	return statuses
}
......@@ -12,6 +12,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
......@@ -51,94 +52,116 @@ func (d *GroveMultinodeDeployer) GetHostNames(serviceName string, numberOfNodes
return hostnames
}
// EvaluateAllComponentsReady determines if all Grove components are ready
// GetComponentReadinessAndServiceReplicaStatuses determines if all Grove components are ready
// and returns the service replica statuses for each component.
// - PodCliques: spec.replicas == status.readyReplicas
// - PodCliqueScalingGroups: spec.replicas == status.availableReplicas
func EvaluateAllComponentsReady(ctx context.Context, client client.Client, dgd *nvidiacomv1alpha1.DynamoGraphDeployment) (bool, string) {
func GetComponentReadinessAndServiceReplicaStatuses(ctx context.Context, client client.Client, dgd *nvidiacomv1alpha1.DynamoGraphDeployment) (bool, string, map[string]v1alpha1.ServiceReplicaStatus) {
logger := log.FromContext(ctx)
var notReadyComponents []string
serviceStatuses := make(map[string]v1alpha1.ServiceReplicaStatus, len(dgd.Spec.Services))
for serviceName, component := range dgd.Spec.Services {
numberOfNodes := component.GetNumberOfNodes()
isMultinode := numberOfNodes > 1
isMultinode := component.GetNumberOfNodes() > 1
resourceName := fmt.Sprintf("%s-0-%s", dgd.Name, strings.ToLower(serviceName))
if isMultinode {
// Check PodCliqueScalingGroup: spec.replicas == status.availableReplicas
if ok, reason := checkPCSGReady(ctx, client, resourceName, dgd.Namespace, logger); !ok {
ok, reason, serviceStatus := checkPCSGReady(ctx, client, resourceName, dgd.Namespace, logger)
serviceStatuses[serviceName] = serviceStatus
if !ok {
notReadyComponents = append(notReadyComponents, fmt.Sprintf("pcsg/%s: %s", resourceName, reason))
}
} else {
// Check PodClique: spec.replicas == status.readyReplicas
if ok, reason := checkPodCliqueReady(ctx, client, resourceName, dgd.Namespace, logger); !ok {
ok, reason, serviceStatus := checkPodCliqueReady(ctx, client, resourceName, dgd.Namespace, logger)
serviceStatuses[serviceName] = serviceStatus
if !ok {
notReadyComponents = append(notReadyComponents, fmt.Sprintf("podclique/%s: %s", resourceName, reason))
}
}
}
if len(notReadyComponents) > 0 {
return false, strings.Join(notReadyComponents, "; ")
return false, strings.Join(notReadyComponents, "; "), serviceStatuses
}
return true, ""
return true, "", serviceStatuses
}
// checkPodCliqueReady checks if a PodClique has spec.replicas == status.readyReplicas
func checkPodCliqueReady(ctx context.Context, client client.Client, resourceName, namespace string, logger logr.Logger) (bool, string) {
func checkPodCliqueReady(ctx context.Context, client client.Client, resourceName, namespace string, logger logr.Logger) (bool, string, v1alpha1.ServiceReplicaStatus) {
podClique := &grovev1alpha1.PodClique{}
err := client.Get(ctx, types.NamespacedName{Name: resourceName, Namespace: namespace}, podClique)
if err != nil {
if errors.IsNotFound(err) {
logger.V(2).Info("PodClique not found", "resourceName", resourceName)
return false, "resource not found"
return false, "resource not found", v1alpha1.ServiceReplicaStatus{}
}
logger.V(1).Info("Failed to get PodClique", "error", err, "resourceName", resourceName)
return false, fmt.Sprintf("get error: %v", err)
return false, fmt.Sprintf("get error: %v", err), v1alpha1.ServiceReplicaStatus{}
}
desiredReplicas := podClique.Spec.Replicas
readyReplicas := podClique.Status.ReadyReplicas
serviceStatus := v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: resourceName,
Replicas: podClique.Status.Replicas,
UpdatedReplicas: podClique.Status.UpdatedReplicas,
ReadyReplicas: &readyReplicas,
}
if desiredReplicas == 0 {
// No replicas desired, so it's ready
return true, ""
return true, "", serviceStatus
}
if desiredReplicas != readyReplicas {
logger.V(1).Info("PodClique not ready", "resourceName", resourceName, "desired", desiredReplicas, "ready", readyReplicas)
return false, fmt.Sprintf("desired=%d, ready=%d", desiredReplicas, readyReplicas)
return false, fmt.Sprintf("desired=%d, ready=%d", desiredReplicas, readyReplicas), serviceStatus
}
return true, ""
return true, "", serviceStatus
}
// checkPCSGReady checks if a PodCliqueScalingGroup has spec.replicas == status.availableReplicas
func checkPCSGReady(ctx context.Context, client client.Client, resourceName, namespace string, logger logr.Logger) (bool, string) {
func checkPCSGReady(ctx context.Context, client client.Client, resourceName, namespace string, logger logr.Logger) (bool, string, v1alpha1.ServiceReplicaStatus) {
pcsg := &grovev1alpha1.PodCliqueScalingGroup{}
err := client.Get(ctx, types.NamespacedName{Name: resourceName, Namespace: namespace}, pcsg)
if err != nil {
if errors.IsNotFound(err) {
logger.V(2).Info("PodCliqueScalingGroup not found", "resourceName", resourceName)
return false, "resource not found"
return false, "resource not found", v1alpha1.ServiceReplicaStatus{}
}
logger.V(1).Info("Failed to get PodCliqueScalingGroup", "error", err, "resourceName", resourceName)
return false, fmt.Sprintf("get error: %v", err)
return false, fmt.Sprintf("get error: %v", err), v1alpha1.ServiceReplicaStatus{}
}
desiredReplicas := pcsg.Spec.Replicas
availableReplicas := pcsg.Status.AvailableReplicas
serviceStatus := v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: resourceName,
Replicas: pcsg.Status.Replicas,
UpdatedReplicas: pcsg.Status.UpdatedReplicas,
AvailableReplicas: &availableReplicas,
}
if desiredReplicas == 0 {
// No replicas desired, so it's ready
return true, ""
return true, "", serviceStatus
}
if desiredReplicas != availableReplicas {
logger.V(1).Info("PodCliqueScalingGroup not ready", "resourceName", resourceName, "desired", desiredReplicas, "available", availableReplicas)
return false, fmt.Sprintf("desired=%d, available=%d", desiredReplicas, availableReplicas)
return false, fmt.Sprintf("desired=%d, available=%d", desiredReplicas, availableReplicas), serviceStatus
}
return true, ""
return true, "", serviceStatus
}
// resolveKaiSchedulerQueueName extracts the queue name from annotations or returns default
......
......@@ -6,14 +6,20 @@ import (
"testing"
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
v1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
dynamicfake "k8s.io/client-go/dynamic/fake"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)
func TestResolveKaiSchedulerQueueName(t *testing.T) {
......@@ -311,3 +317,328 @@ func TestEnsureQueueExists(t *testing.T) {
})
}
}
// Test_GetComponentReadinessAndServiceReplicaStatuses runs
// GetComponentReadinessAndServiceReplicaStatuses against a fake client seeded
// with Grove PodClique / PodCliqueScalingGroup objects and checks the returned
// readiness flag, not-ready reason, and per-service replica statuses.
func Test_GetComponentReadinessAndServiceReplicaStatuses(t *testing.T) {
	ctx := context.Background()

	// svc builds one entry of spec.services. Pass a nil multinode for the
	// services the cases below expect to be backed by a PodClique.
	svc := func(name, componentType string, replicas int32, multinode *v1alpha1.MultinodeSpec) *v1alpha1.DynamoComponentDeploymentSharedSpec {
		return &v1alpha1.DynamoComponentDeploymentSharedSpec{
			ServiceName:     name,
			DynamoNamespace: ptr.To("default"),
			ComponentType:   componentType,
			Replicas:        ptr.To(replicas),
			Multinode:       multinode,
		}
	}

	// clique builds a PodClique fixture in the "default" namespace.
	clique := func(name string, desired, total, updated, ready int32) *grovev1alpha1.PodClique {
		return &grovev1alpha1.PodClique{
			ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "default"},
			Spec:       grovev1alpha1.PodCliqueSpec{Replicas: desired},
			Status: grovev1alpha1.PodCliqueStatus{
				Replicas:        total,
				UpdatedReplicas: updated,
				ReadyReplicas:   ready,
			},
		}
	}

	// scalingGroup builds a PodCliqueScalingGroup fixture in the "default" namespace.
	scalingGroup := func(name string, desired, total, updated, available int32) *grovev1alpha1.PodCliqueScalingGroup {
		return &grovev1alpha1.PodCliqueScalingGroup{
			ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "default"},
			Spec:       grovev1alpha1.PodCliqueScalingGroupSpec{Replicas: desired},
			Status: grovev1alpha1.PodCliqueScalingGroupStatus{
				Replicas:          total,
				UpdatedReplicas:   updated,
				AvailableReplicas: available,
			},
		}
	}

	// cliqueStatus is the status expected for a PodClique-backed service.
	cliqueStatus := func(name string, total, updated, ready int32) v1alpha1.ServiceReplicaStatus {
		return v1alpha1.ServiceReplicaStatus{
			ComponentKind:   v1alpha1.ComponentKindPodClique,
			ComponentName:   name,
			Replicas:        total,
			UpdatedReplicas: updated,
			ReadyReplicas:   ptr.To(ready),
		}
	}

	// scalingGroupStatus is the status expected for a PCSG-backed service.
	scalingGroupStatus := func(name string, total, updated, available int32) v1alpha1.ServiceReplicaStatus {
		return v1alpha1.ServiceReplicaStatus{
			ComponentKind:     v1alpha1.ComponentKindPodCliqueScalingGroup,
			ComponentName:     name,
			Replicas:          total,
			UpdatedReplicas:   updated,
			AvailableReplicas: ptr.To(available),
		}
	}

	type testCase struct {
		name      string
		services  map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec
		resources []client.Object
		ready     bool
		reason    string
		statuses  map[string]v1alpha1.ServiceReplicaStatus
	}

	cases := []testCase{
		{
			name: "single-node service not ready - PodClique not ready",
			services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
				"frontend": svc("frontend", string(commonconsts.ComponentTypeFrontend), 2, nil),
			},
			resources: []client.Object{
				clique("test-dgd-0-frontend", 2, 2, 2, 1),
			},
			ready:  false,
			reason: "podclique/test-dgd-0-frontend: desired=2, ready=1",
			statuses: map[string]v1alpha1.ServiceReplicaStatus{
				"frontend": cliqueStatus("test-dgd-0-frontend", 2, 2, 1),
			},
		},
		{
			name: "all multinode services ready - all PCSGs ready",
			services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
				"decode":  svc("decode", string(commonconsts.ComponentTypeDecode), 2, &v1alpha1.MultinodeSpec{NodeCount: 2}),
				"prefill": svc("prefill", string(commonconsts.ComponentTypePrefill), 3, &v1alpha1.MultinodeSpec{NodeCount: 4}),
			},
			resources: []client.Object{
				scalingGroup("test-dgd-0-decode", 2, 2, 2, 2),
				scalingGroup("test-dgd-0-prefill", 3, 3, 3, 3),
			},
			ready:  true,
			reason: "",
			statuses: map[string]v1alpha1.ServiceReplicaStatus{
				"decode":  scalingGroupStatus("test-dgd-0-decode", 2, 2, 2),
				"prefill": scalingGroupStatus("test-dgd-0-prefill", 3, 3, 3),
			},
		},
		{
			name: "multinode service not ready - PCSG not ready",
			services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
				"worker": svc("worker", string(commonconsts.ComponentTypeWorker), 2, &v1alpha1.MultinodeSpec{NodeCount: 4}),
			},
			resources: []client.Object{
				scalingGroup("test-dgd-0-worker", 2, 2, 2, 1),
			},
			ready:  false,
			reason: "pcsg/test-dgd-0-worker: desired=2, available=1",
			statuses: map[string]v1alpha1.ServiceReplicaStatus{
				"worker": scalingGroupStatus("test-dgd-0-worker", 2, 2, 1),
			},
		},
		{
			name: "mixed services - some ready, some not - combination of PodClique and PCSG",
			services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
				"frontend": svc("frontend", string(commonconsts.ComponentTypeFrontend), 1, nil),
				"decode":   svc("decode", string(commonconsts.ComponentTypeDecode), 2, &v1alpha1.MultinodeSpec{NodeCount: 2}),
				"prefill":  svc("prefill", string(commonconsts.ComponentTypePrefill), 2, &v1alpha1.MultinodeSpec{NodeCount: 2}),
			},
			resources: []client.Object{
				clique("test-dgd-0-frontend", 1, 1, 1, 1),
				scalingGroup("test-dgd-0-decode", 2, 2, 2, 1),
				scalingGroup("test-dgd-0-prefill", 2, 2, 2, 2),
			},
			ready:  false,
			reason: "pcsg/test-dgd-0-decode: desired=2, available=1",
			statuses: map[string]v1alpha1.ServiceReplicaStatus{
				"frontend": cliqueStatus("test-dgd-0-frontend", 1, 1, 1),
				"decode":   scalingGroupStatus("test-dgd-0-decode", 2, 2, 1),
				"prefill":  scalingGroupStatus("test-dgd-0-prefill", 2, 2, 2),
			},
		},
		{
			name: "service resource not found - PodClique missing",
			services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
				"frontend": svc("frontend", string(commonconsts.ComponentTypeFrontend), 1, nil),
			},
			resources: []client.Object{},
			ready:     false,
			reason:    "podclique/test-dgd-0-frontend: resource not found",
			// A missing resource still yields a map entry, holding the zero value.
			statuses: map[string]v1alpha1.ServiceReplicaStatus{
				"frontend": {},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			g := gomega.NewGomegaWithT(t)

			// Register the DGD and Grove API types on the shared client-go scheme.
			sch := scheme.Scheme
			g.Expect(v1alpha1.AddToScheme(sch)).NotTo(gomega.HaveOccurred())
			g.Expect(grovev1alpha1.AddToScheme(sch)).NotTo(gomega.HaveOccurred())

			dgd := &v1alpha1.DynamoGraphDeployment{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-dgd",
					Namespace: "default",
				},
				Spec: v1alpha1.DynamoGraphDeploymentSpec{Services: tc.services},
			}

			// Seed the fake client with the DGD followed by the case's Grove resources.
			seed := make([]client.Object, 0, 1+len(tc.resources))
			seed = append(seed, dgd)
			seed = append(seed, tc.resources...)

			builder := fake.NewClientBuilder().WithScheme(sch)
			builder = builder.WithObjects(seed...)
			builder = builder.WithStatusSubresource(seed...)
			kubeClient := builder.Build()

			gotReady, gotReason, gotStatuses := GetComponentReadinessAndServiceReplicaStatuses(ctx, kubeClient, dgd)

			g.Expect(gotReady).To(gomega.Equal(tc.ready))
			g.Expect(gotReason).To(gomega.Equal(tc.reason))
			g.Expect(gotStatuses).To(gomega.Equal(tc.statuses))
		})
	}
}
......@@ -67,6 +67,26 @@ _Appears in:_
#### ComponentKind
_Underlying type:_ _string_
ComponentKind represents the type of underlying Kubernetes resource.
_Validation:_
- Enum: [PodClique PodCliqueScalingGroup Deployment LeaderWorkerSet]
_Appears in:_
- [ServiceReplicaStatus](#servicereplicastatus)
| Field | Description |
| --- | --- |
| `PodClique` | ComponentKindPodClique represents a PodClique resource.<br /> |
| `PodCliqueScalingGroup` | ComponentKindPodCliqueScalingGroup represents a PodCliqueScalingGroup resource.<br /> |
| `Deployment` | ComponentKindDeployment represents a Deployment resource.<br /> |
| `LeaderWorkerSet` | ComponentKindLeaderWorkerSet represents a LeaderWorkerSet resource.<br /> |
#### ConfigMapKeySelector
......@@ -430,6 +450,7 @@ _Appears in:_
| --- | --- | --- | --- |
| `state` _string_ | State is a high-level textual status of the graph deployment lifecycle. | | |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the graph deployment.<br />The slice is merged by type on patch updates. | | |
| `services` _object (keys:string, values:[ServiceReplicaStatus](#servicereplicastatus))_ | Services contains per-service replica status information.<br />The map key is the service name from spec.services. | | |
#### DynamoModel
......@@ -739,6 +760,27 @@ _Appears in:_
| `disable` _boolean_ | Disable indicates whether the ScalingAdapter should be disabled for this service.<br />When false (default), a DGDSA is created and owns the replicas field.<br />When true, no DGDSA is created and replicas can be modified directly in the DGD. | false | |
#### ServiceReplicaStatus
ServiceReplicaStatus contains replica information for a single service.
_Appears in:_
- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `componentKind` _[ComponentKind](#componentkind)_ | ComponentKind is the underlying resource kind (e.g., "PodClique", "PodCliqueScalingGroup", "Deployment", "LeaderWorkerSet"). | | Enum: [PodClique PodCliqueScalingGroup Deployment LeaderWorkerSet] <br /> |
| `componentName` _string_ | ComponentName is the name of the underlying resource. | | |
| `replicas` _integer_ | Replicas is the total number of non-terminated replicas.<br />Required for all component kinds. | | Minimum: 0 <br /> |
| `updatedReplicas` _integer_ | UpdatedReplicas is the number of replicas at the current/desired revision.<br />Required for all component kinds. | | Minimum: 0 <br /> |
| `readyReplicas` _integer_ | ReadyReplicas is the number of ready replicas.<br />Populated for PodClique, Deployment, and LeaderWorkerSet.<br />Not available for PodCliqueScalingGroup.<br />When nil, the field is omitted from the API response. | | Minimum: 0 <br /> |
| `availableReplicas` _integer_ | AvailableReplicas is the number of available replicas.<br />For Deployment: replicas ready for >= minReadySeconds.<br />For PodCliqueScalingGroup: replicas where all constituent PodCliques have >= MinAvailable ready pods.<br />Not available for PodClique or LeaderWorkerSet.<br />When nil, the field is omitted from the API response. | | Minimum: 0 <br /> |
#### SharedMemorySpec
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment