Unverified Commit 97f79537 authored by Thomas Montfort's avatar Thomas Montfort Committed by GitHub
Browse files

feat(operator): managed rolling updates for DGD worker deployments (#6110)


Signed-off-by: tmontfort <tmontfort@nvidia.com>
parent 0d9eb99d
......@@ -11171,8 +11171,19 @@ spec:
- LeaderWorkerSet
type: string
componentName:
description: ComponentName is the name of the underlying resource.
description: |-
ComponentName is the name of the primary underlying resource.
DEPRECATED: Use ComponentNames instead. This field will be removed in a future release.
During rolling updates, this reflects the new (target) component name.
type: string
componentNames:
description: |-
ComponentNames is the list of underlying resource names for this service.
During normal operation, this contains a single name.
During rolling updates, this contains both old and new component names.
items:
type: string
type: array
readyReplicas:
description: |-
ReadyReplicas is the number of ready replicas.
......
......@@ -11386,6 +11386,37 @@ spec:
description: Phase is the phase of the restart.
type: string
type: object
rollingUpdate:
description: |-
RollingUpdate tracks the progress of operator-managed rolling updates.
Currently only supported for single-node, non-Grove deployments (DCD/Deployment).
properties:
endTime:
description: EndTime is when the rolling update completed (successfully or failed).
format: date-time
type: string
phase:
description: Phase indicates the current phase of the rolling update.
enum:
- Pending
- InProgress
- Completed
- Failed
- ""
type: string
startTime:
description: StartTime is when the rolling update began.
format: date-time
type: string
updatedServices:
description: |-
UpdatedServices is the list of services that have completed the rolling update.
A service is considered updated when its new replicas are all ready and old replicas are fully scaled down.
Only services of componentType Worker (or Prefill/Decode) are considered.
items:
type: string
type: array
type: object
services:
additionalProperties:
description: ServiceReplicaStatus contains replica information for a single service.
......@@ -11409,8 +11440,19 @@ spec:
- LeaderWorkerSet
type: string
componentName:
description: ComponentName is the name of the underlying resource.
description: |-
ComponentName is the name of the primary underlying resource.
DEPRECATED: Use ComponentNames instead. This field will be removed in a future release.
During rolling updates, this reflects the new (target) component name.
type: string
componentNames:
description: |-
ComponentNames is the list of underlying resource names for this service.
During normal operation, this contains a single name.
During rolling updates, this contains both old and new component names.
items:
type: string
type: array
readyReplicas:
description: |-
ReadyReplicas is the number of ready replicas.
......
......@@ -109,7 +109,6 @@ type DynamoGraphDeploymentStatus struct {
// The map key is the service name from spec.services.
// +optional
Services map[string]ServiceReplicaStatus `json:"services,omitempty"`
// Restart contains the status of the restart of the graph deployment.
// +optional
Restart *RestartStatus `json:"restart,omitempty"`
......@@ -117,6 +116,10 @@ type DynamoGraphDeploymentStatus struct {
// The map key is the service name from spec.services.
// +optional
Checkpoints map[string]ServiceCheckpointStatus `json:"checkpoints,omitempty"`
// RollingUpdate tracks the progress of operator-managed rolling updates.
// Currently only supported for single-node, non-Grove deployments (DCD/Deployment).
// +optional
RollingUpdate *RollingUpdateStatus `json:"rollingUpdate,omitempty"`
}
// ServiceCheckpointStatus contains checkpoint information for a single service.
......@@ -151,15 +154,58 @@ const (
RestartPhaseRestarting RestartPhase = "Restarting"
RestartPhaseCompleted RestartPhase = "Completed"
RestartPhaseFailed RestartPhase = "Failed"
RestartPhaseSuperseded RestartPhase = "Superseded"
)
// RollingUpdatePhase represents the current phase of a rolling update.
// +kubebuilder:validation:Enum=Pending;InProgress;Completed;Failed;""
type RollingUpdatePhase string

// Values of RollingUpdatePhase. Every value admitted by the Enum marker on
// RollingUpdatePhase has a named constant here, so callers never need to
// spell a phase as a raw string literal.
const (
	// RollingUpdatePhasePending is the "Pending" phase.
	RollingUpdatePhasePending RollingUpdatePhase = "Pending"
	// RollingUpdatePhaseInProgress is the "InProgress" phase.
	RollingUpdatePhaseInProgress RollingUpdatePhase = "InProgress"
	// RollingUpdatePhaseCompleted is the "Completed" phase.
	RollingUpdatePhaseCompleted RollingUpdatePhase = "Completed"
	// RollingUpdatePhaseFailed is the "Failed" phase. The value is already
	// accepted by the Enum validation marker; this constant gives it a name.
	RollingUpdatePhaseFailed RollingUpdatePhase = "Failed"
	// RollingUpdatePhaseNone is the empty phase (the zero value of the type).
	RollingUpdatePhaseNone RollingUpdatePhase = ""
)
// RollingUpdateStatus tracks the progress of a rolling update.
//
// It is referenced through the optional RollingUpdate pointer field of
// DynamoGraphDeploymentStatus (JSON key "rollingUpdate"). Every field here is
// optional and is omitted from the serialized form when empty; the two
// timestamps are pointers so that they can be left unset.
type RollingUpdateStatus struct {
// Phase indicates the current phase of the rolling update.
// +optional
Phase RollingUpdatePhase `json:"phase,omitempty"`
// StartTime is when the rolling update began.
// +optional
StartTime *metav1.Time `json:"startTime,omitempty"`
// EndTime is when the rolling update completed (successfully or failed).
// +optional
EndTime *metav1.Time `json:"endTime,omitempty"`
// UpdatedServices is the list of services that have completed the rolling update.
// A service is considered updated when its new replicas are all ready and old replicas are fully scaled down.
// Only services of componentType Worker (or Prefill/Decode) are considered.
// +optional
UpdatedServices []string `json:"updatedServices,omitempty"`
}
// ServiceReplicaStatus contains replica information for a single service.
type ServiceReplicaStatus struct {
// ComponentKind is the underlying resource kind (e.g., "PodClique", "PodCliqueScalingGroup", "Deployment", "LeaderWorkerSet").
ComponentKind ComponentKind `json:"componentKind"`
// ComponentName is the name of the underlying resource.
// ComponentName is the name of the primary underlying resource.
// DEPRECATED: Use ComponentNames instead. This field will be removed in a future release.
// During rolling updates, this reflects the new (target) component name.
// +kubebuilder:deprecatedversion:warning="ComponentName is deprecated, view ComponentNames instead"
ComponentName string `json:"componentName"`
// ComponentNames is the list of underlying resource names for this service.
// During normal operation, this contains a single name.
// During rolling updates, this contains both old and new component names.
// +optional
ComponentNames []string `json:"componentNames,omitempty"`
// Replicas is the total number of non-terminated replicas.
// Required for all component kinds.
// +kubebuilder:validation:Minimum=0
......@@ -264,11 +310,6 @@ func (s *DynamoGraphDeployment) HasAnyMultinodeService() bool {
return false
}
// GetDynamoNamespaceForService resolves the Dynamo namespace for the given
// service of this graph deployment. The result is whatever
// ComputeDynamoNamespace derives from the service's GlobalDynamoNamespace
// setting together with this deployment's Kubernetes namespace and name.
func (s *DynamoGraphDeployment) GetDynamoNamespaceForService(service *DynamoComponentDeploymentSharedSpec) string {
	globalSetting := service.GlobalDynamoNamespace
	k8sNamespace := s.GetNamespace()
	deploymentName := s.GetName()
	return ComputeDynamoNamespace(globalSetting, k8sNamespace, deploymentName)
}
// HasEPPService returns true if any service in the DGD has EPP component type
func (dgd *DynamoGraphDeployment) HasEPPService() bool {
for _, component := range dgd.Spec.Services {
......@@ -279,6 +320,11 @@ func (dgd *DynamoGraphDeployment) HasEPPService() bool {
return false
}
// GetDynamoNamespaceForService resolves the Dynamo namespace for the given
// service of this graph deployment. The result is whatever
// ComputeDynamoNamespace derives from the service's GlobalDynamoNamespace
// setting together with this deployment's Kubernetes namespace and name.
func (s *DynamoGraphDeployment) GetDynamoNamespaceForService(service *DynamoComponentDeploymentSharedSpec) string {
	globalSetting := service.GlobalDynamoNamespace
	k8sNamespace := s.GetNamespace()
	deploymentName := s.GetName()
	return ComputeDynamoNamespace(globalSetting, k8sNamespace, deploymentName)
}
// GetEPPService returns the EPP service name and spec if present
func (dgd *DynamoGraphDeployment) GetEPPService() (string, *DynamoComponentDeploymentSharedSpec, bool) {
for serviceName, component := range dgd.Spec.Services {
......
......@@ -968,6 +968,11 @@ func (in *DynamoGraphDeploymentStatus) DeepCopyInto(out *DynamoGraphDeploymentSt
(*out)[key] = val
}
}
if in.RollingUpdate != nil {
in, out := &in.RollingUpdate, &out.RollingUpdate
*out = new(RollingUpdateStatus)
(*in).DeepCopyInto(*out)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentStatus.
......@@ -1491,6 +1496,34 @@ func (in *RestartStrategy) DeepCopy() *RestartStrategy {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
//
// NOTE(review): this function is marked autogenerated — change the type and
// regenerate rather than editing the code by hand.
func (in *RollingUpdateStatus) DeepCopyInto(out *RollingUpdateStatus) {
// Start with a shallow copy; this fully covers the plain-value Phase field.
*out = *in
// The pointer and slice fields below still alias in's memory after the
// shallow copy, so each non-nil one is replaced with an independent copy.
if in.StartTime != nil {
in, out := &in.StartTime, &out.StartTime
*out = (*in).DeepCopy()
}
if in.EndTime != nil {
in, out := &in.EndTime, &out.EndTime
*out = (*in).DeepCopy()
}
if in.UpdatedServices != nil {
in, out := &in.UpdatedServices, &out.UpdatedServices
// Allocate a fresh backing array of the same length, then copy the elements.
*out = make([]string, len(*in))
copy(*out, *in)
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RollingUpdateStatus.
//
// A nil receiver yields nil; otherwise a newly allocated RollingUpdateStatus
// is filled via DeepCopyInto and returned.
func (in *RollingUpdateStatus) DeepCopy() *RollingUpdateStatus {
// Nil-safe: callers may invoke DeepCopy on an unset optional status pointer.
if in == nil {
return nil
}
out := new(RollingUpdateStatus)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ScalingAdapter) DeepCopyInto(out *ScalingAdapter) {
*out = *in
......@@ -1549,6 +1582,11 @@ func (in *ServiceCheckpointStatus) DeepCopy() *ServiceCheckpointStatus {
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ServiceReplicaStatus) DeepCopyInto(out *ServiceReplicaStatus) {
*out = *in
if in.ComponentNames != nil {
in, out := &in.ComponentNames, &out.ComponentNames
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.ReadyReplicas != nil {
in, out := &in.ReadyReplicas, &out.ReadyReplicas
*out = new(int32)
......
......@@ -11171,8 +11171,19 @@ spec:
- LeaderWorkerSet
type: string
componentName:
description: ComponentName is the name of the underlying resource.
description: |-
ComponentName is the name of the primary underlying resource.
DEPRECATED: Use ComponentNames instead. This field will be removed in a future release.
During rolling updates, this reflects the new (target) component name.
type: string
componentNames:
description: |-
ComponentNames is the list of underlying resource names for this service.
During normal operation, this contains a single name.
During rolling updates, this contains both old and new component names.
items:
type: string
type: array
readyReplicas:
description: |-
ReadyReplicas is the number of ready replicas.
......
......@@ -11386,6 +11386,37 @@ spec:
description: Phase is the phase of the restart.
type: string
type: object
rollingUpdate:
description: |-
RollingUpdate tracks the progress of operator-managed rolling updates.
Currently only supported for single-node, non-Grove deployments (DCD/Deployment).
properties:
endTime:
description: EndTime is when the rolling update completed (successfully or failed).
format: date-time
type: string
phase:
description: Phase indicates the current phase of the rolling update.
enum:
- Pending
- InProgress
- Completed
- Failed
- ""
type: string
startTime:
description: StartTime is when the rolling update began.
format: date-time
type: string
updatedServices:
description: |-
UpdatedServices is the list of services that have completed the rolling update.
A service is considered updated when its new replicas are all ready and old replicas are fully scaled down.
Only services of componentType Worker (or Prefill/Decode) are considered.
items:
type: string
type: array
type: object
services:
additionalProperties:
description: ServiceReplicaStatus contains replica information for a single service.
......@@ -11409,8 +11440,19 @@ spec:
- LeaderWorkerSet
type: string
componentName:
description: ComponentName is the name of the underlying resource.
description: |-
ComponentName is the name of the primary underlying resource.
DEPRECATED: Use ComponentNames instead. This field will be removed in a future release.
During rolling updates, this reflects the new (target) component name.
type: string
componentNames:
description: |-
ComponentNames is the list of underlying resource names for this service.
During normal operation, this contains a single name.
During rolling updates, this contains both old and new component names.
items:
type: string
type: array
readyReplicas:
description: |-
ReadyReplicas is the number of ready replicas.
......
......@@ -54,6 +54,7 @@ const (
KubeAnnotationDynamoBaseModel = "nvidia.com/dynamo-base-model"
KubeLabelDynamoDiscoveryBackend = "nvidia.com/dynamo-discovery-backend"
KubeLabelDynamoDiscoveryEnabled = "nvidia.com/dynamo-discovery-enabled"
KubeLabelDynamoWorkerHash = "nvidia.com/dynamo-worker-hash"
KubeLabelValueFalse = "false"
KubeLabelValueTrue = "true"
......@@ -62,10 +63,12 @@ const (
KubeResourceGPUNvidia = "nvidia.com/gpu"
DynamoDeploymentConfigEnvVar = "DYN_DEPLOYMENT_CONFIG"
DynamoNamespaceEnvVar = "DYN_NAMESPACE"
DynamoComponentEnvVar = "DYN_COMPONENT"
DynamoDiscoveryBackendEnvVar = "DYN_DISCOVERY_BACKEND"
DynamoDeploymentConfigEnvVar = "DYN_DEPLOYMENT_CONFIG"
DynamoNamespaceEnvVar = "DYN_NAMESPACE"
DynamoNamespacePrefixEnvVar = "DYN_NAMESPACE_PREFIX"
DynamoNamespaceWorkerSuffixEnvVar = "DYN_NAMESPACE_WORKER_SUFFIX"
DynamoComponentEnvVar = "DYN_COMPONENT"
DynamoDiscoveryBackendEnvVar = "DYN_DISCOVERY_BACKEND"
GlobalDynamoNamespace = "dynamo"
......@@ -177,6 +180,14 @@ const (
AnnotationDynParentDGDName = "nvidia.com/dyn-parent-dgd-name"
AnnotationDynParentDGDNS = "nvidia.com/dyn-parent-dgd-namespace"
AnnotationDynDiscoveryBackend = "nvidia.com/dyn-discovery-backend"
// Rolling update annotations
AnnotationCurrentWorkerHash = "nvidia.com/current-worker-hash"
// LegacyWorkerHash is a sentinel value used during migration from pre-rolling-update
// operator versions. Legacy worker DCDs (those without a worker hash label) are tagged
// with this value so the existing rolling update machinery can manage the transition.
LegacyWorkerHash = "legacy"
)
type MultinodeDeploymentType string
......
......@@ -24,6 +24,7 @@ import (
"fmt"
"maps"
"os"
"slices"
"time"
appsv1 "k8s.io/api/apps/v1"
......@@ -321,6 +322,7 @@ func (r *DynamoComponentDeploymentReconciler) reconcileDeploymentResources(ctx c
serviceReplicaStatus := &v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: deployment.Name,
ComponentNames: []string{deployment.Name},
Replicas: deployment.Status.Replicas,
UpdatedReplicas: deployment.Status.UpdatedReplicas,
ReadyReplicas: &deployment.Status.ReadyReplicas,
......@@ -498,6 +500,7 @@ func getLeaderWorkerSetReplicasStatus(leaderWorkerSet *leaderworkersetv1.LeaderW
return v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindLeaderWorkerSet,
ComponentName: leaderWorkerSet.Name,
ComponentNames: []string{leaderWorkerSet.Name},
Replicas: leaderWorkerSet.Status.Replicas,
UpdatedReplicas: leaderWorkerSet.Status.UpdatedReplicas,
ReadyReplicas: &leaderWorkerSet.Status.ReadyReplicas,
......@@ -514,14 +517,18 @@ func combineLWSReplicaStatuses(serviceReplicaStatuses []v1alpha1.ServiceReplicaS
if firstServiceStatus.ReadyReplicas != nil {
readyReplicas = *firstServiceStatus.ReadyReplicas
}
allNames := append([]string{}, firstServiceStatus.ComponentNames...)
for _, serviceReplicaStatus := range serviceReplicaStatuses[1:] {
firstServiceStatus.Replicas += serviceReplicaStatus.Replicas
firstServiceStatus.UpdatedReplicas += serviceReplicaStatus.UpdatedReplicas
if serviceReplicaStatus.ReadyReplicas != nil {
readyReplicas += *serviceReplicaStatus.ReadyReplicas
}
allNames = append(allNames, serviceReplicaStatus.ComponentNames...)
}
slices.Sort(allNames)
firstServiceStatus.ComponentNames = allNames
firstServiceStatus.ReadyReplicas = &readyReplicas
return &firstServiceStatus
}
......
......@@ -634,7 +634,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
ComponentType: string(commonconsts.ComponentTypeWorker),
SubComponentType: "test-sub-component",
ServiceName: "test-lws-deploy-service",
DynamoNamespace: &[]string{"default"}[0],
DynamoNamespace: &[]string{"default-test-lws-deploy"}[0],
Multinode: &v1alpha1.MultinodeSpec{
NodeCount: 2,
},
......@@ -1360,6 +1360,7 @@ func Test_reconcileLeaderWorkerSetResources(t *testing.T) {
serviceReplicaStatus: &v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindLeaderWorkerSet,
ComponentName: "test-component-0",
ComponentNames: []string{"test-component-0"},
ReadyReplicas: ptr.To(int32(1)),
UpdatedReplicas: 1,
Replicas: 1,
......@@ -1439,6 +1440,7 @@ func Test_reconcileLeaderWorkerSetResources(t *testing.T) {
serviceReplicaStatus: &v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindLeaderWorkerSet,
ComponentName: "test-component-0",
ComponentNames: []string{"test-component-0", "test-component-1", "test-component-2"},
ReadyReplicas: ptr.To(int32(2)),
UpdatedReplicas: 2,
Replicas: 3,
......@@ -1518,6 +1520,7 @@ func Test_reconcileLeaderWorkerSetResources(t *testing.T) {
serviceReplicaStatus: &v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindLeaderWorkerSet,
ComponentName: "test-component-0",
ComponentNames: []string{"test-component-0", "test-component-1", "test-component-2"},
ReadyReplicas: ptr.To(int32(3)),
UpdatedReplicas: 3,
Replicas: 3,
......@@ -1662,6 +1665,7 @@ func Test_reconcileDeploymentResources(t *testing.T) {
serviceReplicaStatus: &v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-component",
ComponentNames: []string{"test-component"},
Replicas: 2,
UpdatedReplicas: 2,
ReadyReplicas: ptr.To(int32(2)),
......@@ -1703,6 +1707,7 @@ func Test_reconcileDeploymentResources(t *testing.T) {
serviceReplicaStatus: &v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-component",
ComponentNames: []string{"test-component"},
Replicas: 1,
UpdatedReplicas: 1,
ReadyReplicas: ptr.To(int32(1)),
......
......@@ -187,6 +187,40 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
}
}
if r.supportsManagedRollingUpdate(dynamoDeployment) {
if err = r.initializeWorkerHashIfNeeded(ctx, dynamoDeployment); err != nil {
logger.Error(err, "Failed to initialize worker hash")
reason = "failed_to_initialize_worker_hash"
return ctrl.Result{}, err
}
if r.isRollingUpdateInProgress(dynamoDeployment) || r.shouldTriggerRollingUpdate(dynamoDeployment) {
if err = r.reconcileRollingUpdate(ctx, dynamoDeployment); err != nil {
logger.Error(err, "Failed to reconcile rolling update")
state = DGDStateFailed
reason = Reason("RollingUpdateFailed")
message = Message(err.Error())
return ctrl.Result{}, err
}
}
} else {
// For unsupported pathways, log if a rolling update would have been triggered
if r.shouldTriggerRollingUpdate(dynamoDeployment) {
logger.Info("Worker spec change detected but rolling update not supported for this pathway",
"isGrove", r.isGrovePathway(dynamoDeployment),
"hasMultinode", dynamoDeployment.HasAnyMultinodeService())
r.Recorder.Event(dynamoDeployment, corev1.EventTypeWarning, "RollingUpdateNotSupported",
"Worker spec changed but custom rolling updates are not supported for Grove/multinode deployments")
// Update the hash to prevent repeated warnings
hash := dynamo.ComputeDGDWorkersSpecHash(dynamoDeployment)
r.setCurrentWorkerHash(dynamoDeployment, hash)
if updateErr := r.Update(ctx, dynamoDeployment); updateErr != nil {
logger.Error(updateErr, "Failed to update worker hash for unsupported pathway")
}
}
}
reconcileResult, err := r.reconcileResources(ctx, dynamoDeployment)
state = reconcileResult.State
......@@ -201,6 +235,21 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
return ctrl.Result{}, err
}
// Override state based on rolling update status if a rolling update is in progress
if dynamoDeployment.Status.RollingUpdate != nil {
switch dynamoDeployment.Status.RollingUpdate.Phase {
case nvidiacomv1alpha1.RollingUpdatePhaseCompleted:
// Keep the reconcileResult state (should be Ready if resources are ready)
case nvidiacomv1alpha1.RollingUpdatePhasePending, nvidiacomv1alpha1.RollingUpdatePhaseInProgress:
// Rolling update in progress - resources are being transitioned
if state != DGDStateFailed {
state = DGDStatePending
reason = "rolling_update_in_progress"
message = "Rolling update in progress"
}
}
}
return ctrl.Result{}, nil
}
......@@ -403,7 +452,8 @@ func isRestartAlreadyProcessed(dgd *nvidiacomv1alpha1.DynamoGraphDeployment) boo
if dgd.Spec.Restart.ID == dgd.Status.Restart.ObservedID &&
(dgd.Status.Restart.Phase == nvidiacomv1alpha1.RestartPhaseCompleted ||
dgd.Status.Restart.Phase == nvidiacomv1alpha1.RestartPhaseFailed) {
dgd.Status.Restart.Phase == nvidiacomv1alpha1.RestartPhaseFailed ||
dgd.Status.Restart.Phase == nvidiacomv1alpha1.RestartPhaseSuperseded) {
return true
}
......@@ -610,7 +660,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
if component.Ingress != nil {
ingressSpec = *component.Ingress
}
mainComponentIngress := dynamo.GenerateComponentIngress(ctx, dynamo.GetDynamoComponentName(dynamoDeployment, componentName), dynamoDeployment.Namespace, ingressSpec)
mainComponentIngress := dynamo.GenerateComponentIngress(ctx, dynamo.GetDCDResourceName(dynamoDeployment, componentName, ""), dynamoDeployment.Namespace, ingressSpec)
_, syncedMainComponentIngress, err := commoncontroller.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*networkingv1.Ingress, bool, error) {
if !ingressSpec.Enabled || ingressSpec.IngressControllerClassName == nil {
logger.Info("Ingress is not enabled")
......@@ -634,7 +684,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
}
// generate the main component virtual service
if r.Config.IngressConfig.UseVirtualService() {
mainComponentVirtualService := dynamo.GenerateComponentVirtualService(ctx, dynamo.GetDynamoComponentName(dynamoDeployment, componentName), dynamoDeployment.Namespace, ingressSpec)
mainComponentVirtualService := dynamo.GenerateComponentVirtualService(ctx, dynamo.GetDCDResourceName(dynamoDeployment, componentName, ""), dynamoDeployment.Namespace, ingressSpec)
_, syncedMainComponentVirtualService, err := commoncontroller.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*networkingv1beta1.VirtualService, bool, error) {
if !ingressSpec.IsVirtualServiceEnabled() {
logger.Info("VirtualService is not enabled")
......@@ -826,17 +876,27 @@ func (r *DynamoGraphDeploymentReconciler) computeRestartStatus(ctx context.Conte
// No restart requested
if dgd.Spec.Restart == nil || dgd.Spec.Restart.ID == "" {
// Preserve existing terminal status
if dgd.Status.Restart != nil && (dgd.Status.Restart.Phase == nvidiacomv1alpha1.RestartPhaseCompleted || dgd.Status.Restart.Phase == nvidiacomv1alpha1.RestartPhaseFailed) {
if dgd.Status.Restart != nil && (dgd.Status.Restart.Phase == nvidiacomv1alpha1.RestartPhaseCompleted || dgd.Status.Restart.Phase == nvidiacomv1alpha1.RestartPhaseFailed || dgd.Status.Restart.Phase == nvidiacomv1alpha1.RestartPhaseSuperseded) {
return dgd.Status.Restart
}
return nil
}
// If restart was already processed (completed or failed), return existing status
// If restart was already processed (completed, failed, or superseded), return existing status
if isRestartAlreadyProcessed(dgd) {
return dgd.Status.Restart
}
// Supersede restart if a rolling update is in progress
if r.isRollingUpdateInProgress(dgd) {
r.Recorder.Eventf(dgd, corev1.EventTypeWarning, "RestartSuperseded",
"Restart %s superseded by rolling update", dgd.Spec.Restart.ID)
return &nvidiacomv1alpha1.RestartStatus{
ObservedID: dgd.Spec.Restart.ID,
Phase: nvidiacomv1alpha1.RestartPhaseSuperseded,
}
}
order := dynamo.GetRestartOrder(dgd)
if dynamo.IsParallelRestart(dgd) {
......@@ -848,7 +908,7 @@ func (r *DynamoGraphDeploymentReconciler) computeRestartStatus(ctx context.Conte
// checkComponentServiceFullyUpdated checks if a DynamoComponentDeployment is fully updated.
//
// The DCD to inspect is identified by the name that dynamo.GetDCDResourceName
// produces for serviceName using the DGD's current worker hash, looked up in
// the DGD's namespace. The two results are passed through unchanged from
// checkDCDReady (presumably a readiness flag plus a human-readable reason —
// confirm against checkDCDReady).
func (r *DynamoGraphDeploymentReconciler) checkComponentServiceFullyUpdated(ctx context.Context, dgd *nvidiacomv1alpha1.DynamoGraphDeployment, serviceName string) (bool, string) {
	// Resolve the name exactly once, via the hash-aware helper. A second,
	// earlier `resourceName :=` based on GetDynamoComponentName was dead code
	// and an illegal redeclaration in the same scope.
	resourceName := dynamo.GetDCDResourceName(dgd, serviceName, r.getCurrentWorkerHash(dgd))
	return checkDCDReady(ctx, r.Client, resourceName, dgd.Namespace)
}
......@@ -961,44 +1021,79 @@ func (r *DynamoGraphDeploymentReconciler) reconcileDynamoComponentsDeployments(c
resources := []Resource{}
logger := log.FromContext(ctx)
defaultIngressSpec := dynamo.GenerateDefaultIngressSpec(dynamoDeployment, r.Config.IngressConfig)
rollingUpdateCtx := r.buildRollingUpdateContext(ctx, dynamoDeployment)
existingRestartAnnotations, err := r.getExistingRestartAnnotationsDCD(ctx, dynamoDeployment)
if err != nil {
logger.Error(err, "failed to get existing restart annotations")
return ReconcileResult{}, fmt.Errorf("failed to get existing restart annotations: %w", err)
}
if rollingUpdateCtx.InProgress() {
logger.Info("Rolling update in progress",
"newWorkerHash", rollingUpdateCtx.NewWorkerHash,
"oldWorkerReplicas", rollingUpdateCtx.OldWorkerReplicas)
}
// generate the dynamoComponentsDeployments from the config
defaultIngressSpec := dynamo.GenerateDefaultIngressSpec(dynamoDeployment, r.Config.IngressConfig)
dynamoComponentsDeployments, err := dynamo.GenerateDynamoComponentsDeployments(ctx, dynamoDeployment, &defaultIngressSpec, restartState, existingRestartAnnotations)
// Generate all DCDs (handles both normal and rolling update cases)
dynamoComponentsDeployments, err := dynamo.GenerateDynamoComponentsDeployments(
ctx, dynamoDeployment, &defaultIngressSpec, restartState, existingRestartAnnotations, rollingUpdateCtx,
)
if err != nil {
logger.Error(err, "failed to generate the DynamoComponentsDeployments")
return ReconcileResult{}, fmt.Errorf("failed to generate the DynamoComponentsDeployments: %w", err)
}
// reconcile the dynamoComponentsDeployments
for serviceName, dynamoComponentDeployment := range dynamoComponentsDeployments {
logger.Info("Reconciling the DynamoComponentDeployment", "serviceName", serviceName, "dynamoComponentDeployment", dynamoComponentDeployment)
_, dynamoComponentDeployment, err = commoncontroller.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoComponentDeployment, bool, error) {
return dynamoComponentDeployment, false, nil
// Sync all generated DCDs
for key, dcd := range dynamoComponentsDeployments {
logger.Info("Reconciling DynamoComponentDeployment", "key", key, "name", dcd.Name)
_, syncedDCD, err := commoncontroller.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoComponentDeployment, bool, error) {
return dcd, false, nil
})
if err != nil {
logger.Error(err, "failed to sync the DynamoComponentDeployment")
logger.Error(err, "failed to sync the DynamoComponentDeployment", "name", dcd.Name)
return ReconcileResult{}, fmt.Errorf("failed to sync the DynamoComponentDeployment: %w", err)
}
resources = append(resources, dynamoComponentDeployment)
resources = append(resources, syncedDCD)
}
// During rolling update, scale old worker DCDs via direct patching.
// This is done separately from DCD generation to avoid overwriting the old spec
// with the new spec (which would trigger an unwanted rolling update on old workers).
if rollingUpdateCtx.InProgress() {
if err := r.scaleOldWorkerDCDs(ctx, dynamoDeployment, rollingUpdateCtx); err != nil {
logger.Error(err, "failed to scale old worker DCDs")
return ReconcileResult{}, fmt.Errorf("failed to scale old worker DCDs: %w", err)
}
}
// Check resource readiness
result := r.checkResourcesReadiness(resources)
// During rolling updates, aggregate old worker service statuses into the result
// so that Replicas, ReadyReplicas, etc. reflect the total across old and new DCDs.
if rollingUpdateCtx.InProgress() {
oldWorkerStatuses, err := r.aggregateOldWorkerServiceStatuses(ctx, dynamoDeployment, rollingUpdateCtx)
if err != nil {
logger.Error(err, "failed to aggregate old worker service statuses")
// Non-fatal: continue with partial status
} else if len(oldWorkerStatuses) > 0 {
mergeWorkerServiceStatuses(result.ServiceStatus, oldWorkerStatuses)
}
}
return result, nil
}
func (r *DynamoGraphDeploymentReconciler) getExistingRestartAnnotationsDCD(ctx context.Context, dgd *nvidiacomv1alpha1.DynamoGraphDeployment) (map[string]string, error) {
logger := log.FromContext(ctx)
computedHash := dynamo.ComputeDGDWorkersSpecHash(dgd)
restartAnnotations := make(map[string]string)
for serviceName := range dgd.Spec.Services {
dcdName := dynamo.GetDynamoComponentName(dgd, serviceName)
dcdName := dynamo.GetDCDResourceName(dgd, serviceName, computedHash)
existingDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{}
err := r.Get(ctx, types.NamespacedName{Name: dcdName, Namespace: dgd.Namespace}, existingDCD)
......@@ -1437,7 +1532,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileEPPResources(ctx context.Cont
// 2. Reconcile InferencePool
// Note: EPP Service is created automatically by the standard component reconciliation
// via GenerateComponentService() in graph.go (see ComponentTypeEPP case)
eppServiceName := dynamo.GetDynamoComponentName(dgd, componentName)
eppServiceName := dynamo.GetDCDResourceName(dgd, componentName, "")
inferencePool, err := epp.GenerateInferencePool(dgd, componentName, eppServiceName, eppService.EPPConfig)
if err != nil {
logger.Error(err, "Failed to generate EPP InferencePool")
......
......@@ -22,7 +22,6 @@ import (
"testing"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
grovev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
......@@ -157,7 +156,7 @@ func TestDynamoGraphDeploymentReconciler_reconcileScalingAdapters(t *testing.T)
Name: "test-dgd-frontend",
Namespace: "default",
Labels: map[string]string{
consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
},
OwnerReferences: []metav1.OwnerReference{
{
......@@ -181,7 +180,7 @@ func TestDynamoGraphDeploymentReconciler_reconcileScalingAdapters(t *testing.T)
Name: "test-dgd-removed",
Namespace: "default",
Labels: map[string]string{
consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
},
OwnerReferences: []metav1.OwnerReference{
{
......@@ -230,7 +229,7 @@ func TestDynamoGraphDeploymentReconciler_reconcileScalingAdapters(t *testing.T)
Name: "test-dgd-frontend",
Namespace: "default",
Labels: map[string]string{
consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
},
OwnerReferences: []metav1.OwnerReference{
{
......@@ -412,6 +411,7 @@ func Test_reconcileGroveResources(t *testing.T) {
"frontend": {
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "test-dgd-0-frontend",
ComponentNames: []string{"test-dgd-0-frontend"},
Replicas: 2,
UpdatedReplicas: 2,
ReadyReplicas: ptr.To(int32(2)),
......@@ -474,6 +474,7 @@ func Test_reconcileGroveResources(t *testing.T) {
"frontend": {
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "test-dgd-0-frontend",
ComponentNames: []string{"test-dgd-0-frontend"},
Replicas: 1,
UpdatedReplicas: 1,
ReadyReplicas: ptr.To(int32(1)),
......@@ -481,6 +482,7 @@ func Test_reconcileGroveResources(t *testing.T) {
"decode": {
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "test-dgd-0-decode",
ComponentNames: []string{"test-dgd-0-decode"},
Replicas: 2,
UpdatedReplicas: 1,
ReadyReplicas: ptr.To(int32(1)),
......@@ -549,6 +551,7 @@ func Test_reconcileGroveResources(t *testing.T) {
"decode": {
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "test-dgd-0-decode",
ComponentNames: []string{"test-dgd-0-decode"},
Replicas: 1,
UpdatedReplicas: 1,
AvailableReplicas: ptr.To(int32(1)),
......@@ -556,6 +559,7 @@ func Test_reconcileGroveResources(t *testing.T) {
"prefill": {
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "test-dgd-0-prefill",
ComponentNames: []string{"test-dgd-0-prefill"},
Replicas: 1,
UpdatedReplicas: 1,
AvailableReplicas: ptr.To(int32(1)),
......@@ -621,6 +625,7 @@ func Test_reconcileGroveResources(t *testing.T) {
"frontend": {
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "test-dgd-0-frontend",
ComponentNames: []string{"test-dgd-0-frontend"},
Replicas: 1,
UpdatedReplicas: 1,
ReadyReplicas: ptr.To(int32(1)),
......@@ -628,6 +633,7 @@ func Test_reconcileGroveResources(t *testing.T) {
"aggregated": {
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "test-dgd-0-aggregated",
ComponentNames: []string{"test-dgd-0-aggregated"},
Replicas: 2,
UpdatedReplicas: 2,
AvailableReplicas: ptr.To(int32(1)),
......@@ -1336,6 +1342,114 @@ func Test_computeRestartStatus(t *testing.T) {
InProgress: []string{"frontend"}, // Reset to FIRST service
},
},
{
name: "rolling update in progress + new restart request - superseded",
dgdSpec: v1alpha1.DynamoGraphDeploymentSpec{
Restart: &v1alpha1.Restart{
ID: newID,
},
Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
"frontend": {
Replicas: ptr.To(int32(1)),
},
},
},
dgdStatus: v1alpha1.DynamoGraphDeploymentStatus{
RollingUpdate: &v1alpha1.RollingUpdateStatus{
Phase: v1alpha1.RollingUpdatePhaseInProgress,
},
},
wantRestartStatus: &v1alpha1.RestartStatus{
ObservedID: newID,
Phase: v1alpha1.RestartPhaseSuperseded,
},
},
{
name: "rolling update pending + restart already in progress - superseded",
dgdSpec: v1alpha1.DynamoGraphDeploymentSpec{
Restart: &v1alpha1.Restart{
ID: newID,
},
Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
"frontend": {
Replicas: ptr.To(int32(1)),
},
},
},
dgdStatus: v1alpha1.DynamoGraphDeploymentStatus{
Restart: &v1alpha1.RestartStatus{
ObservedID: oldID,
Phase: v1alpha1.RestartPhaseRestarting,
InProgress: []string{"frontend"},
},
RollingUpdate: &v1alpha1.RollingUpdateStatus{
Phase: v1alpha1.RollingUpdatePhasePending,
},
},
wantRestartStatus: &v1alpha1.RestartStatus{
ObservedID: newID,
Phase: v1alpha1.RestartPhaseSuperseded,
},
},
{
name: "rolling update completed + restart request - normal processing",
dgdSpec: v1alpha1.DynamoGraphDeploymentSpec{
Restart: &v1alpha1.Restart{
ID: newID,
},
Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
"frontend": {
Replicas: ptr.To(int32(1)),
},
},
},
dgdStatus: v1alpha1.DynamoGraphDeploymentStatus{
RollingUpdate: &v1alpha1.RollingUpdateStatus{
Phase: v1alpha1.RollingUpdatePhaseCompleted,
},
},
wantRestartStatus: &v1alpha1.RestartStatus{
ObservedID: newID,
Phase: v1alpha1.RestartPhaseRestarting,
InProgress: []string{"frontend"},
},
},
{
name: "restart already processed as superseded - returns existing status",
dgdSpec: v1alpha1.DynamoGraphDeploymentSpec{
Restart: &v1alpha1.Restart{
ID: newID,
},
Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
"frontend": {
Replicas: ptr.To(int32(1)),
},
},
},
dgdStatus: v1alpha1.DynamoGraphDeploymentStatus{
Restart: &v1alpha1.RestartStatus{
ObservedID: newID,
Phase: v1alpha1.RestartPhaseSuperseded,
},
},
wantRestartStatus: &v1alpha1.RestartStatus{
ObservedID: newID,
Phase: v1alpha1.RestartPhaseSuperseded,
},
},
{
name: "no restart requested but has superseded status - preserves status",
dgdStatus: v1alpha1.DynamoGraphDeploymentStatus{
Restart: &v1alpha1.RestartStatus{
ObservedID: oldID,
Phase: v1alpha1.RestartPhaseSuperseded,
},
},
wantRestartStatus: &v1alpha1.RestartStatus{
ObservedID: oldID,
Phase: v1alpha1.RestartPhaseSuperseded,
},
},
}
for _, tt := range tests {
......@@ -1577,7 +1691,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
&v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-decode",
Name: "test-dgd-decode-e1f2a6fe",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
......@@ -1596,7 +1710,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
Service: &v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-dgd-decode-deployment",
ComponentName: "test-dgd-decode-e1f2a6fe-deployment",
Replicas: 2,
UpdatedReplicas: 2,
ReadyReplicas: ptr.To(int32(2)),
......@@ -1606,7 +1720,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
&v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-prefill",
Name: "test-dgd-prefill-e1f2a6fe",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
......@@ -1625,7 +1739,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
Service: &v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-dgd-prefill-deployment",
ComponentName: "test-dgd-prefill-e1f2a6fe-deployment",
Replicas: 3,
UpdatedReplicas: 3,
ReadyReplicas: ptr.To(int32(3)),
......@@ -1649,7 +1763,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
"decode": {
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-dgd-decode-deployment",
ComponentName: "test-dgd-decode-e1f2a6fe-deployment",
Replicas: 2,
UpdatedReplicas: 2,
ReadyReplicas: ptr.To(int32(2)),
......@@ -1657,7 +1771,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
"prefill": {
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-dgd-prefill-deployment",
ComponentName: "test-dgd-prefill-e1f2a6fe-deployment",
Replicas: 3,
UpdatedReplicas: 3,
ReadyReplicas: ptr.To(int32(3)),
......@@ -1723,7 +1837,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
&v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-decode",
Name: "test-dgd-decode-e1f2a6fe",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
......@@ -1742,7 +1856,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
Service: &v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-dgd-decode-deployment",
ComponentName: "test-dgd-decode-e1f2a6fe-deployment",
Replicas: 2,
UpdatedReplicas: 1,
ReadyReplicas: ptr.To(int32(1)),
......@@ -1752,7 +1866,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
&v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-prefill",
Name: "test-dgd-prefill-e1f2a6fe",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
......@@ -1771,7 +1885,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
Service: &v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-dgd-prefill-deployment",
ComponentName: "test-dgd-prefill-e1f2a6fe-deployment",
Replicas: 3,
UpdatedReplicas: 3,
ReadyReplicas: ptr.To(int32(3)),
......@@ -1783,7 +1897,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
wantReconcileResult: ReconcileResult{
State: DGDStatePending,
Reason: "some_resources_are_not_ready",
Message: "Resources not ready: test-dgd-decode: Component deployment not ready - Available condition not true",
Message: "Resources not ready: test-dgd-decode-e1f2a6fe: Component deployment not ready - Available condition not true",
ServiceStatus: map[string]v1alpha1.ServiceReplicaStatus{
"frontend": {
ComponentKind: v1alpha1.ComponentKindDeployment,
......@@ -1795,7 +1909,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
"decode": {
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-dgd-decode-deployment",
ComponentName: "test-dgd-decode-e1f2a6fe-deployment",
Replicas: 2,
UpdatedReplicas: 1,
ReadyReplicas: ptr.To(int32(1)),
......@@ -1803,7 +1917,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
"prefill": {
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-dgd-prefill-deployment",
ComponentName: "test-dgd-prefill-e1f2a6fe-deployment",
Replicas: 3,
UpdatedReplicas: 3,
ReadyReplicas: ptr.To(int32(3)),
......@@ -1863,7 +1977,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
&v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-decode",
Name: "test-dgd-decode-5f3d46ba",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
......@@ -1882,7 +1996,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
Service: &v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-dgd-decode-deployment",
ComponentName: "test-dgd-decode-5f3d46ba-deployment",
Replicas: 2,
UpdatedReplicas: 1,
ReadyReplicas: ptr.To(int32(1)),
......@@ -1894,7 +2008,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
wantReconcileResult: ReconcileResult{
State: DGDStatePending,
Reason: "some_resources_are_not_ready",
Message: "Resources not ready: test-dgd-decode: Component deployment not ready - Available condition not true; test-dgd-frontend: Component deployment not ready - Available condition not true",
Message: "Resources not ready: test-dgd-decode-5f3d46ba: Component deployment not ready - Available condition not true; test-dgd-frontend: Component deployment not ready - Available condition not true",
ServiceStatus: map[string]v1alpha1.ServiceReplicaStatus{
"frontend": {
ComponentKind: v1alpha1.ComponentKindDeployment,
......@@ -1906,7 +2020,7 @@ func Test_reconcileDynamoComponentsDeployments(t *testing.T) {
},
"decode": {
ComponentKind: v1alpha1.ComponentKindDeployment,
ComponentName: "test-dgd-decode-deployment",
ComponentName: "test-dgd-decode-5f3d46ba-deployment",
Replicas: 2,
UpdatedReplicas: 1,
ReadyReplicas: ptr.To(int32(1)),
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"context"
"fmt"
"slices"
"sort"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo"
)
// shouldTriggerRollingUpdate reports whether the worker spec hash computed from
// the DGD differs from the hash currently recorded in the DGD annotations.
//
// It returns false when no hash has been recorded yet: with nothing stored
// there is no previous worker generation to roll over from.
func (r *DynamoGraphDeploymentReconciler) shouldTriggerRollingUpdate(
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
) bool {
	desiredHash := dynamo.ComputeDGDWorkersSpecHash(dgd)
	recordedHash := r.getCurrentWorkerHash(dgd)

	// Empty recorded hash => new deployment, never a rolling update.
	return recordedHash != "" && recordedHash != desiredHash
}
// initializeWorkerHashIfNeeded records the current worker hash annotation on a
// DGD that does not have one yet. It is a no-op when a hash is already stored.
//
// Two cases are handled:
//   - No legacy worker DCDs exist: the hash computed from the current spec is
//     stored and the DGD is updated.
//   - Worker DCDs without the worker hash label exist (created by an operator
//     version that predates managed rolling updates): each one is patched with
//     the consts.LegacyWorkerHash label, the same sentinel is stored on the DGD,
//     and a LegacyMigrationStarted event is emitted. Because the sentinel differs
//     from the computed hash, a later reconcile is expected to detect the
//     mismatch and roll the workers over.
//
// Any list, patch or update failure is returned wrapped.
func (r *DynamoGraphDeploymentReconciler) initializeWorkerHashIfNeeded(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
) error {
	logger := log.FromContext(ctx)

	// A stored hash means initialization already happened.
	if r.getCurrentWorkerHash(dgd) != "" {
		return nil
	}

	legacyDCDs, err := r.findLegacyWorkerDCDs(ctx, dgd)
	if err != nil {
		return fmt.Errorf("failed to check for legacy worker DCDs: %w", err)
	}

	// Normal first deploy: store the hash computed from the current spec.
	if len(legacyDCDs) == 0 {
		hash := dynamo.ComputeDGDWorkersSpecHash(dgd)
		r.setCurrentWorkerHash(dgd, hash)
		if err := r.Update(ctx, dgd); err != nil {
			return fmt.Errorf("failed to initialize worker hash: %w", err)
		}
		logger.Info("Initialized current worker hash", "hash", hash)
		return nil
	}

	logger.Info("Found legacy worker DCDs without hash label, initiating migration",
		"count", len(legacyDCDs))

	// Backfill the hash label so the legacy DCDs can be selected by worker hash
	// like any other generation.
	for i := range legacyDCDs {
		legacy := &legacyDCDs[i]
		// Snapshot before mutating so the merge patch only carries the label change.
		unmodified := legacy.DeepCopy()
		if legacy.Labels == nil {
			legacy.Labels = make(map[string]string)
		}
		legacy.Labels[consts.KubeLabelDynamoWorkerHash] = consts.LegacyWorkerHash
		if err := r.Patch(ctx, legacy, client.MergeFrom(unmodified)); err != nil {
			return fmt.Errorf("failed to backfill hash label on legacy DCD %s: %w", legacy.Name, err)
		}
		logger.Info("Backfilled worker hash label on legacy DCD",
			"dcdName", legacy.Name, "hash", consts.LegacyWorkerHash)
	}

	// Store the sentinel hash; it will not match the computed hash, which is
	// what later drives the "legacy" -> computed-hash rolling update.
	r.setCurrentWorkerHash(dgd, consts.LegacyWorkerHash)
	if err := r.Update(ctx, dgd); err != nil {
		return fmt.Errorf("failed to set legacy worker hash: %w", err)
	}
	r.Recorder.Eventf(dgd, corev1.EventTypeNormal, "LegacyMigrationStarted",
		"Detected %d legacy worker DCDs, initiating rolling update migration", len(legacyDCDs))
	return nil
}
// findLegacyWorkerDCDs lists the DCDs in the DGD's namespace that carry the
// DGD-name label for this DGD and returns those that are worker components
// with a missing or empty worker hash label.
//
// Such DCDs are treated as having been created by an operator version that
// predates managed rolling updates. The result is nil when none are found.
func (r *DynamoGraphDeploymentReconciler) findLegacyWorkerDCDs(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
) ([]nvidiacomv1alpha1.DynamoComponentDeployment, error) {
	var all nvidiacomv1alpha1.DynamoComponentDeploymentList
	err := r.List(ctx, &all,
		client.InNamespace(dgd.Namespace),
		client.MatchingLabels{
			consts.KubeLabelDynamoGraphDeploymentName: dgd.Name,
		},
	)
	if err != nil {
		return nil, fmt.Errorf("failed to list DCDs for DGD %s: %w", dgd.Name, err)
	}

	var legacyDCDs []nvidiacomv1alpha1.DynamoComponentDeployment
	for i := range all.Items {
		candidate := all.Items[i]
		isWorker := dynamo.IsWorkerComponent(candidate.Spec.ComponentType)
		// A worker without a hash label value is considered legacy.
		if isWorker && candidate.Labels[consts.KubeLabelDynamoWorkerHash] == "" {
			legacyDCDs = append(legacyDCDs, candidate)
		}
	}
	return legacyDCDs, nil
}
// supportsManagedRollingUpdate reports whether this DGD can use operator
// managed rolling updates.
//
// It returns false for DGDs on the Grove pathway and for DGDs that have any
// multinode service; those are left to their default rolling update mechanism.
func (r *DynamoGraphDeploymentReconciler) supportsManagedRollingUpdate(
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
) bool {
	if r.isGrovePathway(dgd) {
		return false
	}
	if dgd.HasAnyMultinodeService() {
		return false
	}
	return true
}
// getCurrentWorkerHash returns the worker hash stored in the DGD annotations
// under consts.AnnotationCurrentWorkerHash.
//
// While a rolling update is running this is still the previous worker hash;
// it is only overwritten once the rolling update completes.
// An empty string means no hash has been stored (new deployment).
func (r *DynamoGraphDeploymentReconciler) getCurrentWorkerHash(
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
) string {
	// Indexing a nil map yields the zero value, so a DGD without any
	// annotations naturally produces "".
	return dgd.Annotations[consts.AnnotationCurrentWorkerHash]
}
// setCurrentWorkerHash writes hash into the DGD annotations under
// consts.AnnotationCurrentWorkerHash, allocating the annotation map if needed.
//
// Only the in-memory object is modified; persisting it is the caller's job.
func (r *DynamoGraphDeploymentReconciler) setCurrentWorkerHash(
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
	hash string,
) {
	annotations := dgd.Annotations
	if annotations == nil {
		annotations = make(map[string]string)
		dgd.Annotations = annotations
	}
	annotations[consts.AnnotationCurrentWorkerHash] = hash
}
// getOrCreateRollingUpdateStatus returns the DGD's rolling update status.
//
// When the status is not set yet, a new one with phase
// RollingUpdatePhaseNone is attached to the in-memory DGD and returned.
func (r *DynamoGraphDeploymentReconciler) getOrCreateRollingUpdateStatus(
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
) *nvidiacomv1alpha1.RollingUpdateStatus {
	if existing := dgd.Status.RollingUpdate; existing != nil {
		return existing
	}

	created := &nvidiacomv1alpha1.RollingUpdateStatus{
		Phase: nvidiacomv1alpha1.RollingUpdatePhaseNone,
	}
	dgd.Status.RollingUpdate = created
	return created
}
// isRollingUpdateInProgress reports whether the DGD's rolling update status
// exists and is in the Pending or InProgress phase.
func (r *DynamoGraphDeploymentReconciler) isRollingUpdateInProgress(
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
) bool {
	status := dgd.Status.RollingUpdate
	if status == nil {
		return false
	}

	switch status.Phase {
	case nvidiacomv1alpha1.RollingUpdatePhasePending,
		nvidiacomv1alpha1.RollingUpdatePhaseInProgress:
		return true
	default:
		return false
	}
}
// reconcileRollingUpdate advances the rolling update state machine by one step.
//
// Before dispatching on the phase it handles two special situations:
//   - Phase is Completed but the stored hash differs from the computed hash.
//     If DCDs with the computed hash already report ready workers, only the
//     stored annotation is refreshed. Otherwise the status is reset to phase
//     None so that a fresh rolling update cycle starts below.
//   - Phase is InProgress but the stored and computed hashes already match.
//     The rolling update is treated as stuck and is completed right away.
//
// Phase dispatch:
//   - None:       start a rolling update (moves to Pending).
//   - Pending:    move to InProgress and persist the status.
//   - InProgress: evaluate progress via continueRollingUpdate.
//   - Completed:  nothing to do.
//
// Any other phase value is left untouched and nil is returned.
func (r *DynamoGraphDeploymentReconciler) reconcileRollingUpdate(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
) error {
	logger := log.FromContext(ctx)

	status := r.getOrCreateRollingUpdateStatus(dgd)
	newWorkerHash := dynamo.ComputeDGDWorkersSpecHash(dgd)
	prevWorkerHash := r.getCurrentWorkerHash(dgd)

	logger.Info("Reconciling rolling update",
		"phase", status.Phase,
		"prevWorkerHash", prevWorkerHash,
		"newWorkerHash", newWorkerHash)

	hashChanged := prevWorkerHash != newWorkerHash

	if hashChanged && status.Phase == nvidiacomv1alpha1.RollingUpdatePhaseCompleted {
		// DCDs for the computed hash that already have ready workers mean the
		// annotation is merely stale: refresh it without a new rollout.
		// A lookup error is not fatal here; it falls through to a new cycle.
		newInfo, err := r.getWorkerInfoForWorkerHash(ctx, dgd, newWorkerHash)
		if err == nil && newInfo.TotalReadyWorkers() > 0 {
			logger.Info("Updating stale worker hash annotation",
				"prevWorkerHash", prevWorkerHash, "newHash", newWorkerHash)
			r.setCurrentWorkerHash(dgd, newWorkerHash)
			return r.Update(ctx, dgd)
		}

		// Genuine spec change: wipe the finished cycle's bookkeeping so the
		// phase dispatch below starts a new one.
		logger.Info("New worker spec change detected, starting new rolling update cycle",
			"prevWorkerHash", prevWorkerHash, "newHash", newWorkerHash,
			"previousPhase", status.Phase)
		status.Phase = nvidiacomv1alpha1.RollingUpdatePhaseNone
		status.StartTime = nil
		status.EndTime = nil
		status.UpdatedServices = nil
	}

	if !hashChanged && status.Phase == nvidiacomv1alpha1.RollingUpdatePhaseInProgress {
		logger.Info("Detected stuck rolling update: hashes match but phase is InProgress",
			"hash", newWorkerHash,
			"phase", status.Phase)
		return r.completeRollingUpdate(ctx, dgd, status, newWorkerHash)
	}

	switch status.Phase {
	case nvidiacomv1alpha1.RollingUpdatePhaseNone:
		return r.startRollingUpdate(ctx, dgd, status, newWorkerHash)

	case nvidiacomv1alpha1.RollingUpdatePhaseInProgress:
		return r.continueRollingUpdate(ctx, dgd, status, newWorkerHash)

	case nvidiacomv1alpha1.RollingUpdatePhasePending:
		status.Phase = nvidiacomv1alpha1.RollingUpdatePhaseInProgress
		if err := r.Status().Update(ctx, dgd); err != nil {
			return fmt.Errorf("failed to update rolling update status to InProgress: %w", err)
		}

	case nvidiacomv1alpha1.RollingUpdatePhaseCompleted:
		// Cleanup already happened in completeRollingUpdate.
		logger.Info("Rolling update already completed")
	}

	return nil
}
// startRollingUpdate begins a new rolling update cycle.
//
// It sets the status to phase Pending with StartTime set to now and an empty
// UpdatedServices list, emits a RollingUpdateStarted event, and persists the
// status. A failed status update is returned wrapped.
func (r *DynamoGraphDeploymentReconciler) startRollingUpdate(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
	rollingUpdateStatus *nvidiacomv1alpha1.RollingUpdateStatus,
	newWorkerHash string,
) error {
	previousHash := r.getCurrentWorkerHash(dgd)
	log.FromContext(ctx).Info("Starting rolling update",
		"prevHash", previousHash,
		"newHash", newWorkerHash)

	startedAt := metav1.Now()
	rollingUpdateStatus.UpdatedServices = nil
	rollingUpdateStatus.StartTime = &startedAt
	rollingUpdateStatus.Phase = nvidiacomv1alpha1.RollingUpdatePhasePending

	r.Recorder.Eventf(dgd, corev1.EventTypeNormal, "RollingUpdateStarted",
		"Starting rolling update from worker hash %s to %s", previousHash, newWorkerHash)

	err := r.Status().Update(ctx, dgd)
	if err == nil {
		return nil
	}
	return fmt.Errorf("failed to initialize rolling update status: %w", err)
}
// continueRollingUpdate evaluates an in-progress rolling update and either
// completes it or persists the current per-service progress.
//
// A worker service counts as updated when the DCD for newWorkerHash reports at
// least the desired number of ready replicas (spec.Replicas, defaulting to 1)
// and the old DCDs for that service report no ready replicas. The rolling
// update completes once every worker service in the spec is updated.
//
// Failures to list old or new worker DCDs are returned to the caller instead
// of being replaced by empty data. With an empty "old" view every service
// would look fully drained, which could complete the rollout (and delete old
// DCDs) on a transient list error. With an empty "new" view the recorded
// UpdatedServices would be wiped. Returning the error lets the reconcile retry
// with real data.
func (r *DynamoGraphDeploymentReconciler) continueRollingUpdate(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
	rollingUpdateStatus *nvidiacomv1alpha1.RollingUpdateStatus,
	newWorkerHash string,
) error {
	logger := log.FromContext(ctx)

	oldInfo, err := r.getOldWorkerInfo(ctx, dgd, newWorkerHash)
	if err != nil {
		return fmt.Errorf("failed to get old worker info: %w", err)
	}
	newInfo, err := r.getWorkerInfoForWorkerHash(ctx, dgd, newWorkerHash)
	if err != nil {
		return fmt.Errorf("failed to get new worker info: %w", err)
	}

	logger.Info("Rolling update progress",
		"oldReadyWorkers", oldInfo.TotalReadyWorkers(),
		"newReadyWorkers", newInfo.TotalReadyWorkers(),
		"desiredReplicas", r.getDesiredWorkerReplicas(dgd),
		"newWorkerHash", newWorkerHash)

	// Single pass: count the worker services and collect the ones that are done.
	totalWorkerServices := 0
	var updatedServices []string
	for serviceName, spec := range dgd.Spec.Services {
		if spec == nil || !dynamo.IsWorkerComponent(spec.ComponentType) {
			continue
		}
		totalWorkerServices++

		desired := int32(1)
		if spec.Replicas != nil {
			desired = *spec.Replicas
		}

		newSvc := newInfo.services[serviceName]
		oldSvc := oldInfo.services[serviceName]
		newReady := newSvc != nil && newSvc.readyReplicas >= desired
		oldGone := oldSvc == nil || oldSvc.readyReplicas == 0
		if newReady && oldGone {
			updatedServices = append(updatedServices, serviceName)
		}
	}
	// Map iteration order is random; sort so the status is stable across reconciles.
	sort.Strings(updatedServices)
	rollingUpdateStatus.UpdatedServices = updatedServices

	// Complete only when there is at least one worker service and all are updated.
	if totalWorkerServices > 0 && len(updatedServices) == totalWorkerServices {
		return r.completeRollingUpdate(ctx, dgd, rollingUpdateStatus, newWorkerHash)
	}

	// Still rolling: persist the intermediate list of updated services.
	if err := r.Status().Update(ctx, dgd); err != nil {
		return fmt.Errorf("failed to update rolling update status with updated services: %w", err)
	}
	return nil
}
// completeRollingUpdate finalizes a rolling update in one pass:
//
//  1. Delete every worker DCD that does not belong to newWorkerHash. A failure
//     here is logged and reported as a CleanupPartialFailure warning event but
//     does not stop completion.
//  2. Set the status to phase Completed with EndTime set to now and list every
//     worker service from the spec (sorted) as updated.
//  3. Emit a RollingUpdateCompleted event and persist the status.
//  4. Store newWorkerHash as the current worker hash annotation and persist
//     the DGD.
//
// Cleanup is done here rather than in a later reconcile to avoid races with
// subsequent reconciles. Errors from step 3 or 4 are returned wrapped.
func (r *DynamoGraphDeploymentReconciler) completeRollingUpdate(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
	rollingUpdateStatus *nvidiacomv1alpha1.RollingUpdateStatus,
	newWorkerHash string,
) error {
	logger := log.FromContext(ctx)

	// Remove all old generations, however many exist.
	cleanupErr := r.deleteOldWorkerDCDs(ctx, dgd, newWorkerHash)
	if cleanupErr == nil {
		logger.Info("Old resources cleaned up", "newWorkerHash", newWorkerHash)
	} else {
		// Best effort: a cleanup failure must not block completion.
		logger.Error(cleanupErr, "Failed to delete non-current worker DCDs", "newWorkerHash", newWorkerHash)
		r.Recorder.Eventf(dgd, corev1.EventTypeWarning, "CleanupPartialFailure",
			"Failed to delete some old worker DCDs: %v", cleanupErr)
	}

	rollingUpdateStatus.Phase = nvidiacomv1alpha1.RollingUpdatePhaseCompleted
	finishedAt := metav1.Now()
	rollingUpdateStatus.EndTime = &finishedAt

	// On completion every worker service in the spec is reported as updated.
	var workerServiceNames []string
	for name, spec := range dgd.Spec.Services {
		if spec == nil || !dynamo.IsWorkerComponent(spec.ComponentType) {
			continue
		}
		workerServiceNames = append(workerServiceNames, name)
	}
	sort.Strings(workerServiceNames)
	rollingUpdateStatus.UpdatedServices = workerServiceNames

	r.Recorder.Eventf(dgd, corev1.EventTypeNormal, "RollingUpdateCompleted",
		"Rolling update completed, worker hash %s", newWorkerHash)

	if err := r.Status().Update(ctx, dgd); err != nil {
		return fmt.Errorf("failed to update rolling update status: %w", err)
	}

	// Only after the status is persisted is the stored hash moved to the new one.
	r.setCurrentWorkerHash(dgd, newWorkerHash)
	if err := r.Update(ctx, dgd); err != nil {
		return fmt.Errorf("failed to update current worker hash: %w", err)
	}

	logger.Info("Rolling update finalized", "newWorkerHash", newWorkerHash)
	return nil
}
// workerServiceInfo carries the replica counts tracked for one worker service.
type workerServiceInfo struct {
	// readyReplicas is the number of ready replicas reported for the service.
	readyReplicas int32
	// desired is the replica count requested for the service.
	desired int32
}

// dynamoNamespaceWorkerInfo is an aggregated view of worker readiness across
// a set of worker DCDs.
type dynamoNamespaceWorkerInfo struct {
	// totalReadyWorkers is the sum of ready replicas over all worker services.
	totalReadyWorkers int32
	// services holds the per-service counts, keyed by the DCD's service name.
	services map[string]*workerServiceInfo
}

// TotalReadyWorkers returns the sum of ready replicas over all worker services.
func (info *dynamoNamespaceWorkerInfo) TotalReadyWorkers() int32 {
	return info.totalReadyWorkers
}
// getWorkerInfoForWorkerHash lists the DCDs in the DGD's namespace labeled
// with both the DGD name and workerHash, and aggregates the worker ones.
//
// For every worker DCD the result records, keyed by the DCD's service name,
// the ready replicas from its status (0 when unreported) and the desired
// replicas from its spec (0 when unset). totalReadyWorkers is the sum of the
// ready replicas. Non-worker DCDs are ignored. A list failure is returned
// wrapped together with a nil result.
func (r *DynamoGraphDeploymentReconciler) getWorkerInfoForWorkerHash(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
	workerHash string,
) (*dynamoNamespaceWorkerInfo, error) {
	var matching nvidiacomv1alpha1.DynamoComponentDeploymentList
	err := r.List(ctx, &matching,
		client.InNamespace(dgd.Namespace),
		client.MatchingLabels{
			consts.KubeLabelDynamoGraphDeploymentName: dgd.Name,
			consts.KubeLabelDynamoWorkerHash:          workerHash,
		},
	)
	if err != nil {
		return nil, fmt.Errorf("failed to list DCDs: %w", err)
	}

	info := &dynamoNamespaceWorkerInfo{
		services: make(map[string]*workerServiceInfo),
	}
	for i := range matching.Items {
		dcd := &matching.Items[i]
		if !dynamo.IsWorkerComponent(dcd.Spec.ComponentType) {
			continue
		}

		entry := &workerServiceInfo{}
		if svc := dcd.Status.Service; svc != nil && svc.ReadyReplicas != nil {
			entry.readyReplicas = *svc.ReadyReplicas
		}
		if dcd.Spec.Replicas != nil {
			entry.desired = *dcd.Spec.Replicas
		}

		info.services[dcd.Spec.ServiceName] = entry
		info.totalReadyWorkers += entry.readyReplicas
	}
	return info, nil
}
// getOldWorkerInfo sums the ready replicas of every worker DCD returned by
// listOldWorkerDCDs for newWorkerHash, i.e. all non-current generations.
//
// Ready replicas of several old DCDs that share a service name are added
// together under that name; the desired count is not populated. A list
// failure is returned wrapped together with a nil result.
func (r *DynamoGraphDeploymentReconciler) getOldWorkerInfo(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
	newWorkerHash string,
) (*dynamoNamespaceWorkerInfo, error) {
	oldDCDs, err := r.listOldWorkerDCDs(ctx, dgd, newWorkerHash)
	if err != nil {
		return nil, fmt.Errorf("failed to list non-current worker DCDs: %w", err)
	}

	info := &dynamoNamespaceWorkerInfo{
		services: make(map[string]*workerServiceInfo),
	}
	for i := range oldDCDs {
		dcd := &oldDCDs[i]

		var ready int32
		if svc := dcd.Status.Service; svc != nil && svc.ReadyReplicas != nil {
			ready = *svc.ReadyReplicas
		}

		// Create the per-service entry on first sight, then accumulate.
		entry := info.services[dcd.Spec.ServiceName]
		if entry == nil {
			entry = &workerServiceInfo{}
			info.services[dcd.Spec.ServiceName] = entry
		}
		entry.readyReplicas += ready
		info.totalReadyWorkers += ready
	}
	return info, nil
}
// getDesiredWorkerReplicas sums the desired replica counts of all worker
// services in the DGD spec. A worker service without an explicit replica
// count contributes 1; nil and non-worker services contribute nothing.
func (r *DynamoGraphDeploymentReconciler) getDesiredWorkerReplicas(
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
) int32 {
	var total int32
	for _, spec := range dgd.Spec.Services {
		if spec == nil || !dynamo.IsWorkerComponent(spec.ComponentType) {
			continue
		}
		// Unset replicas default to a single replica.
		replicas := int32(1)
		if spec.Replicas != nil {
			replicas = *spec.Replicas
		}
		total += replicas
	}
	return total
}
// scaleOldWorkerDCDs patches spec.replicas on old-generation worker DCDs while
// a rolling update is in progress. It is a no-op otherwise.
//
// For every service that has an entry in rollingUpdateCtx.OldWorkerReplicas,
// that entry is used as a replica budget and handed out to the service's old
// DCDs from newest to oldest by creation time. Each DCD keeps at most its
// current spec replica count (1 when unset); once the budget is used up the
// remaining, older DCDs are set to 0. This mirrors how the Kubernetes
// Deployment controller drains older generations first.
// Services without an entry are left untouched, and DCDs already at their
// target are not patched.
//
// The first list or patch failure is returned wrapped.
func (r *DynamoGraphDeploymentReconciler) scaleOldWorkerDCDs(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
	rollingUpdateCtx dynamo.RollingUpdateContext,
) error {
	logger := log.FromContext(ctx)

	if !rollingUpdateCtx.InProgress() {
		return nil
	}

	oldDCDs, err := r.listOldWorkerDCDs(ctx, dgd, rollingUpdateCtx.NewWorkerHash)
	if err != nil {
		return fmt.Errorf("failed to list old worker DCDs: %w", err)
	}

	// Bucket the old DCDs per service, pointing into oldDCDs so that patches
	// operate on the listed objects themselves.
	dcdsByService := make(map[string][]*nvidiacomv1alpha1.DynamoComponentDeployment)
	for i := range oldDCDs {
		old := &oldDCDs[i]
		dcdsByService[old.Spec.ServiceName] = append(dcdsByService[old.Spec.ServiceName], old)
	}

	for serviceName, generations := range dcdsByService {
		remaining, tracked := rollingUpdateCtx.OldWorkerReplicas[serviceName]
		if !tracked {
			continue
		}

		// Newest first: the most recent old generation keeps replicas longest.
		sort.Slice(generations, func(a, b int) bool {
			return generations[a].CreationTimestamp.After(generations[b].CreationTimestamp.Time)
		})

		for _, dcd := range generations {
			currentReplicas := int32(1)
			if dcd.Spec.Replicas != nil {
				currentReplicas = *dcd.Spec.Replicas
			}

			// Hand out from the budget, never above what the DCD currently runs.
			var desiredReplicas int32
			if remaining > 0 {
				desiredReplicas = min(remaining, currentReplicas)
				remaining -= desiredReplicas
			}

			if desiredReplicas == currentReplicas {
				logger.V(1).Info("Old worker DCD replicas already at desired value",
					"dcdName", dcd.Name, "replicas", desiredReplicas)
				continue
			}

			// Snapshot before mutating so the merge patch only touches replicas
			// and never overwrites the old DCD's spec.
			unmodified := dcd.DeepCopy()
			dcd.Spec.Replicas = &desiredReplicas
			if err := r.Patch(ctx, dcd, client.MergeFrom(unmodified)); err != nil {
				return fmt.Errorf("failed to patch old worker DCD %s replicas: %w", dcd.Name, err)
			}
			logger.Info("Scaled old worker DCD",
				"dcdName", dcd.Name,
				"service", serviceName,
				"oldReplicas", currentReplicas,
				"newReplicas", desiredReplicas)
		}
	}
	return nil
}
// listOldWorkerDCDs lists the DCDs in the DGD's namespace that carry the DGD name
// label, and keeps those that are worker components whose worker hash label differs
// from newWorkerHash. A DCD without the hash label reads as the empty string and is
// therefore kept as well (unless newWorkerHash is itself empty).
//
// The result is nil when nothing matches. List errors are returned unwrapped.
func (r *DynamoGraphDeploymentReconciler) listOldWorkerDCDs(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
	newWorkerHash string,
) ([]nvidiacomv1alpha1.DynamoComponentDeployment, error) {
	var owned nvidiacomv1alpha1.DynamoComponentDeploymentList
	err := r.List(ctx, &owned,
		client.InNamespace(dgd.Namespace),
		client.MatchingLabels{
			consts.KubeLabelDynamoGraphDeploymentName: dgd.Name,
		},
	)
	if err != nil {
		return nil, err
	}

	var stale []nvidiacomv1alpha1.DynamoComponentDeployment
	for i := range owned.Items {
		candidate := owned.Items[i]
		isWorker := dynamo.IsWorkerComponent(candidate.Spec.ComponentType)
		if isWorker && candidate.Labels[consts.KubeLabelDynamoWorkerHash] != newWorkerHash {
			stale = append(stale, candidate)
		}
	}
	return stale, nil
}
// deleteOldWorkerDCDs removes every worker DCD of this DGD whose hash label differs
// from newWorkerHash, i.e. all old generations in one pass.
//
// Deletion is attempted for every candidate even if an earlier one fails. NotFound
// responses are ignored; all other failures are collected and reported together in
// the returned error.
func (r *DynamoGraphDeploymentReconciler) deleteOldWorkerDCDs(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
	newWorkerHash string,
) error {
	logger := log.FromContext(ctx)

	staleDCDs, listErr := r.listOldWorkerDCDs(ctx, dgd, newWorkerHash)
	if listErr != nil {
		return fmt.Errorf("failed to list non-current worker DCDs: %w", listErr)
	}

	staleCount := len(staleDCDs)
	if staleCount == 0 {
		logger.Info("No non-current worker DCDs found to delete", "newWorkerHash", newWorkerHash)
		return nil
	}
	logger.Info("Deleting non-current worker DCDs", "count", staleCount, "newWorkerHash", newWorkerHash)

	var failures []error
	for idx := range staleDCDs {
		target := &staleDCDs[idx]
		logger.Info("Deleting non-current worker DCD", "name", target.Name, "hash", target.Labels[consts.KubeLabelDynamoWorkerHash])

		delErr := r.Delete(ctx, target)
		if delErr == nil || apierrors.IsNotFound(delErr) {
			// Already gone counts as success.
			continue
		}
		failures = append(failures, fmt.Errorf("failed to delete DCD %s: %w", target.Name, delErr))
	}

	if len(failures) == 0 {
		return nil
	}
	return fmt.Errorf("failed to delete %d DCDs: %v", len(failures), failures)
}
// aggregateOldWorkerServiceStatuses builds one ServiceReplicaStatus per service from
// the non-current worker DCDs, keyed by service name.
//
// Only services present in rollingUpdateCtx.OldWorkerReplicas are considered, and DCDs
// that have no service status are skipped. The first DCD seen for a service seeds the
// entry (a copy of its status, with ComponentNames reset to that DCD's component name);
// each further DCD adds its Replicas, ReadyReplicas and AvailableReplicas and appends
// its component name.
func (r *DynamoGraphDeploymentReconciler) aggregateOldWorkerServiceStatuses(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
	rollingUpdateCtx dynamo.RollingUpdateContext,
) (map[string]nvidiacomv1alpha1.ServiceReplicaStatus, error) {
	staleDCDs, err := r.listOldWorkerDCDs(ctx, dgd, rollingUpdateCtx.NewWorkerHash)
	if err != nil {
		return nil, fmt.Errorf("failed to list old worker DCDs for status aggregation: %w", err)
	}

	aggregated := make(map[string]nvidiacomv1alpha1.ServiceReplicaStatus)
	for i := range staleDCDs {
		serviceName := staleDCDs[i].Spec.ServiceName
		svcStatus := staleDCDs[i].Status.Service

		if _, tracked := rollingUpdateCtx.OldWorkerReplicas[serviceName]; !tracked {
			continue
		}
		if svcStatus == nil {
			continue
		}

		merged, seen := aggregated[serviceName]
		if seen {
			// Accumulate across multiple old DCDs
			merged.Replicas += svcStatus.Replicas
			merged.ReadyReplicas = addOptionalInt32(merged.ReadyReplicas, svcStatus.ReadyReplicas)
			merged.AvailableReplicas = addOptionalInt32(merged.AvailableReplicas, svcStatus.AvailableReplicas)
			merged.ComponentNames = append(merged.ComponentNames, svcStatus.ComponentName)
		} else {
			merged = *svcStatus
			merged.ComponentNames = []string{svcStatus.ComponentName}
		}
		// Map values are copies, so the updated struct has to be stored back.
		aggregated[serviceName] = merged
	}
	return aggregated, nil
}
// resolveRollingUpdateParams reads the deployment strategy annotations from a service spec
// and resolves maxSurge and maxUnavailable to concrete replica counts.
// Defaults: maxSurge=25%, maxUnavailable=25% (matches Kubernetes Deployment defaults).
//
// Each annotation may be an integer ("2") or a percentage ("50%"). Percentages are resolved
// against desiredReplicas: rounded up for surge (more aggressive scale-up) and rounded down
// for unavailable (more conservative, matches Kubernetes deployment controller behavior).
// https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#max-unavailable
//
// A missing, empty or unparsable annotation falls back to the 25% default, and a negative
// result is clamped to 0 so that a bad annotation cannot push the caller's replica
// arithmetic below zero or above the desired count. If both values resolve to 0, maxSurge
// is forced to 1 so the rollout can always make progress.
// TODO: support the recreate strategy
func resolveRollingUpdateParams(annotations map[string]string, desiredReplicas int32) (maxSurge int32, maxUnavailable int32) {
	const defaultValue = "25%"
	total := int(desiredReplicas)

	// resolve turns one raw annotation value into a non-negative replica count.
	resolve := func(raw string, roundUp bool) int {
		value := intstr.FromString(defaultValue)
		if raw != "" {
			value = intstr.Parse(raw)
		}
		scaled, err := intstr.GetScaledValueFromIntOrPercent(&value, total, roundUp)
		if err != nil {
			// Malformed annotation (e.g. "abc"): use the default instead of silently
			// treating it as 0. The default is a well-formed percentage, so the
			// error from this second call is always nil and safe to discard.
			fallback := intstr.FromString(defaultValue)
			scaled, _ = intstr.GetScaledValueFromIntOrPercent(&fallback, total, roundUp)
		}
		if scaled < 0 {
			scaled = 0
		}
		return scaled
	}

	surge := resolve(annotations[KubeAnnotationDeploymentRollingUpdateMaxSurge], true)
	unavail := resolve(annotations[KubeAnnotationDeploymentRollingUpdateMaxUnavailable], false)

	// Ensure at least one of surge/unavailable is > 0 to guarantee progress
	if surge == 0 && unavail == 0 {
		surge = 1
	}
	return int32(surge), int32(unavail)
}
// buildRollingUpdateContext creates a RollingUpdateContext.
// It computes the new worker spec hash, compares it with the DGD's current worker hash,
// and, when they differ, pre-calculates old and new worker replica counts per worker service.
//
// When the hashes are equal the returned context carries the hash and empty replica maps.
//
// Replica calculation:
//   - oldReplicas = max(0, desiredReplicas - newReadyReplicas - maxUnavailable)
//   - newReplicas = min(desiredReplicas, desiredReplicas + maxSurge - oldReplicas)
//
// newReadyReplicas is read from the status of the new (hash-named) DCD. If that DCD cannot
// be fetched or has not reported ready replicas, it is treated as 0. Fetch failures other
// than NotFound are logged, because they make the calculation fall back to 0 ready replicas
// even though new workers may actually be running.
func (r *DynamoGraphDeploymentReconciler) buildRollingUpdateContext(
	ctx context.Context,
	dgd *nvidiacomv1alpha1.DynamoGraphDeployment,
) dynamo.RollingUpdateContext {
	logger := log.FromContext(ctx)

	// Compute hashes
	newWorkerHash := dynamo.ComputeDGDWorkersSpecHash(dgd)
	prevWorkerHash := r.getCurrentWorkerHash(dgd)

	if prevWorkerHash == newWorkerHash {
		return dynamo.RollingUpdateContext{
			NewWorkerHash:     newWorkerHash,
			OldWorkerReplicas: make(map[string]int32),
			NewWorkerReplicas: make(map[string]int32),
		}
	}

	// Pre-calculate old and new worker replicas based on new worker readiness
	oldWorkerReplicas := make(map[string]int32)
	newWorkerReplicas := make(map[string]int32)

	for serviceName, spec := range dgd.Spec.Services {
		if spec == nil || !dynamo.IsWorkerComponent(spec.ComponentType) {
			continue
		}

		// Get desired replicas from spec
		desiredReplicas := int32(1)
		if spec.Replicas != nil {
			desiredReplicas = *spec.Replicas
		}

		maxSurge, maxUnavailable := resolveRollingUpdateParams(spec.Annotations, desiredReplicas)

		// Query new DCD to get ready replicas (using hash-based naming)
		newDCDName := dynamo.GetDCDResourceName(dgd, serviceName, newWorkerHash)
		newDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{}
		err := r.Get(ctx, types.NamespacedName{Name: newDCDName, Namespace: dgd.Namespace}, newDCD)

		newReadyReplicas := int32(0)
		switch {
		case err == nil:
			if newDCD.Status.Service != nil && newDCD.Status.Service.ReadyReplicas != nil {
				newReadyReplicas = *newDCD.Status.Service.ReadyReplicas
			}
		case apierrors.IsNotFound(err):
			// The new DCD has not been created yet; zero ready replicas is accurate.
		default:
			// Do not swallow unexpected errors: surface them so a wrong replica
			// calculation caused by a failed read can be diagnosed.
			logger.Error(err, "Failed to get new worker DCD, assuming zero ready replicas",
				"dcdName", newDCDName,
				"service", serviceName)
		}

		// Calculate old replicas: allow scaling down by maxUnavailable
		// oldReplicas = max(0, desiredReplicas - newReadyReplicas - maxUnavailable)
		oldNeeded := desiredReplicas - newReadyReplicas - maxUnavailable
		if oldNeeded < 0 {
			oldNeeded = 0
		}

		// Calculate new replicas: stay within surge budget
		// newReplicas = min(desiredReplicas, desiredReplicas + maxSurge - oldNeeded)
		newNeeded := desiredReplicas + maxSurge - oldNeeded
		if newNeeded > desiredReplicas {
			newNeeded = desiredReplicas
		}
		if newNeeded < 0 {
			newNeeded = 0
		}

		newWorkerReplicas[serviceName] = newNeeded
		oldWorkerReplicas[serviceName] = oldNeeded

		logger.V(1).Info("Calculated worker replicas for rollingUpdate",
			"service", serviceName,
			"desired", desiredReplicas,
			"newReady", newReadyReplicas,
			"maxSurge", maxSurge,
			"maxUnavailable", maxUnavailable,
			"newNeeded", newNeeded,
			"oldNeeded", oldNeeded)
	}

	return dynamo.RollingUpdateContext{
		NewWorkerHash:     newWorkerHash,
		OldWorkerReplicas: oldWorkerReplicas,
		NewWorkerReplicas: newWorkerReplicas,
	}
}
// mergeWorkerServiceStatuses merges old worker service statuses into the existing service statuses.
// For each worker service present in both maps, it aggregates replica counts so that the status
// reflects the total across old and new worker DCDs during a rolling update.
//
// serviceStatuses is updated in place. Services that appear only in oldWorkerStatuses are
// ignored. For merged services, ComponentNames becomes the sorted list of the old component
// names plus the new ComponentName; Replicas, ReadyReplicas and AvailableReplicas are summed.
// oldWorkerStatuses is never modified.
func mergeWorkerServiceStatuses(
	serviceStatuses map[string]nvidiacomv1alpha1.ServiceReplicaStatus,
	oldWorkerStatuses map[string]nvidiacomv1alpha1.ServiceReplicaStatus,
) {
	for serviceName, oldStatus := range oldWorkerStatuses {
		newStatus, exists := serviceStatuses[serviceName]
		if !exists {
			continue
		}

		// Build sorted ComponentNames from old and new DCD names.
		// Copy into a fresh slice first: appending directly to oldStatus.ComponentNames
		// could reuse its backing array, and the in-place sort would then reorder the
		// names held by the caller's oldWorkerStatuses entry.
		componentNames := make([]string, 0, len(oldStatus.ComponentNames)+1)
		componentNames = append(componentNames, oldStatus.ComponentNames...)
		componentNames = append(componentNames, newStatus.ComponentName)
		slices.Sort(componentNames)
		newStatus.ComponentNames = componentNames

		// Aggregate replica counts
		newStatus.Replicas += oldStatus.Replicas
		// UpdatedReplicas stays as-is (only new are "updated")
		newStatus.ReadyReplicas = addOptionalInt32(newStatus.ReadyReplicas, oldStatus.ReadyReplicas)
		newStatus.AvailableReplicas = addOptionalInt32(newStatus.AvailableReplicas, oldStatus.AvailableReplicas)

		// Map values are copies, so store the merged struct back.
		serviceStatuses[serviceName] = newStatus
	}
}
// addOptionalInt32 sums two optional int32 values, treating a nil operand as 0.
// The result is nil only when both operands are nil; otherwise it points to a
// freshly allocated value and never aliases a or b.
func addOptionalInt32(a, b *int32) *int32 {
	switch {
	case a == nil && b == nil:
		return nil
	case a == nil:
		total := *b
		return &total
	case b == nil:
		total := *a
		return &total
	}
	total := *a + *b
	return &total
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"context"
"sort"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo"
)
// Worker hash values shared by the rolling update tests. They are used as the
// value of the worker hash label/annotation on test objects and, in some tests,
// sliced to their first 8 characters to build DCD names.
const (
	// testOldWorkerHash stands for the worker hash of the previous generation.
	testOldWorkerHash = "oldhash1"
	// testNewWorkerHash stands for the worker hash of the target generation.
	testNewWorkerHash = "newhash2"
)
// createTestDGD returns a DynamoGraphDeployment named name in the "default"
// namespace whose spec holds exactly the given services. No annotations, labels
// or status are set.
func createTestDGD(name string, services map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) *nvidiacomv1alpha1.DynamoGraphDeployment {
	dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
	dgd.Name = name
	dgd.Namespace = "default"
	dgd.Spec.Services = services
	return dgd
}
// createTestReconciler creates a DynamoGraphDeploymentReconciler for testing.
// The reconciler is backed by a fake client whose scheme contains the operator's
// v1alpha1 types and core/v1, pre-populated with objs, plus a fake event recorder
// with a buffer of 10 events.
//
// It panics if a type cannot be registered with the scheme: that is a setup bug, and
// failing here is clearer than the "no kind registered" errors it would cause later.
func createTestReconciler(objs ...runtime.Object) *DynamoGraphDeploymentReconciler {
	scheme := runtime.NewScheme()
	if err := nvidiacomv1alpha1.AddToScheme(scheme); err != nil {
		panic(err)
	}
	if err := corev1.AddToScheme(scheme); err != nil {
		panic(err)
	}

	fakeClient := fake.NewClientBuilder().
		WithScheme(scheme).
		WithRuntimeObjects(objs...).
		Build()

	return &DynamoGraphDeploymentReconciler{
		Client:   fakeClient,
		Recorder: record.NewFakeRecorder(10),
	}
}
// TestShouldTriggerRollingUpdate checks shouldTriggerRollingUpdate against the
// current-worker-hash annotation: absent, equal to the hash computed from the
// spec, or an arbitrary different value.
func TestShouldTriggerRollingUpdate(t *testing.T) {
	type serviceMap = map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec

	// computeHash is a sentinel: the annotation is set to the hash computed from
	// the case's own services. An empty existingHash means no annotation at all.
	const computeHash = "compute"

	cases := []struct {
		desc         string
		services     serviceMap
		existingHash string
		want         bool
	}{
		{
			desc: "new deployment - no hash annotation",
			services: serviceMap{
				"worker": {
					ComponentType: consts.ComponentTypeWorker,
					Envs:          []corev1.EnvVar{{Name: "FOO", Value: "bar"}},
				},
			},
			existingHash: "",
			want:         false,
		},
		{
			desc: "hash unchanged - matches current spec",
			services: serviceMap{
				"worker": {
					ComponentType: consts.ComponentTypeWorker,
					Envs:          []corev1.EnvVar{{Name: "FOO", Value: "bar"}},
				},
			},
			existingHash: computeHash,
			want:         false,
		},
		{
			desc: "hash changed - differs from current spec",
			services: serviceMap{
				"worker": {
					ComponentType: consts.ComponentTypeWorker,
					Envs:          []corev1.EnvVar{{Name: "FOO", Value: "new-value"}},
				},
			},
			existingHash: "old-hash-12345678",
			want:         true,
		},
		{
			desc: "frontend-only change - hash unchanged",
			services: serviceMap{
				"frontend": {
					ComponentType: consts.ComponentTypeFrontend,
					Envs:          []corev1.EnvVar{{Name: "FRONTEND_VAR", Value: "changed"}},
				},
				"worker": {
					ComponentType: consts.ComponentTypeWorker,
					Envs:          []corev1.EnvVar{{Name: "WORKER_VAR", Value: "unchanged"}},
				},
			},
			existingHash: computeHash,
			want:         false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			dgd := createTestDGD("test-dgd", tc.services)

			switch tc.existingHash {
			case "":
				// Leave the DGD without a hash annotation.
			case computeHash:
				dgd.Annotations = map[string]string{
					consts.AnnotationCurrentWorkerHash: dynamo.ComputeDGDWorkersSpecHash(dgd),
				}
			default:
				dgd.Annotations = map[string]string{
					consts.AnnotationCurrentWorkerHash: tc.existingHash,
				}
			}

			got := createTestReconciler(dgd).shouldTriggerRollingUpdate(dgd)
			if got != tc.want {
				t.Errorf("shouldTriggerRollingUpdate() = %v, expected %v", got, tc.want)
			}
		})
	}
}
func TestInitializeWorkerHashIfNeeded_FirstDeploy(t *testing.T) {
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
Envs: []corev1.EnvVar{
{Name: "FOO", Value: "bar"},
},
},
})
// Create reconciler with DGD already in the fake client (simulates existing resource)
r := createTestReconciler(dgd)
ctx := context.Background()
// Initialize the hash
err := r.initializeWorkerHashIfNeeded(ctx, dgd)
require.NoError(t, err)
// Verify the hash was set
hash := r.getCurrentWorkerHash(dgd)
assert.NotEmpty(t, hash, "Hash should be set after initialization")
// Verify the hash is correct
expectedHash := dynamo.ComputeDGDWorkersSpecHash(dgd)
assert.Equal(t, expectedHash, hash, "Hash should match computed value")
}
// TestInitializeWorkerHashIfNeeded_AlreadyInitialized verifies that an existing
// worker hash annotation is left untouched by initializeWorkerHashIfNeeded.
func TestInitializeWorkerHashIfNeeded_AlreadyInitialized(t *testing.T) {
	const presetHash = "existing-hash"

	dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {
			ComponentType: consts.ComponentTypeWorker,
			Envs: []corev1.EnvVar{
				{Name: "FOO", Value: "bar"},
			},
		},
	})
	dgd.Annotations = map[string]string{consts.AnnotationCurrentWorkerHash: presetHash}

	// The DGD is registered with the fake client.
	reconciler := createTestReconciler(dgd)

	// Initialization is expected to be a no-op here.
	require.NoError(t, reconciler.initializeWorkerHashIfNeeded(context.Background(), dgd))

	got := reconciler.getCurrentWorkerHash(dgd)
	assert.Equal(t, presetHash, got, "Hash should not change when already initialized")
}
// TestSupportsManagedRollingUpdate verifies that supportsManagedRollingUpdate
// reports true for a plain single-node worker and false for a worker with a
// multinode spec.
func TestSupportsManagedRollingUpdate(t *testing.T) {
	tests := []struct {
		name     string
		services map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
		expected bool
	}{
		{
			name: "standard single-node deployment",
			services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				"worker": {ComponentType: consts.ComponentTypeWorker},
			},
			expected: true,
		},
		{
			name: "multinode deployment",
			services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				"worker": {
					ComponentType: consts.ComponentTypeWorker,
					Multinode:     &nvidiacomv1alpha1.MultinodeSpec{NodeCount: 4},
				},
			},
			expected: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			dgd := createTestDGD("test-dgd", tt.services)
			r := createTestReconciler(dgd)

			result := r.supportsManagedRollingUpdate(dgd)
			if result != tt.expected {
				// Name the function actually under test so a failure points at the right code.
				t.Errorf("supportsManagedRollingUpdate() = %v, expected %v", result, tt.expected)
			}
		})
	}
}
func TestWorkerHashChanges_OnlyWhenWorkerSpecChanges(t *testing.T) {
// Test that hash only changes when worker specs change, not frontend specs
workerEnvs := []corev1.EnvVar{{Name: "WORKER_VAR", Value: "value1"}}
frontendEnvs := []corev1.EnvVar{{Name: "FRONTEND_VAR", Value: "value1"}}
dgd1 := createTestDGD("test", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {ComponentType: consts.ComponentTypeWorker, Envs: workerEnvs},
"frontend": {ComponentType: consts.ComponentTypeFrontend, Envs: frontendEnvs},
})
hash1 := dynamo.ComputeDGDWorkersSpecHash(dgd1)
// Change only frontend envs
dgd2 := createTestDGD("test", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {ComponentType: consts.ComponentTypeWorker, Envs: workerEnvs},
"frontend": {ComponentType: consts.ComponentTypeFrontend, Envs: []corev1.EnvVar{{Name: "FRONTEND_VAR", Value: "changed"}}},
})
hash2 := dynamo.ComputeDGDWorkersSpecHash(dgd2)
assert.Equal(t, hash1, hash2, "Hash should not change when only frontend changes")
// Change worker envs
dgd3 := createTestDGD("test", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {ComponentType: consts.ComponentTypeWorker, Envs: []corev1.EnvVar{{Name: "WORKER_VAR", Value: "changed"}}},
"frontend": {ComponentType: consts.ComponentTypeFrontend, Envs: frontendEnvs},
})
hash3 := dynamo.ComputeDGDWorkersSpecHash(dgd3)
assert.NotEqual(t, hash1, hash3, "Hash should change when worker specs change")
}
// TestWorkerHashChanges_PrefillAndDecode verifies that prefill and decode
// services feed into the worker spec hash: a change to either one changes it.
func TestWorkerHashChanges_PrefillAndDecode(t *testing.T) {
	// hashFor builds a DGD with one prefill and one decode service, each with a
	// single env var "VAR" set to the given value, and returns its worker spec hash.
	hashFor := func(prefillValue, decodeValue string) string {
		dgd := createTestDGD("test", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"prefill": {
				ComponentType: consts.ComponentTypePrefill,
				Envs:          []corev1.EnvVar{{Name: "VAR", Value: prefillValue}},
			},
			"decode": {
				ComponentType: consts.ComponentTypeDecode,
				Envs:          []corev1.EnvVar{{Name: "VAR", Value: decodeValue}},
			},
		})
		return dynamo.ComputeDGDWorkersSpecHash(dgd)
	}

	baseline := hashFor("v1", "v1")
	assert.NotEmpty(t, baseline, "Hash should be computed for prefill/decode")

	// Change prefill spec
	prefillChanged := hashFor("v2", "v1")
	assert.NotEqual(t, baseline, prefillChanged, "Hash should change when prefill specs change")

	// Change decode spec
	decodeChanged := hashFor("v1", "v2")
	assert.NotEqual(t, baseline, decodeChanged, "Hash should change when decode specs change")
}
// TestGetOrCreateRollingUpdateStatus verifies that getOrCreateRollingUpdateStatus
// always returns a non-nil status: with phase None when the DGD has none, and
// with the stored phase otherwise.
func TestGetOrCreateRollingUpdateStatus(t *testing.T) {
	cases := []struct {
		desc      string
		preset    *nvidiacomv1alpha1.RollingUpdateStatus
		wantPhase nvidiacomv1alpha1.RollingUpdatePhase
	}{
		{
			desc:      "creates new status when nil",
			preset:    nil,
			wantPhase: nvidiacomv1alpha1.RollingUpdatePhaseNone,
		},
		{
			desc: "returns existing status",
			preset: &nvidiacomv1alpha1.RollingUpdateStatus{
				Phase: nvidiacomv1alpha1.RollingUpdatePhaseInProgress,
			},
			wantPhase: nvidiacomv1alpha1.RollingUpdatePhaseInProgress,
		},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				"worker": {ComponentType: consts.ComponentTypeWorker},
			}
			dgd := createTestDGD("test-dgd", services)
			dgd.Status.RollingUpdate = tc.preset

			got := createTestReconciler(dgd).getOrCreateRollingUpdateStatus(dgd)

			assert.NotNil(t, got)
			assert.Equal(t, tc.wantPhase, got.Phase)
		})
	}
}
// TestIsRollingUpdateInProgress verifies that isRollingUpdateInProgress is true
// for the Pending and InProgress phases, and false for a nil status and for the
// None and Completed phases.
func TestIsRollingUpdateInProgress(t *testing.T) {
	// withPhase builds a rolling update status that carries only the given phase.
	withPhase := func(phase nvidiacomv1alpha1.RollingUpdatePhase) *nvidiacomv1alpha1.RollingUpdateStatus {
		return &nvidiacomv1alpha1.RollingUpdateStatus{Phase: phase}
	}

	cases := []struct {
		desc   string
		status *nvidiacomv1alpha1.RollingUpdateStatus
		want   bool
	}{
		{desc: "nil status - not in progress", status: nil, want: false},
		{desc: "phase none - not in progress", status: withPhase(nvidiacomv1alpha1.RollingUpdatePhaseNone), want: false},
		{desc: "phase pending - in progress", status: withPhase(nvidiacomv1alpha1.RollingUpdatePhasePending), want: true},
		{desc: "phase in progress - in progress", status: withPhase(nvidiacomv1alpha1.RollingUpdatePhaseInProgress), want: true},
		{desc: "phase completed - not in progress", status: withPhase(nvidiacomv1alpha1.RollingUpdatePhaseCompleted), want: false},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				"worker": {ComponentType: consts.ComponentTypeWorker},
			}
			dgd := createTestDGD("test-dgd", services)
			dgd.Status.RollingUpdate = tc.status

			got := createTestReconciler(dgd).isRollingUpdateInProgress(dgd)
			assert.Equal(t, tc.want, got)
		})
	}
}
// TestGetDesiredWorkerReplicas verifies the total computed by
// getDesiredWorkerReplicas: explicit counts are summed over worker, prefill and
// decode services, a missing count is taken as 1, and frontends are not counted.
func TestGetDesiredWorkerReplicas(t *testing.T) {
	type serviceMap = map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec

	cases := []struct {
		desc     string
		services serviceMap
		want     int32
	}{
		{
			desc: "single worker with replicas",
			services: serviceMap{
				"worker": {
					ComponentType: consts.ComponentTypeWorker,
					Replicas:      ptr.To(int32(3)),
				},
			},
			want: 3,
		},
		{
			desc: "single worker without replicas defaults to 1",
			services: serviceMap{
				"worker": {
					ComponentType: consts.ComponentTypeWorker,
				},
			},
			want: 1,
		},
		{
			desc: "multiple workers",
			services: serviceMap{
				"prefill": {
					ComponentType: consts.ComponentTypePrefill,
					Replicas:      ptr.To(int32(2)),
				},
				"decode": {
					ComponentType: consts.ComponentTypeDecode,
					Replicas:      ptr.To(int32(4)),
				},
			},
			want: 6,
		},
		{
			desc: "workers and frontend - only counts workers",
			services: serviceMap{
				"frontend": {
					ComponentType: consts.ComponentTypeFrontend,
					Replicas:      ptr.To(int32(2)),
				},
				"worker": {
					ComponentType: consts.ComponentTypeWorker,
					Replicas:      ptr.To(int32(3)),
				},
			},
			want: 3,
		},
		{
			desc:     "no workers",
			services: serviceMap{},
			want:     0,
		},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			dgd := createTestDGD("test-dgd", tc.services)
			got := createTestReconciler(dgd).getDesiredWorkerReplicas(dgd)
			assert.Equal(t, tc.want, got)
		})
	}
}
// TestDeleteOldWorkerDCDs verifies that deleteOldWorkerDCDs removes the worker
// DCD labelled with the old worker hash and keeps the one labelled with the new hash.
func TestDeleteOldWorkerDCDs(t *testing.T) {
	newWorkerHash := testNewWorkerHash

	dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {ComponentType: consts.ComponentTypeWorker},
	})

	// Create DCD with old worker hash
	oldDCD1 := &nvidiacomv1alpha1.DynamoComponentDeployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-dgd-worker-oldhash1",
			Namespace: "default",
			Labels: map[string]string{
				consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
				consts.KubeLabelDynamoWorkerHash:          testOldWorkerHash,
			},
		},
		Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
			DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				ComponentType: consts.ComponentTypeWorker,
			},
		},
	}

	// Create DCD with new worker hash (should not be deleted)
	newDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-dgd-worker-newhash2",
			Namespace: "default",
			Labels: map[string]string{
				consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
				consts.KubeLabelDynamoWorkerHash:          newWorkerHash,
			},
		},
		Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
			DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				ComponentType: consts.ComponentTypeWorker,
			},
		},
	}

	r := createTestReconciler(dgd, oldDCD1, newDCD)
	ctx := context.Background()

	// Delete old worker DCDs
	err := r.deleteOldWorkerDCDs(ctx, dgd, newWorkerHash)
	require.NoError(t, err)

	// Verify old DCD is deleted
	dcdList := &nvidiacomv1alpha1.DynamoComponentDeploymentList{}
	err = r.List(ctx, dcdList)
	require.NoError(t, err)

	// Should only have the new DCD remaining. require (not assert) stops the test
	// here on a wrong length, so the Items[0] access below cannot panic with an
	// index-out-of-range and hide the real failure.
	require.Len(t, dcdList.Items, 1)
	assert.Equal(t, "test-dgd-worker-newhash2", dcdList.Items[0].Name)
}
func TestDeleteOldWorkerDCDs_NoDCDsToDelete(t *testing.T) {
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {ComponentType: consts.ComponentTypeWorker},
})
r := createTestReconciler(dgd)
ctx := context.Background()
// Delete old worker DCDs when there are none - should not error
err := r.deleteOldWorkerDCDs(ctx, dgd, "somehash")
require.NoError(t, err)
}
// createTestReconcilerWithStatus creates a reconciler with status subresource support.
// The fake client is pre-populated with dgd followed by objs, and the status
// subresource is enabled for DynamoGraphDeployment. The reconciler also gets a fake
// event recorder with a buffer of 10 events.
//
// It panics if a type cannot be registered with the scheme: that is a setup bug, and
// failing here is clearer than the "no kind registered" errors it would cause later.
func createTestReconcilerWithStatus(dgd *nvidiacomv1alpha1.DynamoGraphDeployment, objs ...runtime.Object) *DynamoGraphDeploymentReconciler {
	scheme := runtime.NewScheme()
	if err := nvidiacomv1alpha1.AddToScheme(scheme); err != nil {
		panic(err)
	}
	if err := corev1.AddToScheme(scheme); err != nil {
		panic(err)
	}

	allObjs := append([]runtime.Object{dgd}, objs...)

	fakeClient := fake.NewClientBuilder().
		WithScheme(scheme).
		WithRuntimeObjects(allObjs...).
		WithStatusSubresource(&nvidiacomv1alpha1.DynamoGraphDeployment{}).
		Build()

	return &DynamoGraphDeploymentReconciler{
		Client:   fakeClient,
		Recorder: record.NewFakeRecorder(10),
	}
}
// TestContinueRollingUpdate_UpdatedServicesPartialCompletion runs
// continueRollingUpdate on a DGD with a prefill service (2 replicas) and a decode
// service (3 replicas). The new prefill DCD reports 2 ready replicas and has no old
// DCD; the new decode DCD reports 1 ready replica while an old decode DCD still
// reports 2. The test expects only "prefill" in UpdatedServices and the phase to
// stay InProgress.
func TestContinueRollingUpdate_UpdatedServicesPartialCompletion(t *testing.T) {
	oldWorkerHash := testOldWorkerHash
	newWorkerHash := testNewWorkerHash

	// buildDCD returns a DCD for the given service and worker hash, named
	// "test-dgd-<service>-<first 8 chars of hash>", with the given spec replica
	// count and reported ready replicas. The component type is set by the caller.
	buildDCD := func(service, hash string, specReplicas, readyReplicas int32) *nvidiacomv1alpha1.DynamoComponentDeployment {
		dcd := &nvidiacomv1alpha1.DynamoComponentDeployment{}
		dcd.Name = "test-dgd-" + service + "-" + hash[:8]
		dcd.Namespace = "default"
		dcd.Labels = map[string]string{
			consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
			consts.KubeLabelDynamoWorkerHash:          hash,
		}
		dcd.Spec.ServiceName = service
		dcd.Spec.Replicas = ptr.To(specReplicas)
		dcd.Status.Service = &nvidiacomv1alpha1.ServiceReplicaStatus{
			ReadyReplicas: ptr.To(readyReplicas),
		}
		return dcd
	}

	dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"prefill": {
			ComponentType: consts.ComponentTypePrefill,
			Replicas:      ptr.To(int32(2)),
		},
		"decode": {
			ComponentType: consts.ComponentTypeDecode,
			Replicas:      ptr.To(int32(3)),
		},
	})
	dgd.Annotations = map[string]string{
		consts.AnnotationCurrentWorkerHash: oldWorkerHash,
	}
	dgd.Status.RollingUpdate = &nvidiacomv1alpha1.RollingUpdateStatus{
		Phase: nvidiacomv1alpha1.RollingUpdatePhaseInProgress,
	}

	// New DCDs: prefill fully ready, decode not ready yet
	newPrefillDCD := buildDCD("prefill", newWorkerHash, 2, 2)
	newPrefillDCD.Spec.ComponentType = consts.ComponentTypePrefill

	newDecodeDCD := buildDCD("decode", newWorkerHash, 3, 1) // Not yet fully ready
	newDecodeDCD.Spec.ComponentType = consts.ComponentTypeDecode

	// Old DCDs: prefill gone, decode still has replicas
	oldDecodeDCD := buildDCD("decode", oldWorkerHash, 3, 2) // Still has old replicas
	oldDecodeDCD.Spec.ComponentType = consts.ComponentTypeDecode

	reconciler := createTestReconcilerWithStatus(dgd, newPrefillDCD, newDecodeDCD, oldDecodeDCD)

	status := dgd.Status.RollingUpdate
	require.NoError(t, reconciler.continueRollingUpdate(context.Background(), dgd, status, newWorkerHash))

	// Prefill is updated (new ready >= desired, old gone), decode is not
	assert.Equal(t, []string{"prefill"}, status.UpdatedServices)

	// Rolling update should remain in progress since not all services are updated
	assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhaseInProgress, status.Phase)
}
// TestContinueRollingUpdate_AggregateReadyButPerServiceNot covers a rollout in
// which the summed ready count of the new worker DCDs matches the summed
// desired count while one individual service is still short. The test expects
// only the satisfied service to be listed as updated and the phase to remain
// InProgress.
func TestContinueRollingUpdate_AggregateReadyButPerServiceNot(t *testing.T) {
	previousHash, targetHash := testOldWorkerHash, testNewWorkerHash

	// targetDCD builds a DCD labelled with the target worker hash whose status
	// reports the requested number of ready replicas.
	targetDCD := func(name string, shared nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec, ready int32) *nvidiacomv1alpha1.DynamoComponentDeployment {
		return &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      name,
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          targetHash,
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: shared,
			},
			Status: nvidiacomv1alpha1.DynamoComponentDeploymentStatus{
				Service: &nvidiacomv1alpha1.ServiceReplicaStatus{
					ReadyReplicas: ptr.To(ready),
				},
			},
		}
	}

	graph := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"prefill": {
			ComponentType: consts.ComponentTypePrefill,
			Replicas:      ptr.To(int32(2)),
		},
		"decode": {
			ComponentType: consts.ComponentTypeDecode,
			Replicas:      ptr.To(int32(3)),
		},
	})
	graph.Annotations = map[string]string{consts.AnnotationCurrentWorkerHash: previousHash}
	graph.Status.RollingUpdate = &nvidiacomv1alpha1.RollingUpdateStatus{
		Phase: nvidiacomv1alpha1.RollingUpdatePhaseInProgress,
	}

	// Prefill over-reports (5 ready for 2 desired) while decode reports none, so
	// the totals balance (5 ready vs 5 desired) although decode has 0 of 3.
	prefillDCD := targetDCD("test-dgd-prefill-"+targetHash[:8], nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		ComponentType: consts.ComponentTypePrefill,
		ServiceName:   "prefill",
		Replicas:      ptr.To(int32(2)),
	}, 5)
	decodeDCD := targetDCD("test-dgd-decode-"+targetHash[:8], nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		ComponentType: consts.ComponentTypeDecode,
		ServiceName:   "decode",
		Replicas:      ptr.To(int32(3)),
	}, 0)

	// No DCD carrying the previous hash is registered with the reconciler.
	reconciler := createTestReconcilerWithStatus(graph, prefillDCD, decodeDCD)
	progress := graph.Status.RollingUpdate

	require.NoError(t, reconciler.continueRollingUpdate(context.Background(), graph, progress, targetHash))

	// Prefill alone counts as updated; decode keeps the rollout open.
	assert.Equal(t, []string{"prefill"}, progress.UpdatedServices)
	assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhaseInProgress, progress.Phase)
}
func TestStartRollingUpdate_UpdatedServicesInitializedToNil(t *testing.T) {
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
Replicas: ptr.To(int32(2)),
},
})
dgd.Annotations = map[string]string{
consts.AnnotationCurrentWorkerHash: testOldWorkerHash,
}
// Simulate a previous rolling update that had UpdatedServices populated
dgd.Status.RollingUpdate = &nvidiacomv1alpha1.RollingUpdateStatus{
Phase: nvidiacomv1alpha1.RollingUpdatePhaseNone,
UpdatedServices: []string{"worker"},
}
r := createTestReconcilerWithStatus(dgd)
ctx := context.Background()
rollingUpdateStatus := dgd.Status.RollingUpdate
err := r.startRollingUpdate(ctx, dgd, rollingUpdateStatus, testNewWorkerHash)
require.NoError(t, err)
assert.Nil(t, rollingUpdateStatus.UpdatedServices)
assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhasePending, rollingUpdateStatus.Phase)
}
// TestCompleteRollingUpdate_UpdatedServicesContainsAllWorkers checks that
// completing a rolling update records the prefill and decode services (in the
// order "decode", "prefill") but not the frontend, moves the phase to
// Completed, and sets an end time.
func TestCompleteRollingUpdate_UpdatedServicesContainsAllWorkers(t *testing.T) {
	previousHash, targetHash := testOldWorkerHash, testNewWorkerHash

	services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"frontend": {
			ComponentType: consts.ComponentTypeFrontend,
			Replicas:      ptr.To(int32(1)),
		},
		"prefill": {
			ComponentType: consts.ComponentTypePrefill,
			Replicas:      ptr.To(int32(2)),
		},
		"decode": {
			ComponentType: consts.ComponentTypeDecode,
			Replicas:      ptr.To(int32(3)),
		},
	}
	graph := createTestDGD("test-dgd", services)
	graph.Annotations = map[string]string{consts.AnnotationCurrentWorkerHash: previousHash}
	graph.Status.RollingUpdate = &nvidiacomv1alpha1.RollingUpdateStatus{
		Phase: nvidiacomv1alpha1.RollingUpdatePhaseInProgress,
	}

	reconciler := createTestReconcilerWithStatus(graph)
	progress := graph.Status.RollingUpdate

	require.NoError(t, reconciler.completeRollingUpdate(context.Background(), graph, progress, targetHash))

	// Only the two worker-type services are expected; the frontend is left out.
	assert.Equal(t, []string{"decode", "prefill"}, progress.UpdatedServices)
	assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhaseCompleted, progress.Phase)
	assert.NotNil(t, progress.EndTime)
}
// TestContinueRollingUpdate_AllServicesUpdated covers the terminal step of a
// rollout: every new worker DCD reports as many ready replicas as it desires
// and no DCD with the previous hash is present. The test expects the phase to
// become Completed with both services listed as updated.
func TestContinueRollingUpdate_AllServicesUpdated(t *testing.T) {
	previousHash, targetHash := testOldWorkerHash, testNewWorkerHash

	// readyDCD builds a DCD labelled with the target worker hash whose status
	// reports the requested number of ready replicas.
	readyDCD := func(name string, shared nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec, ready int32) *nvidiacomv1alpha1.DynamoComponentDeployment {
		return &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      name,
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          targetHash,
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: shared,
			},
			Status: nvidiacomv1alpha1.DynamoComponentDeploymentStatus{
				Service: &nvidiacomv1alpha1.ServiceReplicaStatus{
					ReadyReplicas: ptr.To(ready),
				},
			},
		}
	}

	graph := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"prefill": {
			ComponentType: consts.ComponentTypePrefill,
			Replicas:      ptr.To(int32(2)),
		},
		"decode": {
			ComponentType: consts.ComponentTypeDecode,
			Replicas:      ptr.To(int32(3)),
		},
	})
	graph.Annotations = map[string]string{consts.AnnotationCurrentWorkerHash: previousHash}
	graph.Status.RollingUpdate = &nvidiacomv1alpha1.RollingUpdateStatus{
		Phase: nvidiacomv1alpha1.RollingUpdatePhaseInProgress,
	}

	// Ready counts equal the desired counts for both services (2/2 and 3/3).
	prefillDCD := readyDCD("test-dgd-prefill-"+targetHash[:8], nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		ComponentType: consts.ComponentTypePrefill,
		ServiceName:   "prefill",
		Replicas:      ptr.To(int32(2)),
	}, 2)
	decodeDCD := readyDCD("test-dgd-decode-"+targetHash[:8], nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		ComponentType: consts.ComponentTypeDecode,
		ServiceName:   "decode",
		Replicas:      ptr.To(int32(3)),
	}, 3)

	// Only the new DCDs are registered; nothing with the previous hash exists.
	reconciler := createTestReconcilerWithStatus(graph, prefillDCD, decodeDCD)
	progress := graph.Status.RollingUpdate

	require.NoError(t, reconciler.continueRollingUpdate(context.Background(), graph, progress, targetHash))

	assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhaseCompleted, progress.Phase)
	assert.Equal(t, []string{"decode", "prefill"}, progress.UpdatedServices)
}
func TestGetWorkerInfoForWorkerHash(t *testing.T) {
workerHash := "hash1234"
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"prefill": {ComponentType: consts.ComponentTypePrefill},
"decode": {ComponentType: consts.ComponentTypeDecode},
})
// Create DCDs for prefill and decode with different ready counts
prefillDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-prefill-hash1234",
Namespace: "default",
Labels: map[string]string{
consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
consts.KubeLabelDynamoWorkerHash: workerHash,
},
},
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: consts.ComponentTypePrefill,
ServiceName: "prefill",
Replicas: ptr.To(int32(2)),
},
},
Status: nvidiacomv1alpha1.DynamoComponentDeploymentStatus{
Service: &nvidiacomv1alpha1.ServiceReplicaStatus{
ReadyReplicas: ptr.To(int32(2)),
},
},
}
decodeDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-decode-hash1234",
Namespace: "default",
Labels: map[string]string{
consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
consts.KubeLabelDynamoWorkerHash: workerHash,
},
},
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: consts.ComponentTypeDecode,
ServiceName: "decode",
Replicas: ptr.To(int32(3)),
},
},
Status: nvidiacomv1alpha1.DynamoComponentDeploymentStatus{
Service: &nvidiacomv1alpha1.ServiceReplicaStatus{
ReadyReplicas: ptr.To(int32(1)),
},
},
}
r := createTestReconciler(dgd, prefillDCD, decodeDCD)
ctx := context.Background()
status, err := r.getWorkerInfoForWorkerHash(ctx, dgd, workerHash)
require.NoError(t, err)
assert.Len(t, status.services, 2)
assert.Equal(t, int32(2), status.services[consts.ComponentTypePrefill].readyReplicas)
assert.Equal(t, int32(1), status.services[consts.ComponentTypeDecode].readyReplicas)
assert.Equal(t, int32(3), status.totalReadyWorkers) // 2 + 1
}
// TestMergeWorkerServiceStatuses is a table-driven test for
// mergeWorkerServiceStatuses. Each case passes a map of current statuses and a
// map of old-worker statuses, then compares the current map (which the call is
// expected to modify in place) against the wanted result.
func TestMergeWorkerServiceStatuses(t *testing.T) {
	type statusMap = map[string]nvidiacomv1alpha1.ServiceReplicaStatus

	const (
		deploymentKind = "Deployment"
		newPrefillName = "dgd-prefill-newhash1"
		oldPrefillName = "dgd-prefill-oldhash1"
	)

	// Every case owns its own map and slice literals, so nothing is shared
	// between cases.
	cases := []struct {
		name     string
		current  statusMap
		previous statusMap
		want     statusMap
	}{
		{
			name: "merges old and new for a single worker service",
			current: statusMap{
				"prefill": {
					ComponentKind:     deploymentKind,
					ComponentName:     newPrefillName,
					Replicas:          2,
					UpdatedReplicas:   2,
					ReadyReplicas:     ptr.To(int32(2)),
					AvailableReplicas: ptr.To(int32(2)),
				},
			},
			previous: statusMap{
				"prefill": {
					ComponentKind:     deploymentKind,
					ComponentName:     oldPrefillName,
					ComponentNames:    []string{oldPrefillName},
					Replicas:          1,
					UpdatedReplicas:   0,
					ReadyReplicas:     ptr.To(int32(1)),
					AvailableReplicas: ptr.To(int32(1)),
				},
			},
			want: statusMap{
				"prefill": {
					ComponentKind:     deploymentKind,
					ComponentName:     newPrefillName,
					ComponentNames:    []string{newPrefillName, oldPrefillName},
					Replicas:          3,
					UpdatedReplicas:   2, // only the new replicas count as updated
					ReadyReplicas:     ptr.To(int32(3)),
					AvailableReplicas: ptr.To(int32(3)),
				},
			},
		},
		{
			name: "no old statuses - no-op",
			current: statusMap{
				"prefill": {
					ComponentKind: deploymentKind,
					ComponentName: newPrefillName,
					Replicas:      2,
					ReadyReplicas: ptr.To(int32(2)),
				},
			},
			previous: statusMap{},
			want: statusMap{
				"prefill": {
					ComponentKind: deploymentKind,
					ComponentName: newPrefillName,
					Replicas:      2,
					ReadyReplicas: ptr.To(int32(2)),
				},
			},
		},
		{
			name:    "old exists but new doesn't yet",
			current: statusMap{},
			previous: statusMap{
				"prefill": {
					ComponentKind: deploymentKind,
					ComponentName: oldPrefillName,
					Replicas:      2,
					ReadyReplicas: ptr.To(int32(2)),
				},
			},
			want: statusMap{},
		},
		{
			name: "handles nil ReadyReplicas and AvailableReplicas on old",
			current: statusMap{
				"prefill": {
					ComponentKind:     deploymentKind,
					ComponentName:     newPrefillName,
					Replicas:          2,
					ReadyReplicas:     ptr.To(int32(2)),
					AvailableReplicas: ptr.To(int32(1)),
				},
			},
			previous: statusMap{
				"prefill": {
					ComponentKind:     deploymentKind,
					ComponentName:     oldPrefillName,
					ComponentNames:    []string{oldPrefillName},
					Replicas:          1,
					ReadyReplicas:     nil,
					AvailableReplicas: nil,
				},
			},
			want: statusMap{
				"prefill": {
					ComponentKind:     deploymentKind,
					ComponentName:     newPrefillName,
					ComponentNames:    []string{newPrefillName, oldPrefillName},
					Replicas:          3,
					ReadyReplicas:     ptr.To(int32(2)),
					AvailableReplicas: ptr.To(int32(1)),
				},
			},
		},
		{
			name: "frontend status untouched by merge",
			current: statusMap{
				"frontend": {
					ComponentKind: deploymentKind,
					ComponentName: "dgd-frontend",
					Replicas:      1,
					ReadyReplicas: ptr.To(int32(1)),
				},
				"prefill": {
					ComponentKind: deploymentKind,
					ComponentName: newPrefillName,
					Replicas:      2,
					ReadyReplicas: ptr.To(int32(2)),
				},
			},
			previous: statusMap{
				"prefill": {
					ComponentKind:  deploymentKind,
					ComponentName:  oldPrefillName,
					ComponentNames: []string{oldPrefillName},
					Replicas:       1,
					ReadyReplicas:  ptr.To(int32(1)),
				},
			},
			want: statusMap{
				"frontend": {
					ComponentKind: deploymentKind,
					ComponentName: "dgd-frontend",
					Replicas:      1,
					ReadyReplicas: ptr.To(int32(1)),
				},
				"prefill": {
					ComponentKind:  deploymentKind,
					ComponentName:  newPrefillName,
					ComponentNames: []string{newPrefillName, oldPrefillName},
					Replicas:       3,
					ReadyReplicas:  ptr.To(int32(3)),
				},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			mergeWorkerServiceStatuses(tc.current, tc.previous)
			assert.Equal(t, tc.want, tc.current)
		})
	}
}
func TestAggregateOldWorkerServiceStatuses(t *testing.T) {
t.Run("old DCD exists with status", func(t *testing.T) {
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"prefill": {
ComponentType: consts.ComponentTypePrefill,
Replicas: ptr.To(int32(2)),
},
})
oldDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-prefill-oldhash1",
Namespace: "default",
Labels: map[string]string{
consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
consts.KubeLabelDynamoWorkerHash: testOldWorkerHash,
},
},
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: consts.ComponentTypePrefill,
ServiceName: "prefill",
Replicas: ptr.To(int32(1)),
},
},
Status: nvidiacomv1alpha1.DynamoComponentDeploymentStatus{
Service: &nvidiacomv1alpha1.ServiceReplicaStatus{
ComponentKind: "Deployment",
ComponentName: "test-dgd-prefill-oldhash1",
Replicas: 1,
UpdatedReplicas: 0,
ReadyReplicas: ptr.To(int32(1)),
},
},
}
r := createTestReconciler(dgd, oldDCD)
ctx := context.Background()
rollingUpdateCtx := dynamo.RollingUpdateContext{
NewWorkerHash: testNewWorkerHash,
OldWorkerReplicas: map[string]int32{"prefill": 1},
NewWorkerReplicas: map[string]int32{"prefill": 2},
}
statuses, err := r.aggregateOldWorkerServiceStatuses(ctx, dgd, rollingUpdateCtx)
require.NoError(t, err)
assert.Len(t, statuses, 1)
assert.Equal(t, "test-dgd-prefill-oldhash1", statuses["prefill"].ComponentName)
assert.Equal(t, []string{"test-dgd-prefill-oldhash1"}, statuses["prefill"].ComponentNames)
assert.Equal(t, int32(1), statuses["prefill"].Replicas)
assert.Equal(t, ptr.To(int32(1)), statuses["prefill"].ReadyReplicas)
})
t.Run("old DCD not found - skips gracefully", func(t *testing.T) {
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"prefill": {
ComponentType: consts.ComponentTypePrefill,
Replicas: ptr.To(int32(2)),
},
})
r := createTestReconciler(dgd)
ctx := context.Background()
rollingUpdateCtx := dynamo.RollingUpdateContext{
NewWorkerHash: testNewWorkerHash,
OldWorkerReplicas: map[string]int32{"prefill": 1},
NewWorkerReplicas: map[string]int32{"prefill": 2},
}
statuses, err := r.aggregateOldWorkerServiceStatuses(ctx, dgd, rollingUpdateCtx)
require.NoError(t, err)
assert.Empty(t, statuses)
})
}
func TestGetExistingRestartAnnotationsDCD(t *testing.T) {
t.Run("worker DCD with hash suffix - finds annotation", func(t *testing.T) {
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"frontend": {
ComponentType: consts.ComponentTypeFrontend,
},
"worker": {
ComponentType: consts.ComponentTypeWorker,
},
})
// Annotation hash can differ from computed hash — function uses computed hash
computedHash := dynamo.ComputeDGDWorkersSpecHash(dgd)
dgd.Annotations = map[string]string{
consts.AnnotationCurrentWorkerHash: "oldhash",
}
frontendDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-frontend",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{
consts.RestartAnnotation: "2025-01-01T00:00:00Z",
},
},
},
}
workerDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-worker-" + computedHash,
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{
consts.RestartAnnotation: "2025-01-01T00:00:00Z",
},
},
},
}
r := createTestReconciler(dgd, frontendDCD, workerDCD)
ctx := context.Background()
annotations, err := r.getExistingRestartAnnotationsDCD(ctx, dgd)
require.NoError(t, err)
assert.Equal(t, "2025-01-01T00:00:00Z", annotations["frontend"])
assert.Equal(t, "2025-01-01T00:00:00Z", annotations["worker"])
})
t.Run("worker DCD not found during rolling update - gracefully skips", func(t *testing.T) {
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"frontend": {
ComponentType: consts.ComponentTypeFrontend,
},
"worker": {
ComponentType: consts.ComponentTypeWorker,
},
})
dgd.Annotations = map[string]string{
consts.AnnotationCurrentWorkerHash: testOldWorkerHash,
}
frontendDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-frontend",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{
consts.RestartAnnotation: "2025-01-01T00:00:00Z",
},
},
},
}
r := createTestReconciler(dgd, frontendDCD)
ctx := context.Background()
annotations, err := r.getExistingRestartAnnotationsDCD(ctx, dgd)
require.NoError(t, err)
assert.Equal(t, "2025-01-01T00:00:00Z", annotations["frontend"])
_, hasWorker := annotations["worker"]
assert.False(t, hasWorker, "worker annotation should not be present when DCD doesn't exist")
})
t.Run("non-worker without hash suffix - found normally", func(t *testing.T) {
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"frontend": {
ComponentType: consts.ComponentTypeFrontend,
},
})
dgd.Annotations = map[string]string{
consts.AnnotationCurrentWorkerHash: testOldWorkerHash,
}
frontendDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-frontend",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{
consts.RestartAnnotation: "2025-01-01T00:00:00Z",
},
},
},
}
r := createTestReconciler(dgd, frontendDCD)
ctx := context.Background()
annotations, err := r.getExistingRestartAnnotationsDCD(ctx, dgd)
require.NoError(t, err)
assert.Equal(t, "2025-01-01T00:00:00Z", annotations["frontend"])
})
}
func TestCheckComponentServiceFullyUpdated(t *testing.T) {
t.Run("worker with hash suffix - finds DCD", func(t *testing.T) {
workerHash := "abc12345"
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
},
})
dgd.Annotations = map[string]string{
consts.AnnotationCurrentWorkerHash: workerHash + "fullhashextra",
}
workerDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-worker-" + workerHash + "fullhashextra",
Namespace: "default",
Generation: 1,
},
Status: nvidiacomv1alpha1.DynamoComponentDeploymentStatus{
ObservedGeneration: 1,
Conditions: []metav1.Condition{
{
Type: nvidiacomv1alpha1.DynamoGraphDeploymentConditionTypeAvailable,
Status: metav1.ConditionTrue,
},
},
},
}
r := createTestReconciler(dgd, workerDCD)
ctx := context.Background()
isReady, reason := r.checkComponentServiceFullyUpdated(ctx, dgd, "worker")
assert.True(t, isReady, "worker DCD should be ready")
assert.Empty(t, reason)
})
t.Run("non-worker without hash suffix - finds DCD", func(t *testing.T) {
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"frontend": {
ComponentType: consts.ComponentTypeFrontend,
},
})
frontendDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-frontend",
Namespace: "default",
Generation: 1,
},
Status: nvidiacomv1alpha1.DynamoComponentDeploymentStatus{
ObservedGeneration: 1,
Conditions: []metav1.Condition{
{
Type: nvidiacomv1alpha1.DynamoGraphDeploymentConditionTypeAvailable,
Status: metav1.ConditionTrue,
},
},
},
}
r := createTestReconciler(dgd, frontendDCD)
ctx := context.Background()
isReady, reason := r.checkComponentServiceFullyUpdated(ctx, dgd, "frontend")
assert.True(t, isReady, "frontend DCD should be ready")
assert.Empty(t, reason)
})
t.Run("worker without hash annotation - falls back to non-hash name", func(t *testing.T) {
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
},
})
// No worker hash annotation
workerDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-worker",
Namespace: "default",
Generation: 1,
},
Status: nvidiacomv1alpha1.DynamoComponentDeploymentStatus{
ObservedGeneration: 1,
Conditions: []metav1.Condition{
{
Type: nvidiacomv1alpha1.DynamoGraphDeploymentConditionTypeAvailable,
Status: metav1.ConditionTrue,
},
},
},
}
r := createTestReconciler(dgd, workerDCD)
ctx := context.Background()
isReady, reason := r.checkComponentServiceFullyUpdated(ctx, dgd, "worker")
assert.True(t, isReady, "worker DCD should be ready via fallback")
assert.Empty(t, reason)
})
}
// TestInitializeWorkerHashIfNeeded_LegacyDCDsMigration registers a worker DCD
// that has the DGD-name label but no worker-hash label, runs
// initializeWorkerHashIfNeeded, and expects both the DGD's current worker hash
// and the DCD's hash label to equal consts.LegacyWorkerHash.
func TestInitializeWorkerHashIfNeeded_LegacyDCDsMigration(t *testing.T) {
	const legacyName = "test-dgd-worker"

	graph := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {
			ComponentType: consts.ComponentTypeWorker,
			Envs:          []corev1.EnvVar{{Name: "FOO", Value: "bar"}},
		},
	})

	// Stand-in for a DCD written before worker-hash labels existed: it is tied
	// to the DGD by name label only.
	legacy := &nvidiacomv1alpha1.DynamoComponentDeployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      legacyName,
			Namespace: "default",
			Labels: map[string]string{
				consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
			},
		},
		Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
			DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
				ComponentType: consts.ComponentTypeWorker,
				ServiceName:   "worker",
			},
		},
	}

	reconciler := createTestReconciler(graph, legacy)
	ctx := context.Background()

	require.NoError(t, reconciler.initializeWorkerHashIfNeeded(ctx, graph))

	// The DGD is expected to record the legacy sentinel, not a computed hash.
	current := reconciler.getCurrentWorkerHash(graph)
	assert.Equal(t, consts.LegacyWorkerHash, current, "Hash should be legacy sentinel after migration")

	// Re-read the DCD through the client to observe the backfilled label.
	stored := &nvidiacomv1alpha1.DynamoComponentDeployment{}
	key := types.NamespacedName{Name: "test-dgd-worker", Namespace: "default"}
	require.NoError(t, reconciler.Get(ctx, key, stored))
	assert.Equal(t, consts.LegacyWorkerHash, stored.Labels[consts.KubeLabelDynamoWorkerHash],
		"Legacy DCD should have worker hash label backfilled")
}
func TestInitializeWorkerHashIfNeeded_LegacyMultipleWorkers(t *testing.T) {
dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"prefill": {
ComponentType: consts.ComponentTypePrefill,
},
"decode": {
ComponentType: consts.ComponentTypeDecode,
},
"frontend": {
ComponentType: consts.ComponentTypeFrontend,
},
})
// Legacy worker DCDs (no hash label)
legacyPrefillDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-prefill",
Namespace: "default",
Labels: map[string]string{
consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
},
},
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: consts.ComponentTypePrefill,
ServiceName: "prefill",
},
},
}
legacyDecodeDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-decode",
Namespace: "default",
Labels: map[string]string{
consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
},
},
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: consts.ComponentTypeDecode,
ServiceName: "decode",
},
},
}
// Frontend DCD (not a worker, should not be touched)
frontendDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgd-frontend",
Namespace: "default",
Labels: map[string]string{
consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
},
},
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: consts.ComponentTypeFrontend,
ServiceName: "frontend",
},
},
}
r := createTestReconciler(dgd, legacyPrefillDCD, legacyDecodeDCD, frontendDCD)
ctx := context.Background()
err := r.initializeWorkerHashIfNeeded(ctx, dgd)
require.NoError(t, err)
// DGD should have legacy sentinel hash
assert.Equal(t, consts.LegacyWorkerHash, r.getCurrentWorkerHash(dgd))
// Both worker DCDs should have hash label backfilled
for _, name := range []string{"test-dgd-prefill", "test-dgd-decode"} {
dcd := &nvidiacomv1alpha1.DynamoComponentDeployment{}
err = r.Get(ctx, types.NamespacedName{Name: name, Namespace: "default"}, dcd)
require.NoError(t, err)
assert.Equal(t, consts.LegacyWorkerHash, dcd.Labels[consts.KubeLabelDynamoWorkerHash],
"Worker DCD %s should have legacy hash label", name)
}
// Frontend should NOT have hash label
fe := &nvidiacomv1alpha1.DynamoComponentDeployment{}
err = r.Get(ctx, types.NamespacedName{Name: "test-dgd-frontend", Namespace: "default"}, fe)
require.NoError(t, err)
assert.Empty(t, fe.Labels[consts.KubeLabelDynamoWorkerHash],
"Frontend DCD should not have worker hash label")
}
// TestFindLegacyWorkerDCDs exercises findLegacyWorkerDCDs against DCDs that
// differ in component type, presence of the worker-hash label, and owning DGD.
// Only a worker DCD that belongs to the DGD and has no hash label is expected
// to be returned.
func TestFindLegacyWorkerDCDs(t *testing.T) {
	t.Run("finds worker DCDs without hash label", func(t *testing.T) {
		dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"worker": {ComponentType: consts.ComponentTypeWorker},
		})
		legacyDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-dgd-worker",
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
				},
			},
		}
		r := createTestReconciler(dgd, legacyDCD)
		ctx := context.Background()
		result, err := r.findLegacyWorkerDCDs(ctx, dgd)
		require.NoError(t, err)
		// require (not assert) so that a wrong length stops the subtest before
		// result[0] is indexed; indexing an empty slice would panic.
		require.Len(t, result, 1)
		assert.Equal(t, "test-dgd-worker", result[0].Name)
	})
	t.Run("ignores non-worker DCDs", func(t *testing.T) {
		dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"frontend": {ComponentType: consts.ComponentTypeFrontend},
		})
		// Unlabelled like a legacy DCD, but its component type is frontend.
		frontendDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-dgd-frontend",
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeFrontend,
				},
			},
		}
		r := createTestReconciler(dgd, frontendDCD)
		ctx := context.Background()
		result, err := r.findLegacyWorkerDCDs(ctx, dgd)
		require.NoError(t, err)
		assert.Empty(t, result)
	})
	t.Run("ignores DCDs that already have hash label", func(t *testing.T) {
		dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"worker": {ComponentType: consts.ComponentTypeWorker},
		})
		// A worker DCD that already carries a worker-hash label.
		hashedDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-dgd-worker-abc12345",
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          "abc12345",
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
				},
			},
		}
		r := createTestReconciler(dgd, hashedDCD)
		ctx := context.Background()
		result, err := r.findLegacyWorkerDCDs(ctx, dgd)
		require.NoError(t, err)
		assert.Empty(t, result)
	})
	t.Run("ignores DCDs from other DGDs", func(t *testing.T) {
		dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"worker": {ComponentType: consts.ComponentTypeWorker},
		})
		// An unlabelled worker DCD whose DGD-name label points at another DGD.
		otherDGDWorkerDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "other-dgd-worker",
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "other-dgd",
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
				},
			},
		}
		r := createTestReconciler(dgd, otherDGDWorkerDCD)
		ctx := context.Background()
		result, err := r.findLegacyWorkerDCDs(ctx, dgd)
		require.NoError(t, err)
		assert.Empty(t, result)
	})
	t.Run("no DCDs at all", func(t *testing.T) {
		dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"worker": {ComponentType: consts.ComponentTypeWorker},
		})
		r := createTestReconciler(dgd)
		ctx := context.Background()
		result, err := r.findLegacyWorkerDCDs(ctx, dgd)
		require.NoError(t, err)
		assert.Empty(t, result)
	})
}
// TestListOldWorkerDCDs exercises listOldWorkerDCDs, which is called with the
// DGD and a worker hash. The subtests expect worker DCDs labelled with a
// different hash to be returned, and DCDs labelled with the given hash or with
// a non-worker component type to be left out.
func TestListOldWorkerDCDs(t *testing.T) {
	t.Run("finds legacy DCDs as old", func(t *testing.T) {
		dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"worker": {ComponentType: consts.ComponentTypeWorker},
		})
		// Legacy DCD with backfilled "legacy" hash label
		legacyDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-dgd-worker",
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          consts.LegacyWorkerHash,
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
					ServiceName:   "worker",
				},
			},
		}
		r := createTestReconciler(dgd, legacyDCD)
		ctx := context.Background()
		result, err := r.listOldWorkerDCDs(ctx, dgd, "newhash1")
		require.NoError(t, err)
		// require (not assert) so that a wrong length stops the subtest before
		// result[0] is indexed; indexing an empty slice would panic.
		require.Len(t, result, 1)
		assert.Equal(t, "test-dgd-worker", result[0].Name)
	})
	t.Run("excludes current hash DCDs", func(t *testing.T) {
		dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"worker": {ComponentType: consts.ComponentTypeWorker},
		})
		// The DCD's hash label equals the hash passed to listOldWorkerDCDs.
		currentDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-dgd-worker-abc12345",
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          "abc12345",
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
					ServiceName:   "worker",
				},
			},
		}
		r := createTestReconciler(dgd, currentDCD)
		ctx := context.Background()
		result, err := r.listOldWorkerDCDs(ctx, dgd, "abc12345")
		require.NoError(t, err)
		assert.Empty(t, result)
	})
	t.Run("excludes non-worker DCDs", func(t *testing.T) {
		dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"frontend": {ComponentType: consts.ComponentTypeFrontend},
			"worker":   {ComponentType: consts.ComponentTypeWorker},
		})
		// A frontend DCD with non-matching hash (should be excluded as non-worker)
		frontendDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-dgd-frontend",
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          testOldWorkerHash,
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeFrontend,
					ServiceName:   "frontend",
				},
			},
		}
		// A worker DCD with the same old hash label; the only expected result.
		workerDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-dgd-worker-oldhash1",
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          testOldWorkerHash,
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
					ServiceName:   "worker",
				},
			},
		}
		r := createTestReconciler(dgd, frontendDCD, workerDCD)
		ctx := context.Background()
		result, err := r.listOldWorkerDCDs(ctx, dgd, testNewWorkerHash)
		require.NoError(t, err)
		// Stop here on a length mismatch so result[0] below cannot panic.
		require.Len(t, result, 1)
		assert.Equal(t, "test-dgd-worker-oldhash1", result[0].Name)
	})
}
// TestScaleOldWorkerDCDs_LegacyDCDs exercises scaleOldWorkerDCDs against a
// worker DCD that keeps the old-style name (no hash suffix) and is labelled
// with consts.LegacyWorkerHash.
func TestScaleOldWorkerDCDs_LegacyDCDs(t *testing.T) {
	const (
		legacyDCDName = "test-dgd-worker"
		namespace     = "default"
	)

	// newLegacyDCD returns the legacy worker DCD fixture with the given spec replicas.
	newLegacyDCD := func(replicas int32) *nvidiacomv1alpha1.DynamoComponentDeployment {
		return &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      legacyDCDName,
				Namespace: namespace,
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          consts.LegacyWorkerHash,
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
					ServiceName:   "worker",
					Replicas:      ptr.To(replicas),
				},
			},
		}
	}

	t.Run("scales legacy-named DCD via label lookup", func(t *testing.T) {
		dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"worker": {
				ComponentType: consts.ComponentTypeWorker,
				Replicas:      ptr.To(int32(3)),
			},
		})
		// Legacy DCD starts at 3 replicas; the rolling update asks for 1 old replica.
		r := createTestReconciler(dgd, newLegacyDCD(3))
		ctx := context.Background()
		rollingUpdateCtx := dynamo.RollingUpdateContext{
			NewWorkerHash:     "newhash1",
			OldWorkerReplicas: map[string]int32{"worker": 1},
			NewWorkerReplicas: map[string]int32{"worker": 3},
		}
		err := r.scaleOldWorkerDCDs(ctx, dgd, rollingUpdateCtx)
		require.NoError(t, err)

		// Verify the legacy DCD was scaled down.
		updatedDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{}
		err = r.Get(ctx, types.NamespacedName{Name: legacyDCDName, Namespace: namespace}, updatedDCD)
		require.NoError(t, err)
		// Fail with a message rather than a nil-pointer panic if replicas were cleared.
		require.NotNil(t, updatedDCD.Spec.Replicas, "Legacy DCD should still have replicas set")
		assert.Equal(t, int32(1), *updatedDCD.Spec.Replicas, "Legacy DCD should be scaled to 1")
	})

	t.Run("no-op when rolling update not in progress", func(t *testing.T) {
		dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"worker": {ComponentType: consts.ComponentTypeWorker},
		})
		r := createTestReconciler(dgd)
		ctx := context.Background()
		// Empty OldWorkerReplicas = not in progress.
		rollingUpdateCtx := dynamo.RollingUpdateContext{
			NewWorkerHash:     "samehash",
			OldWorkerReplicas: map[string]int32{},
			NewWorkerReplicas: map[string]int32{},
		}
		err := r.scaleOldWorkerDCDs(ctx, dgd, rollingUpdateCtx)
		require.NoError(t, err)
	})

	t.Run("skips when replicas already at desired value", func(t *testing.T) {
		dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"worker": {
				ComponentType: consts.ComponentTypeWorker,
				Replicas:      ptr.To(int32(3)),
			},
		})
		// Legacy DCD is already at the requested old replica count (1).
		r := createTestReconciler(dgd, newLegacyDCD(1))
		ctx := context.Background()
		rollingUpdateCtx := dynamo.RollingUpdateContext{
			NewWorkerHash:     "newhash1",
			OldWorkerReplicas: map[string]int32{"worker": 1},
			NewWorkerReplicas: map[string]int32{"worker": 3},
		}
		err := r.scaleOldWorkerDCDs(ctx, dgd, rollingUpdateCtx)
		require.NoError(t, err)

		// Replicas should remain at 1 (no patch needed).
		updatedDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{}
		err = r.Get(ctx, types.NamespacedName{Name: legacyDCDName, Namespace: namespace}, updatedDCD)
		require.NoError(t, err)
		require.NotNil(t, updatedDCD.Spec.Replicas, "Legacy DCD should still have replicas set")
		assert.Equal(t, int32(1), *updatedDCD.Spec.Replicas)
	})
}
// TestAggregateOldWorkerServiceStatuses_LegacyDCDs checks the result of
// aggregateOldWorkerServiceStatuses for a DGD whose only old worker DCD is
// legacy-named, and for a DGD that has no old worker DCDs at all.
func TestAggregateOldWorkerServiceStatuses_LegacyDCDs(t *testing.T) {
	t.Run("aggregates status from legacy-named DCD", func(t *testing.T) {
		services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"worker": {
				ComponentType: consts.ComponentTypeWorker,
				Replicas:      ptr.To(int32(3)),
			},
		}
		dgd := createTestDGD("test-dgd", services)

		// Old-style name (no hash suffix) combined with the legacy hash label.
		legacyDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{}
		legacyDCD.Name = "test-dgd-worker"
		legacyDCD.Namespace = "default"
		legacyDCD.Labels = map[string]string{
			consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
			consts.KubeLabelDynamoWorkerHash:          consts.LegacyWorkerHash,
		}
		legacyDCD.Spec.ComponentType = consts.ComponentTypeWorker
		legacyDCD.Spec.ServiceName = "worker"
		legacyDCD.Spec.Replicas = ptr.To(int32(2))
		legacyDCD.Status.Service = &nvidiacomv1alpha1.ServiceReplicaStatus{
			ComponentKind: "Deployment",
			ComponentName: "test-dgd-worker",
			Replicas:      2,
			ReadyReplicas: ptr.To(int32(2)),
		}

		r := createTestReconciler(dgd, legacyDCD)
		statuses, err := r.aggregateOldWorkerServiceStatuses(
			context.Background(),
			dgd,
			dynamo.RollingUpdateContext{
				NewWorkerHash:     "newhash1",
				OldWorkerReplicas: map[string]int32{"worker": 2},
				NewWorkerReplicas: map[string]int32{"worker": 3},
			},
		)
		require.NoError(t, err)
		assert.Len(t, statuses, 1)

		// The single entry mirrors the legacy DCD's own service status.
		workerStatus := statuses["worker"]
		assert.Equal(t, "test-dgd-worker", workerStatus.ComponentName)
		assert.Equal(t, []string{"test-dgd-worker"}, workerStatus.ComponentNames)
		assert.Equal(t, int32(2), workerStatus.Replicas)
		assert.Equal(t, ptr.To(int32(2)), workerStatus.ReadyReplicas)
	})

	t.Run("no legacy DCDs found - returns empty", func(t *testing.T) {
		services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
			"worker": {
				ComponentType: consts.ComponentTypeWorker,
			},
		}
		dgd := createTestDGD("test-dgd", services)

		// Only the DGD is registered with the reconciler; there are no DCDs to aggregate.
		r := createTestReconciler(dgd)
		statuses, err := r.aggregateOldWorkerServiceStatuses(
			context.Background(),
			dgd,
			dynamo.RollingUpdateContext{
				NewWorkerHash:     "newhash1",
				OldWorkerReplicas: map[string]int32{"worker": 1},
				NewWorkerReplicas: map[string]int32{"worker": 1},
			},
		)
		require.NoError(t, err)
		assert.Empty(t, statuses)
	})
}
// TestDeleteOldWorkerDCDs_LegacyDCDs verifies that deleteOldWorkerDCDs removes
// a legacy-labelled worker DCD while keeping the DCD whose hash label matches
// the hash passed to it.
func TestDeleteOldWorkerDCDs_LegacyDCDs(t *testing.T) {
	dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {ComponentType: consts.ComponentTypeWorker},
	})

	// workerDCD builds a worker DCD owned (by label) by test-dgd with the given
	// name and worker-hash label. ServiceName is intentionally left unset, as
	// in the fixtures this test has always used.
	workerDCD := func(name, hash string) *nvidiacomv1alpha1.DynamoComponentDeployment {
		return &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      name,
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          hash,
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
				},
			},
		}
	}

	// Legacy DCD with backfilled hash label.
	legacyDCD := workerDCD("test-dgd-worker", consts.LegacyWorkerHash)
	// New DCD with real hash (should NOT be deleted).
	newDCD := workerDCD("test-dgd-worker-abc12345", "abc12345")

	r := createTestReconciler(dgd, legacyDCD, newDCD)
	ctx := context.Background()
	err := r.deleteOldWorkerDCDs(ctx, dgd, "abc12345")
	require.NoError(t, err)

	// Verify legacy DCD is deleted and new DCD remains.
	dcdList := &nvidiacomv1alpha1.DynamoComponentDeploymentList{}
	err = r.List(ctx, dcdList)
	require.NoError(t, err)
	// require (not assert): indexing Items[0] below would panic on an empty list.
	require.Len(t, dcdList.Items, 1)
	assert.Equal(t, "test-dgd-worker-abc12345", dcdList.Items[0].Name)
}
// TestDeleteOldWorkerDCDs_MultipleGenerations verifies that deleteOldWorkerDCDs
// removes every worker DCD whose hash label differs from the current hash
// (a legacy one and an intermediate one) and leaves only the current DCD.
func TestDeleteOldWorkerDCDs_MultipleGenerations(t *testing.T) {
	dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {ComponentType: consts.ComponentTypeWorker},
	})

	// workerDCD builds a worker DCD labelled for test-dgd with the given name
	// and worker-hash label. ServiceName is intentionally left unset, as in the
	// fixtures this test has always used.
	workerDCD := func(name, hash string) *nvidiacomv1alpha1.DynamoComponentDeployment {
		return &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      name,
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          hash,
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
				},
			},
		}
	}

	// Generation A (legacy).
	legacyDCD := workerDCD("test-dgd-worker", consts.LegacyWorkerHash)
	// Generation B (intermediate).
	genBDCD := workerDCD("test-dgd-worker-hashbbbb", "hashbbbb")
	// Generation C (current).
	currentDCD := workerDCD("test-dgd-worker-hashcccc", "hashcccc")

	r := createTestReconciler(dgd, legacyDCD, genBDCD, currentDCD)
	ctx := context.Background()
	err := r.deleteOldWorkerDCDs(ctx, dgd, "hashcccc")
	require.NoError(t, err)

	// Verify both old generations are deleted, only current remains.
	dcdList := &nvidiacomv1alpha1.DynamoComponentDeploymentList{}
	err = r.List(ctx, dcdList)
	require.NoError(t, err)
	// require (not assert): indexing Items[0] below would panic on an empty list.
	require.Len(t, dcdList.Items, 1)
	assert.Equal(t, "test-dgd-worker-hashcccc", dcdList.Items[0].Name)
}
// TestListOldWorkerDCDs_ExcludesCurrentHash verifies that listOldWorkerDCDs,
// given three worker DCD generations, returns the two whose hash label
// differs from the current hash and omits the current one.
func TestListOldWorkerDCDs_ExcludesCurrentHash(t *testing.T) {
	dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {ComponentType: consts.ComponentTypeWorker},
	})

	// generation builds the worker DCD named "test-dgd-worker-<hash>" carrying
	// that hash as its worker-hash label.
	generation := func(hash string) *nvidiacomv1alpha1.DynamoComponentDeployment {
		return &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-dgd-worker-" + hash,
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          hash,
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
					ServiceName:   "worker",
				},
			},
		}
	}

	genADCD := generation("hashaaaa") // Generation A
	genBDCD := generation("hashbbbb") // Generation B
	genCDCD := generation("hashcccc") // Generation C (current)

	r := createTestReconciler(dgd, genADCD, genBDCD, genCDCD)
	ctx := context.Background()
	result, err := r.listOldWorkerDCDs(ctx, dgd, "hashcccc")
	require.NoError(t, err)
	// require (not assert): stop here on a wrong count instead of continuing
	// with an unexpected result set.
	require.Len(t, result, 2)

	// The test does not rely on any ordering of the returned list, so compare sorted names.
	names := make([]string, 0, len(result))
	for i := range result {
		names = append(names, result[i].Name)
	}
	sort.Strings(names)
	assert.Equal(t, []string{"test-dgd-worker-hashaaaa", "test-dgd-worker-hashbbbb"}, names)
}
// TestScaleOldWorkerDCDs_MultipleOldGenerations verifies how scaleOldWorkerDCDs
// distributes the old-replica budget across two old generations: the more
// recently created DCD keeps replicas and the older one is drained to 0.
func TestScaleOldWorkerDCDs_MultipleOldGenerations(t *testing.T) {
	dgd := createTestDGD("test-dgd", map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {
			ComponentType: consts.ComponentTypeWorker,
			Replicas:      ptr.To(int32(4)),
		},
	})
	now := metav1.Now()
	// The offset is an untyped constant in nanoseconds: 60 * 1e9 ns = 1 minute.
	earlier := metav1.NewTime(now.Add(-1 * 60 * 1e9)) // 1 minute earlier

	// oldGeneration builds a 2-replica worker DCD named "test-dgd-worker-<hash>"
	// with the given creation timestamp.
	oldGeneration := func(hash string, created metav1.Time) *nvidiacomv1alpha1.DynamoComponentDeployment {
		return &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:              "test-dgd-worker-" + hash,
				Namespace:         "default",
				CreationTimestamp: created,
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          hash,
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
					ServiceName:   "worker",
					Replicas:      ptr.To(int32(2)),
				},
			},
		}
	}

	genADCD := oldGeneration("hashaaaa", earlier) // Generation A (oldest)
	genBDCD := oldGeneration("hashbbbb", now)     // Generation B (newer old)

	r := createTestReconciler(dgd, genADCD, genBDCD)
	ctx := context.Background()
	// oldNeeded = 2: newest old (B) should get 2, oldest (A) should get 0.
	rollingUpdateCtx := dynamo.RollingUpdateContext{
		NewWorkerHash:     "hashcccc",
		OldWorkerReplicas: map[string]int32{"worker": 2},
		NewWorkerReplicas: map[string]int32{"worker": 4},
	}
	err := r.scaleOldWorkerDCDs(ctx, dgd, rollingUpdateCtx)
	require.NoError(t, err)

	// Newest old (B) should keep replicas (up to 2).
	updatedB := &nvidiacomv1alpha1.DynamoComponentDeployment{}
	err = r.Get(ctx, types.NamespacedName{Name: "test-dgd-worker-hashbbbb", Namespace: "default"}, updatedB)
	require.NoError(t, err)
	// Guard the dereference so a cleared field fails the test instead of panicking.
	require.NotNil(t, updatedB.Spec.Replicas, "Newest old DCD should still have replicas set")
	assert.Equal(t, int32(2), *updatedB.Spec.Replicas, "Newest old DCD should have 2 replicas")

	// Oldest (A) should be drained to 0.
	updatedA := &nvidiacomv1alpha1.DynamoComponentDeployment{}
	err = r.Get(ctx, types.NamespacedName{Name: "test-dgd-worker-hashaaaa", Namespace: "default"}, updatedA)
	require.NoError(t, err)
	require.NotNil(t, updatedA.Spec.Replicas, "Oldest old DCD should still have replicas set")
	assert.Equal(t, int32(0), *updatedA.Spec.Replicas, "Oldest old DCD should be drained to 0")
}
// TestAggregateOldWorkerServiceStatuses_MultipleOldGenerations checks that
// aggregateOldWorkerServiceStatuses folds two old worker generations of the
// same service into one entry with summed replica counts.
func TestAggregateOldWorkerServiceStatuses_MultipleOldGenerations(t *testing.T) {
	// oldGeneration builds a worker DCD named "test-dgd-worker-<hash>" whose
	// spec replicas, status replicas and ready replicas all equal `replicas`.
	oldGeneration := func(hash string, replicas int32) *nvidiacomv1alpha1.DynamoComponentDeployment {
		name := "test-dgd-worker-" + hash
		return &nvidiacomv1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      name,
				Namespace: "default",
				Labels: map[string]string{
					consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
					consts.KubeLabelDynamoWorkerHash:          hash,
				},
			},
			Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
				DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
					ComponentType: consts.ComponentTypeWorker,
					ServiceName:   "worker",
					Replicas:      ptr.To(replicas),
				},
			},
			Status: nvidiacomv1alpha1.DynamoComponentDeploymentStatus{
				Service: &nvidiacomv1alpha1.ServiceReplicaStatus{
					ComponentKind: "Deployment",
					ComponentName: name,
					Replicas:      replicas,
					ReadyReplicas: ptr.To(replicas),
				},
			},
		}
	}

	services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {
			ComponentType: consts.ComponentTypeWorker,
			Replicas:      ptr.To(int32(4)),
		},
	}
	dgd := createTestDGD("test-dgd", services)

	// Two old generations: A with one replica, B with two.
	genADCD := oldGeneration("hashaaaa", 1)
	genBDCD := oldGeneration("hashbbbb", 2)

	r := createTestReconciler(dgd, genADCD, genBDCD)
	statuses, err := r.aggregateOldWorkerServiceStatuses(
		context.Background(),
		dgd,
		dynamo.RollingUpdateContext{
			NewWorkerHash:     "hashcccc",
			OldWorkerReplicas: map[string]int32{"worker": 3},
			NewWorkerReplicas: map[string]int32{"worker": 4},
		},
	)
	require.NoError(t, err)
	assert.Len(t, statuses, 1)

	workerStatus := statuses["worker"]
	// 1 (generation A) + 2 (generation B) replicas, all ready.
	assert.Equal(t, int32(3), workerStatus.Replicas)
	assert.Equal(t, ptr.To(int32(3)), workerStatus.ReadyReplicas)
	// One component name per old DCD.
	assert.Len(t, workerStatus.ComponentNames, 2)
}
// TestContinueRollingUpdate_CascadingSpecChange covers a spec change to C
// arriving while an A→B rolling update is still in progress. DCDs for A and B
// both exist with ready replicas and the C DCD has none, so the update must
// remain InProgress with no service reported as updated.
func TestContinueRollingUpdate_CascadingSpecChange(t *testing.T) {
	newWorkerHash := "hashcccc"

	// generation builds a worker DCD with the given name and hash label, the
	// given spec replica count, and a status reporting `ready` ready replicas.
	generation := func(name, hash string, replicas, ready int32) *nvidiacomv1alpha1.DynamoComponentDeployment {
		dcd := &nvidiacomv1alpha1.DynamoComponentDeployment{}
		dcd.Name = name
		dcd.Namespace = "default"
		dcd.Labels = map[string]string{
			consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
			consts.KubeLabelDynamoWorkerHash:          hash,
		}
		dcd.Spec.ComponentType = consts.ComponentTypeWorker
		dcd.Spec.ServiceName = "worker"
		dcd.Spec.Replicas = ptr.To(replicas)
		dcd.Status.Service = &nvidiacomv1alpha1.ServiceReplicaStatus{
			ReadyReplicas: ptr.To(ready),
		}
		return dcd
	}

	services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {
			ComponentType: consts.ComponentTypeWorker,
			Replicas:      ptr.To(int32(2)),
		},
	}
	dgd := createTestDGD("test-dgd", services)
	// The DGD still records A as its current hash and reports an update in progress.
	dgd.Annotations = map[string]string{
		consts.AnnotationCurrentWorkerHash: "hashaaaa",
	}
	dgd.Status.RollingUpdate = &nvidiacomv1alpha1.RollingUpdateStatus{
		Phase: nvidiacomv1alpha1.RollingUpdatePhaseInProgress,
	}

	// Generation A (old): one replica, ready.
	genADCD := generation("test-dgd-worker-hashaaaa", "hashaaaa", 1, 1)
	// Generation B (intermediate, now also old): one replica, ready.
	genBDCD := generation("test-dgd-worker-hashbbbb", "hashbbbb", 1, 1)
	// Generation C (new): two replicas requested, none ready yet.
	genCDCD := generation("test-dgd-worker-"+newWorkerHash[:8], newWorkerHash, 2, 0)

	r := createTestReconcilerWithStatus(dgd, genADCD, genBDCD, genCDCD)
	rollingUpdateStatus := dgd.Status.RollingUpdate
	require.NoError(t, r.continueRollingUpdate(context.Background(), dgd, rollingUpdateStatus, newWorkerHash))

	// A and B still have ready replicas and C has 0, so the rollout is not complete.
	assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhaseInProgress, rollingUpdateStatus.Phase)
	assert.Empty(t, rollingUpdateStatus.UpdatedServices, "No services should be fully updated yet")
}
// TestResolveRollingUpdateParams is a table-driven test of
// resolveRollingUpdateParams: each case supplies DCD annotations plus a desired
// replica count and states the maxSurge / maxUnavailable values expected back.
func TestResolveRollingUpdateParams(t *testing.T) {
	type testCase struct {
		name            string
		annotations     map[string]string
		desiredReplicas int32
		wantSurge       int32
		wantUnavailable int32
	}

	cases := []testCase{
		// No annotations: the expectations encode 25%/25% defaults.
		{
			name:            "defaults - no annotations - 25%/25% of 4 = 1/1",
			annotations:     nil,
			desiredReplicas: 4,
			wantSurge:       1,
			wantUnavailable: 1,
		},
		// Absolute values.
		{
			name: "absolute maxSurge overrides default",
			annotations: map[string]string{
				KubeAnnotationDeploymentRollingUpdateMaxSurge: "2",
			},
			desiredReplicas: 4,
			wantSurge:       2,
			wantUnavailable: 1,
		},
		{
			name: "absolute maxUnavailable overrides default",
			annotations: map[string]string{
				KubeAnnotationDeploymentRollingUpdateMaxUnavailable: "0",
			},
			desiredReplicas: 4,
			wantSurge:       1,
			wantUnavailable: 0,
		},
		// Percentages that divide evenly.
		{
			name: "percentage maxSurge - 50% of 4 = 2",
			annotations: map[string]string{
				KubeAnnotationDeploymentRollingUpdateMaxSurge: "50%",
			},
			desiredReplicas: 4,
			wantSurge:       2,
			wantUnavailable: 1,
		},
		{
			name: "percentage maxUnavailable - 50% of 4 = 2",
			annotations: map[string]string{
				KubeAnnotationDeploymentRollingUpdateMaxUnavailable: "50%",
			},
			desiredReplicas: 4,
			wantSurge:       1,
			wantUnavailable: 2,
		},
		{
			name: "both annotations set with percentages",
			annotations: map[string]string{
				KubeAnnotationDeploymentRollingUpdateMaxSurge:       "50%",
				KubeAnnotationDeploymentRollingUpdateMaxUnavailable: "25%",
			},
			desiredReplicas: 4,
			wantSurge:       2,
			wantUnavailable: 1,
		},
		// Zero handling: 0/0 is expected to come back as surge 1.
		{
			name: "both zero - force surge to 1 for progress",
			annotations: map[string]string{
				KubeAnnotationDeploymentRollingUpdateMaxSurge:       "0",
				KubeAnnotationDeploymentRollingUpdateMaxUnavailable: "0",
			},
			desiredReplicas: 4,
			wantSurge:       1,
			wantUnavailable: 0,
		},
		{
			name: "maxSurge 0 with maxUnavailable 1 - allowed",
			annotations: map[string]string{
				KubeAnnotationDeploymentRollingUpdateMaxSurge:       "0",
				KubeAnnotationDeploymentRollingUpdateMaxUnavailable: "1",
			},
			desiredReplicas: 4,
			wantSurge:       0,
			wantUnavailable: 1,
		},
		// Rounding direction: surge rounds up, unavailable rounds down.
		{
			name: "percentage surge rounds up - 34% of 3 rounds up to 2",
			annotations: map[string]string{
				KubeAnnotationDeploymentRollingUpdateMaxSurge: "34%",
			},
			desiredReplicas: 3,
			wantSurge:       2,
			wantUnavailable: 0,
		},
		{
			name: "percentage unavailable rounds down - 34% of 3 rounds down to 1",
			annotations: map[string]string{
				KubeAnnotationDeploymentRollingUpdateMaxUnavailable: "34%",
			},
			desiredReplicas: 3,
			wantSurge:       1,
			wantUnavailable: 1,
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			gotSurge, gotUnavailable := resolveRollingUpdateParams(tc.annotations, tc.desiredReplicas)
			assert.Equal(t, tc.wantSurge, gotSurge, "maxSurge")
			assert.Equal(t, tc.wantUnavailable, gotUnavailable, "maxUnavailable")
		})
	}
}
// --- reconcileRollingUpdate state machine tests ---
// TestReconcileRollingUpdate_NoChange verifies that reconcileRollingUpdate
// leaves a Completed phase untouched when the current-worker-hash annotation
// already equals the hash computed from the DGD.
func TestReconcileRollingUpdate_NoChange(t *testing.T) {
	services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {ComponentType: consts.ComponentTypeWorker},
	}
	dgd := createTestDGD("test-dgd", services)

	// Record the hash computed from the DGD as the current hash.
	currentHash := dynamo.ComputeDGDWorkersSpecHash(dgd)
	dgd.Annotations = map[string]string{
		consts.AnnotationCurrentWorkerHash: currentHash,
	}
	completed := &nvidiacomv1alpha1.RollingUpdateStatus{}
	completed.Phase = nvidiacomv1alpha1.RollingUpdatePhaseCompleted
	dgd.Status.RollingUpdate = completed

	r := createTestReconcilerWithStatus(dgd)
	require.NoError(t, r.reconcileRollingUpdate(context.Background(), dgd))

	// No spec change, so the phase stays Completed.
	assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhaseCompleted, dgd.Status.RollingUpdate.Phase)
}
// TestReconcileRollingUpdate_SpecChangeStartsRollout verifies that a Completed
// DGD whose current-worker-hash annotation holds a stale value is moved to
// Pending with a start time set.
func TestReconcileRollingUpdate_SpecChangeStartsRollout(t *testing.T) {
	services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {ComponentType: consts.ComponentTypeWorker},
	}
	dgd := createTestDGD("test-dgd", services)

	// "stale000" is a placeholder standing in for a previous spec's hash.
	dgd.Annotations = map[string]string{
		consts.AnnotationCurrentWorkerHash: "stale000",
	}
	completed := &nvidiacomv1alpha1.RollingUpdateStatus{}
	completed.Phase = nvidiacomv1alpha1.RollingUpdatePhaseCompleted
	dgd.Status.RollingUpdate = completed

	r := createTestReconcilerWithStatus(dgd)
	require.NoError(t, r.reconcileRollingUpdate(context.Background(), dgd))

	// A new rollout has started: phase is Pending and the start time is recorded.
	got := dgd.Status.RollingUpdate
	assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhasePending, got.Phase)
	assert.NotNil(t, got.StartTime)
}
// TestReconcileRollingUpdate_PendingToInProgress verifies that one reconcile
// pass advances a Pending rolling update to InProgress.
func TestReconcileRollingUpdate_PendingToInProgress(t *testing.T) {
	services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {ComponentType: consts.ComponentTypeWorker},
	}
	dgd := createTestDGD("test-dgd", services)

	// The annotation holds a placeholder old hash; the rollout is already Pending.
	dgd.Annotations = map[string]string{
		consts.AnnotationCurrentWorkerHash: "oldhash0",
	}
	pending := &nvidiacomv1alpha1.RollingUpdateStatus{}
	pending.Phase = nvidiacomv1alpha1.RollingUpdatePhasePending
	dgd.Status.RollingUpdate = pending

	r := createTestReconcilerWithStatus(dgd)
	require.NoError(t, r.reconcileRollingUpdate(context.Background(), dgd))

	assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhaseInProgress, dgd.Status.RollingUpdate.Phase)
}
// TestReconcileRollingUpdate_StuckDetection verifies that an InProgress
// rolling update is moved to Completed when the current-worker-hash
// annotation already equals the hash computed from the DGD.
func TestReconcileRollingUpdate_StuckDetection(t *testing.T) {
	services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {ComponentType: consts.ComponentTypeWorker},
	}
	dgd := createTestDGD("test-dgd", services)

	// The annotation matches the computed hash while the phase still says
	// InProgress: the "stuck" combination under test.
	computedHash := dynamo.ComputeDGDWorkersSpecHash(dgd)
	dgd.Annotations = map[string]string{
		consts.AnnotationCurrentWorkerHash: computedHash,
	}
	inProgress := &nvidiacomv1alpha1.RollingUpdateStatus{}
	inProgress.Phase = nvidiacomv1alpha1.RollingUpdatePhaseInProgress
	dgd.Status.RollingUpdate = inProgress

	r := createTestReconcilerWithStatus(dgd)
	require.NoError(t, r.reconcileRollingUpdate(context.Background(), dgd))

	// The reconciler should auto-complete the update.
	assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhaseCompleted, dgd.Status.RollingUpdate.Phase)
}
// TestReconcileRollingUpdate_NewRollingUpdate verifies that a Completed DGD
// with an outdated current-worker-hash annotation is moved to Pending even
// though a ready worker DCD labelled with some other hash already exists.
func TestReconcileRollingUpdate_NewRollingUpdate(t *testing.T) {
	// newHash is a fixed literal used only to label the pre-existing DCD.
	// NOTE(review): it is presumably different from the hash the reconciler
	// computes from the DGD spec — confirm against ComputeDGDWorkersSpecHash.
	newHash := "newhash1"

	services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {ComponentType: consts.ComponentTypeWorker},
	}
	dgd := createTestDGD("test-dgd", services)
	dgd.Annotations = map[string]string{
		consts.AnnotationCurrentWorkerHash: "oldhash0",
	}
	completed := &nvidiacomv1alpha1.RollingUpdateStatus{}
	completed.Phase = nvidiacomv1alpha1.RollingUpdatePhaseCompleted
	dgd.Status.RollingUpdate = completed

	// A worker DCD labelled with newHash that reports one ready replica.
	newDCD := &nvidiacomv1alpha1.DynamoComponentDeployment{}
	newDCD.Name = "test-dgd-worker-" + newHash
	newDCD.Namespace = "default"
	newDCD.Labels = map[string]string{
		consts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
		consts.KubeLabelDynamoWorkerHash:          newHash,
	}
	newDCD.Spec.ComponentType = consts.ComponentTypeWorker
	newDCD.Spec.ServiceName = "worker"
	newDCD.Status.Service = &nvidiacomv1alpha1.ServiceReplicaStatus{
		ReadyReplicas: ptr.To(int32(1)),
	}

	r := createTestReconcilerWithStatus(dgd, newDCD)

	// The annotation ("oldhash0") does not match the computed hash, so a
	// rollout is expected to start.
	require.NoError(t, r.reconcileRollingUpdate(context.Background(), dgd))

	// A new rolling update has started (Pending).
	assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhasePending, dgd.Status.RollingUpdate.Phase)
}
// TestReconcileRollingUpdate_NonePhaseStartsRollout verifies that a DGD in
// RollingUpdatePhaseNone with an outdated current-worker-hash annotation is
// moved to Pending, with a start time set and no updated services listed.
func TestReconcileRollingUpdate_NonePhaseStartsRollout(t *testing.T) {
	services := map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
		"worker": {ComponentType: consts.ComponentTypeWorker},
	}
	dgd := createTestDGD("test-dgd", services)

	// Placeholder old hash in the annotation; rolling update phase is None.
	dgd.Annotations = map[string]string{
		consts.AnnotationCurrentWorkerHash: "oldhash0",
	}
	none := &nvidiacomv1alpha1.RollingUpdateStatus{}
	none.Phase = nvidiacomv1alpha1.RollingUpdatePhaseNone
	dgd.Status.RollingUpdate = none

	r := createTestReconcilerWithStatus(dgd)
	require.NoError(t, r.reconcileRollingUpdate(context.Background(), dgd))

	got := dgd.Status.RollingUpdate
	assert.Equal(t, nvidiacomv1alpha1.RollingUpdatePhasePending, got.Phase)
	assert.NotNil(t, got.StartTime)
	assert.Nil(t, got.UpdatedServices)
}
......@@ -50,6 +50,7 @@ type ComponentContext struct {
ParentGraphDeploymentNamespace string
DiscoveryBackend string
EPPConfig *v1alpha1.EPPConfig
WorkerHashSuffix string
}
func (b *BaseComponentDefaults) GetBaseContainer(context ComponentContext) (corev1.Container, error) {
......
......@@ -76,6 +76,10 @@ func (f *FrontendDefaults) GetBaseContainer(context ComponentContext) (corev1.Co
Name: "DYN_HTTP_PORT", // TODO: need to reconcile DYNAMO_PORT and DYN_HTTP_PORT
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
},
{
Name: commonconsts.DynamoNamespacePrefixEnvVar,
Value: context.DynamoNamespace,
},
}...)
return container, nil
......
......@@ -109,5 +109,14 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont
},
}...)
if context.WorkerHashSuffix != "" {
container.Env = append(container.Env, []corev1.EnvVar{
{
Name: commonconsts.DynamoNamespaceWorkerSuffixEnvVar,
Value: context.WorkerHashSuffix,
},
}...)
}
return container, nil
}
......@@ -84,6 +84,12 @@ func DetermineRestartState(dgd *v1alpha1.DynamoGraphDeployment, restartStatus *v
isNewRestart := restartStatus.ObservedID == "" ||
dgd.Spec.Restart.ID != restartStatus.ObservedID
if !isNewRestart && restartStatus.Phase == v1alpha1.RestartPhaseSuperseded {
// Superseded: don't push any new annotations. Existing annotations
// are preserved via the existingRestartAnnotations fallback path.
return nil
}
if !isNewRestart && restartStatus.Phase == v1alpha1.RestartPhaseCompleted {
return &RestartState{
Timestamp: specID,
......@@ -249,100 +255,149 @@ func ParseDynDeploymentConfig(ctx context.Context, jsonContent []byte) (DynDeplo
return config, err
}
// GenerateDynamoComponentsDeployments generates a map of DynamoComponentDeployments from a DynamoGraphConfig
func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphDeployment *v1alpha1.DynamoGraphDeployment, defaultIngressSpec *v1alpha1.IngressSpec, restartState *RestartState, existingRestartAnnotations map[string]string) (map[string]*v1alpha1.DynamoComponentDeployment, error) {
// RollingUpdateContext carries the per-service replica targets of a rolling
// update from the controller into DCD generation.
type RollingUpdateContext struct {
	// NewWorkerHash is the short (8 character) hash of the new worker spec.
	// It is used when naming the new worker DCDs.
	NewWorkerHash string

	// OldWorkerReplicas holds, per service name, how many replicas the old
	// workers should run. The controller uses it to patch old worker DCDs
	// directly. Calculated as: max(0, desiredReplicas - newReadyReplicas).
	OldWorkerReplicas map[string]int32

	// NewWorkerReplicas holds, per service name, how many replicas the new
	// workers should run. Calculated as:
	// min(desiredReplicas, newReadyReplicas + 1), so new workers scale up gradually.
	NewWorkerReplicas map[string]int32
}

// InProgress reports whether a rolling update is under way, which is defined
// as OldWorkerReplicas containing at least one entry. A nil or empty map means
// no rolling update is in progress.
func (r RollingUpdateContext) InProgress() bool {
	return len(r.OldWorkerReplicas) != 0
}
// GenerateDynamoComponentsDeployments generates a map of DynamoComponentDeployments from a DynamoGraphConfig.
// The map key is a unique identifier for each DCD (serviceName).
func GenerateDynamoComponentsDeployments(
ctx context.Context,
parentDGD *v1alpha1.DynamoGraphDeployment,
defaultIngressSpec *v1alpha1.IngressSpec,
restartState *RestartState,
existingRestartAnnotations map[string]string,
rollingUpdateCtx RollingUpdateContext,
) (map[string]*v1alpha1.DynamoComponentDeployment, error) {
deployments := make(map[string]*v1alpha1.DynamoComponentDeployment)
for componentName, component := range parentDynamoGraphDeployment.Spec.Services {
dynamoNamespace := GetDynamoNamespace(parentDynamoGraphDeployment, component)
deployment := &v1alpha1.DynamoComponentDeployment{}
deployment.Spec.DynamoComponentDeploymentSharedSpec = *component
deployment.Name = GetDynamoComponentName(parentDynamoGraphDeployment, componentName)
deployment.Spec.BackendFramework = parentDynamoGraphDeployment.Spec.BackendFramework
deployment.Namespace = parentDynamoGraphDeployment.Namespace
deployment.Spec.ServiceName = componentName
deployment.Spec.DynamoNamespace = &dynamoNamespace
labels := make(map[string]string)
// add the labels in the spec in order to label all sub-resources
deployment.Spec.Labels = labels
// and add the labels to the deployment itself
deployment.Labels = labels
labels[commonconsts.KubeLabelDynamoComponent] = componentName
labels[commonconsts.KubeLabelDynamoNamespace] = dynamoNamespace
labels[commonconsts.KubeLabelDynamoGraphDeploymentName] = parentDynamoGraphDeployment.Name
// Propagate annotations from parent deployment if present
if parentDynamoGraphDeployment.Annotations != nil {
if deployment.Spec.Annotations == nil {
deployment.Spec.Annotations = make(map[string]string)
}
if val, exists := parentDynamoGraphDeployment.Annotations[commonconsts.KubeAnnotationEnableMetrics]; exists {
deployment.Spec.Annotations[commonconsts.KubeAnnotationEnableMetrics] = val
}
if val, exists := parentDynamoGraphDeployment.Annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend]; exists {
deployment.Spec.Annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend] = val
}
// Propagate operator origin version for version-gated behavior in backends
if val, exists := parentDynamoGraphDeployment.Annotations[commonconsts.KubeAnnotationDynamoOperatorOriginVersion]; exists {
deployment.Spec.Annotations[commonconsts.KubeAnnotationDynamoOperatorOriginVersion] = val
}
}
// Apply restart annotation if this service should be restarted.
// For services not in the current restart order, preserve their existing annotation
// to avoid triggering unwanted rollouts when a new restart begins.
if restartState.ShouldAnnotateService(componentName) {
if deployment.Spec.Annotations == nil {
deployment.Spec.Annotations = make(map[string]string)
}
deployment.Spec.Annotations[commonconsts.RestartAnnotation] = restartState.Timestamp
} else if existingRestartAnnotations != nil {
if existingRestartAt, ok := existingRestartAnnotations[componentName]; ok && existingRestartAt != "" {
if deployment.Spec.Annotations == nil {
deployment.Spec.Annotations = make(map[string]string)
}
deployment.Spec.Annotations[commonconsts.RestartAnnotation] = existingRestartAt
}
// Generate DCDs for each service
for componentName, component := range parentDGD.Spec.Services {
dynamoNamespace := parentDGD.GetDynamoNamespaceForService(component)
dcd, err := generateSingleDCD(ctx, parentDGD, componentName, component, dynamoNamespace, defaultIngressSpec, restartState, existingRestartAnnotations, rollingUpdateCtx)
if err != nil {
return nil, err
}
deployments[componentName] = dcd
}
if component.ComponentType == commonconsts.ComponentTypePlanner {
// ensure that the extraPodSpec is not nil
if deployment.Spec.ExtraPodSpec == nil {
deployment.Spec.ExtraPodSpec = &v1alpha1.ExtraPodSpec{}
}
// ensure that the embedded PodSpec struct is not nil
if deployment.Spec.ExtraPodSpec.PodSpec == nil {
deployment.Spec.ExtraPodSpec.PodSpec = &corev1.PodSpec{}
}
// finally set the service account name
deployment.Spec.ExtraPodSpec.PodSpec.ServiceAccountName = commonconsts.PlannerServiceAccountName
return deployments, nil
}
// GetDynamoNamespace returns the Dynamo namespace for a service, computed by
// v1alpha1.ComputeDynamoNamespace from the service's GlobalDynamoNamespace
// setting and the Kubernetes namespace and name of the given object.
func GetDynamoNamespace(object metav1.Object, service *v1alpha1.DynamoComponentDeploymentSharedSpec) string {
	k8sNamespace, objectName := object.GetNamespace(), object.GetName()
	return v1alpha1.ComputeDynamoNamespace(service.GlobalDynamoNamespace, k8sNamespace, objectName)
}
// generateSingleDCD creates a DynamoComponentDeployment for a single service.
func generateSingleDCD(
ctx context.Context,
parentDGD *v1alpha1.DynamoGraphDeployment,
componentName string,
component *v1alpha1.DynamoComponentDeploymentSharedSpec,
dynamoNamespace string,
defaultIngressSpec *v1alpha1.IngressSpec,
restartState *RestartState,
existingRestartAnnotations map[string]string,
rollingUpdateCtx RollingUpdateContext,
) (*v1alpha1.DynamoComponentDeployment, error) {
deployment := &v1alpha1.DynamoComponentDeployment{}
deployment.Spec.DynamoComponentDeploymentSharedSpec = *component
deployment.Name = GetDCDResourceName(parentDGD, componentName, rollingUpdateCtx.NewWorkerHash)
deployment.Spec.BackendFramework = parentDGD.Spec.BackendFramework
deployment.Namespace = parentDGD.Namespace
deployment.Spec.ServiceName = componentName
deployment.Spec.DynamoNamespace = &dynamoNamespace
labels := make(map[string]string)
deployment.Spec.Labels = labels
deployment.Labels = labels
labels[commonconsts.KubeLabelDynamoComponent] = componentName
labels[commonconsts.KubeLabelDynamoNamespace] = dynamoNamespace
labels[commonconsts.KubeLabelDynamoGraphDeploymentName] = parentDGD.Name
// only label worker DCDs with their hash for cleanup during rolling updates
if IsWorkerComponent(component.ComponentType) {
labels[commonconsts.KubeLabelDynamoWorkerHash] = rollingUpdateCtx.NewWorkerHash
}
// Propagate metrics annotation from parent deployment if present
if parentDGD.Annotations != nil {
if deployment.Spec.Annotations == nil {
deployment.Spec.Annotations = make(map[string]string)
}
if deployment.IsFrontendComponent() && defaultIngressSpec != nil && deployment.Spec.Ingress == nil {
deployment.Spec.Ingress = defaultIngressSpec
if val, exists := parentDGD.Annotations[commonconsts.KubeAnnotationEnableMetrics]; exists {
deployment.Spec.Annotations[commonconsts.KubeAnnotationEnableMetrics] = val
}
// merge the envs from the parent deployment with the envs from the service
if len(parentDynamoGraphDeployment.Spec.Envs) > 0 {
deployment.Spec.Envs = MergeEnvs(parentDynamoGraphDeployment.Spec.Envs, deployment.Spec.Envs)
if val, exists := parentDGD.Annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend]; exists {
deployment.Spec.Annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend] = val
}
err := updateDynDeploymentConfig(deployment, commonconsts.DynamoServicePort)
if err != nil {
return nil, err
// Propagate operator origin version for version-gated behavior in backends
if val, exists := parentDGD.Annotations[commonconsts.KubeAnnotationDynamoOperatorOriginVersion]; exists {
deployment.Spec.Annotations[commonconsts.KubeAnnotationDynamoOperatorOriginVersion] = val
}
err = overrideWithDynDeploymentConfig(ctx, deployment)
if err != nil {
return nil, err
}
// Apply restart annotation if this service should be restarted.
if restartState.ShouldAnnotateService(componentName) {
if deployment.Spec.Annotations == nil {
deployment.Spec.Annotations = make(map[string]string)
}
// we only override the replicas if it is not set in the CRD.
// replicas, if set in the CRD must always be the source of truth.
if component.Replicas != nil {
deployment.Spec.Replicas = component.Replicas
deployment.Spec.Annotations[commonconsts.RestartAnnotation] = restartState.Timestamp
} else if existingRestartAnnotations != nil {
if existingRestartAt, ok := existingRestartAnnotations[componentName]; ok && existingRestartAt != "" {
if deployment.Spec.Annotations == nil {
deployment.Spec.Annotations = make(map[string]string)
}
deployment.Spec.Annotations[commonconsts.RestartAnnotation] = existingRestartAt
}
deployments[componentName] = deployment
}
return deployments, nil
}
func GetDynamoNamespace(object metav1.Object, service *v1alpha1.DynamoComponentDeploymentSharedSpec) string {
return v1alpha1.ComputeDynamoNamespace(service.GlobalDynamoNamespace, object.GetNamespace(), object.GetName())
if component.ComponentType == commonconsts.ComponentTypePlanner {
if deployment.Spec.ExtraPodSpec == nil {
deployment.Spec.ExtraPodSpec = &v1alpha1.ExtraPodSpec{}
}
if deployment.Spec.ExtraPodSpec.PodSpec == nil {
deployment.Spec.ExtraPodSpec.PodSpec = &corev1.PodSpec{}
}
deployment.Spec.ExtraPodSpec.PodSpec.ServiceAccountName = commonconsts.PlannerServiceAccountName
}
if deployment.IsFrontendComponent() && defaultIngressSpec != nil && deployment.Spec.Ingress == nil {
deployment.Spec.Ingress = defaultIngressSpec
}
if len(parentDGD.Spec.Envs) > 0 {
deployment.Spec.Envs = MergeEnvs(parentDGD.Spec.Envs, deployment.Spec.Envs)
}
if err := updateDynDeploymentConfig(deployment, commonconsts.DynamoServicePort); err != nil {
return nil, err
}
if err := overrideWithDynDeploymentConfig(ctx, deployment); err != nil {
return nil, err
}
// during a rolling update, the replica count is determined by the rollingUpdateCtx instead of the component spec
if rollingUpdateCtx.InProgress() && IsWorkerComponent(component.ComponentType) && rollingUpdateCtx.NewWorkerReplicas[componentName] != 0 {
deployment.Spec.Replicas = ptr.To(rollingUpdateCtx.NewWorkerReplicas[componentName])
} else if component.Replicas != nil {
deployment.Spec.Replicas = component.Replicas
}
return deployment, nil
}
// updateDynDeploymentConfig updates the runtime config object for the given dynamoDeploymentComponent
......@@ -449,8 +504,15 @@ func MergeEnvs(common, specific []corev1.EnvVar) []corev1.EnvVar {
return merged
}
func GetDynamoComponentName(dynamoDeployment *v1alpha1.DynamoGraphDeployment, component string) string {
return fmt.Sprintf("%s-%s", dynamoDeployment.Name, strings.ToLower(component))
// GetDCDResourceName returns the Kubernetes resource name for a
// DynamoComponentDeployment: "{dgd-name}-{lowercase(serviceName)}", followed by
// "-{workerSuffix}" when the service is a worker component and workerSuffix is
// non-empty.
//
// For non-DCD resources (i.e. Ingress or VirtualService), pass the empty string
// as workerSuffix. For services that are not workers, or that are not present
// in dgd.Spec.Services, workerSuffix is ignored.
func GetDCDResourceName(dgd *v1alpha1.DynamoGraphDeployment, serviceName string, workerSuffix string) string {
	name := fmt.Sprintf("%s-%s", dgd.Name, strings.ToLower(serviceName))
	if workerSuffix == "" {
		return name
	}
	// A missing key yields a nil spec, which is handled like an explicit nil entry.
	svc := dgd.Spec.Services[serviceName]
	if svc == nil || !IsWorkerComponent(svc.ComponentType) {
		return name
	}
	return name + "-" + workerSuffix
}
type SecretsRetriever interface {
......@@ -555,7 +617,7 @@ func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.Dy
return nil, fmt.Errorf("expected DynamoComponentDeployment %s to have a dynamoNamespace", componentName)
}
// DNS-safe service resource name: "{dgd-name}-{lowercase(componentName)}"
kubeServiceName := GetDynamoComponentName(dynamoDeployment, componentName)
kubeServiceName := GetDCDResourceName(dynamoDeployment, componentName, "")
var servicePort corev1.ServicePort
switch component.ComponentType {
......@@ -845,8 +907,8 @@ func MultinodeDeployerFactory(multinodeDeploymentType commonconsts.MultinodeDepl
}
}
// isWorkerComponent checks if a component is a worker that needs backend framework detection
func isWorkerComponent(componentType string) bool {
// IsWorkerComponent checks if a component is a worker that needs backend framework detection
func IsWorkerComponent(componentType string) bool {
return componentType == commonconsts.ComponentTypeWorker ||
componentType == commonconsts.ComponentTypePrefill ||
componentType == commonconsts.ComponentTypeDecode
......@@ -1129,6 +1191,11 @@ func setMetricsLabels(labels map[string]string, dynamoGraphDeployment *v1alpha1.
func generateComponentContext(component *v1alpha1.DynamoComponentDeploymentSharedSpec, parentGraphDeploymentName string, namespace string, numberOfNodes int32, discoveryBackend string) ComponentContext {
dynamoNamespace := v1alpha1.ComputeDynamoNamespace(component.GlobalDynamoNamespace, namespace, parentGraphDeploymentName)
var workerHashSuffix string
if IsWorkerComponent(component.ComponentType) && component.Labels[commonconsts.KubeLabelDynamoWorkerHash] != "" {
workerHashSuffix = component.Labels[commonconsts.KubeLabelDynamoWorkerHash]
}
componentContext := ComponentContext{
numberOfNodes: numberOfNodes,
ComponentType: component.ComponentType,
......@@ -1137,6 +1204,7 @@ func generateComponentContext(component *v1alpha1.DynamoComponentDeploymentShare
DiscoveryBackend: discoveryBackend,
DynamoNamespace: dynamoNamespace,
EPPConfig: component.EPPConfig,
WorkerHashSuffix: workerHashSuffix,
}
return componentContext
}
......@@ -1316,7 +1384,7 @@ func GenerateGrovePodCliqueSet(
func generateLabels(component *v1alpha1.DynamoComponentDeploymentSharedSpec, dynamoDeployment *v1alpha1.DynamoGraphDeployment, componentName string) (map[string]string, error) {
labels := make(map[string]string)
labels[commonconsts.KubeLabelDynamoSelector] = GetDynamoComponentName(dynamoDeployment, componentName)
labels[commonconsts.KubeLabelDynamoSelector] = GetDCDResourceName(dynamoDeployment, componentName, "")
labels[commonconsts.KubeLabelDynamoGraphDeploymentName] = dynamoDeployment.Name
labels[commonconsts.KubeLabelDynamoComponent] = componentName
if component.DynamoNamespace != nil {
......@@ -1409,7 +1477,7 @@ func determineBackendFramework(
explicitBackendFramework string,
) (BackendFramework, error) {
// Check if this is a worker component - if not, use noop backend
if !isWorkerComponent(componentType) {
if !IsWorkerComponent(componentType) {
return BackendFrameworkNoop, nil
}
......
......@@ -520,7 +520,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
BackendFramework: string(BackendFrameworkSGLang),
Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
"service1": {
DynamoNamespace: &[]string{"default-test-dynamographdeployment"}[0],
DynamoNamespace: &[]string{"default-test-dynamographdeployment-44136fa3"}[0],
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &v1alpha1.Resources{
......@@ -539,7 +539,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
},
},
"service2": {
DynamoNamespace: &[]string{"default-test-dynamographdeployment"}[0],
DynamoNamespace: &[]string{"default-test-dynamographdeployment-44136fa3"}[0],
Replicas: &[]int32{3}[0],
Resources: &v1alpha1.Resources{
Requests: &v1alpha1.ResourceItem{
......@@ -663,7 +663,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
BackendFramework: string(BackendFrameworkSGLang),
Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
"service1": {
DynamoNamespace: &[]string{"default-test-dynamographdeployment"}[0],
DynamoNamespace: &[]string{"default-test-dynamographdeployment-44136fa3"}[0],
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &v1alpha1.Resources{
......@@ -722,7 +722,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := GenerateDynamoComponentsDeployments(context.Background(), tt.args.parentDynamoGraphDeployment, tt.args.ingressSpec, nil, nil)
got, err := GenerateDynamoComponentsDeployments(context.Background(), tt.args.parentDynamoGraphDeployment, tt.args.ingressSpec, nil, nil, RollingUpdateContext{})
if (err != nil) != tt.wantErr {
t.Errorf("GenerateDynamoComponentsDeployments() error = %v, wantErr %v", err, tt.wantErr)
return
......@@ -756,7 +756,7 @@ func Test_GetDynamoComponentDeploymentsGlobalNamespace(t *testing.T) {
},
}
got, err := GenerateDynamoComponentsDeployments(context.Background(), dgd, nil, nil, nil)
got, err := GenerateDynamoComponentsDeployments(context.Background(), dgd, nil, nil, nil, RollingUpdateContext{})
if !assert.NoError(t, err) {
return
}
......@@ -823,7 +823,7 @@ func TestGenerateComponentContext(t *testing.T) {
namespace: "djangoz",
numberOfNodes: 1,
discoveryBackend: "kubernetes",
expectedDynamoNamespace: "djangoz-vllm-disagg", // Should be k8s-namespace + DGD name
expectedDynamoNamespace: "djangoz-vllm-disagg",
expectedComponentType: commonconsts.ComponentTypeFrontend,
expectedParentDGDName: "vllm-disagg",
expectedParentDGDNamespace: "djangoz",
......@@ -840,7 +840,7 @@ func TestGenerateComponentContext(t *testing.T) {
namespace: "production",
numberOfNodes: 2,
discoveryBackend: "etcd",
expectedDynamoNamespace: commonconsts.GlobalDynamoNamespace, // "dynamo"
expectedDynamoNamespace: commonconsts.GlobalDynamoNamespace,
expectedComponentType: commonconsts.ComponentTypeWorker,
expectedParentDGDName: "shared-frontend",
expectedParentDGDNamespace: "production",
......@@ -849,7 +849,7 @@ func TestGenerateComponentContext(t *testing.T) {
name: "nil dynamoNamespace field still computes correctly",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypePlanner,
DynamoNamespace: nil, // Not set at all
DynamoNamespace: nil,
},
parentGraphDeploymentName: "test-dgd",
namespace: "default",
......@@ -1216,15 +1216,6 @@ func Test_mergeEnvs(t *testing.T) {
}
}
func sortEnvVars(envs []corev1.EnvVar) []corev1.EnvVar {
sorted := make([]corev1.EnvVar, len(envs))
copy(sorted, envs)
sort.Slice(sorted, func(i, j int) bool {
return sorted[i].Name < sorted[j].Name
})
return sorted
}
func TestGenerateGrovePodCliqueSet(t *testing.T) {
type args struct {
ctx context.Context
......@@ -1544,6 +1535,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoNamespaceEnvVar,
Value: "test-namespace-test-dynamo-graph-deployment",
},
{
Name: commonconsts.DynamoNamespacePrefixEnvVar,
Value: "test-namespace-test-dynamo-graph-deployment",
},
{
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeFrontend,
......@@ -2498,6 +2493,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoNamespaceEnvVar,
Value: "test-namespace-test-dynamo-graph-deployment",
},
{
Name: commonconsts.DynamoNamespacePrefixEnvVar,
Value: "test-namespace-test-dynamo-graph-deployment",
},
{
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeFrontend,
......@@ -3476,6 +3475,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoNamespaceEnvVar,
Value: "test-namespace-test-dynamo-graph-deployment",
},
{
Name: commonconsts.DynamoNamespacePrefixEnvVar,
Value: "test-namespace-test-dynamo-graph-deployment",
},
{
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeFrontend,
......@@ -3764,6 +3767,15 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}
}
// sortEnvVars returns a copy of envs ordered by ascending variable name.
// The caller's slice is not modified.
func sortEnvVars(envs []corev1.EnvVar) []corev1.EnvVar {
	byName := append(make([]corev1.EnvVar, 0, len(envs)), envs...)
	nameLess := func(a, b int) bool {
		return byName[a].Name < byName[b].Name
	}
	sort.Slice(byName, nameLess)
	return byName
}
func Test_GeneratePodCliqueSetGlobalDynamoNamespace(t *testing.T) {
dynamoDeployment := &v1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
......@@ -5303,12 +5315,13 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
expectedPodSpec *corev1.PodSpec
}{
{
name: "Planner component should have planner service account",
name: "Worker component with DynamoNamespace set",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
Envs: []corev1.EnvVar{
{Name: "ANOTHER_COMPONENTENV", Value: "true"},
},
ComponentType: commonconsts.ComponentTypeWorker,
ComponentType: commonconsts.ComponentTypeWorker,
DynamoNamespace: ptr.To("default-test-deployment"), // Namespace set by caller
ExtraPodSpec: &v1alpha1.ExtraPodSpec{
MainContainer: &corev1.Container{
Command: []string{"python3"},
......@@ -6388,6 +6401,25 @@ func TestDetermineGroveRestartState(t *testing.T) {
wantSvcs: []string{"Frontend", "Worker"},
wantTimestamp: ptr.To(restartID),
},
{
name: "superseded restart returns nil - preserves existing annotations via fallback",
dgd: &v1alpha1.DynamoGraphDeployment{
Spec: v1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
"Frontend": {},
"Worker": {},
},
Restart: &v1alpha1.Restart{
ID: restartID,
},
},
},
restartStatus: &v1alpha1.RestartStatus{
ObservedID: restartID,
Phase: v1alpha1.RestartPhaseSuperseded,
},
wantNil: true,
},
}
for _, tt := range tests {
......@@ -6738,3 +6770,172 @@ func TestGenerateGrovePodCliqueSet_RestartAnnotations(t *testing.T) {
})
}
}
// TestIsWorkerComponent checks that IsWorkerComponent accepts the worker,
// prefill and decode component types and rejects everything else, including
// unknown and empty type strings.
func TestIsWorkerComponent(t *testing.T) {
	cases := []struct {
		componentType string
		isWorker      bool
	}{
		{commonconsts.ComponentTypeWorker, true},
		{commonconsts.ComponentTypePrefill, true},
		{commonconsts.ComponentTypeDecode, true},
		{commonconsts.ComponentTypeFrontend, false},
		{commonconsts.ComponentTypePlanner, false},
		{commonconsts.ComponentTypeEPP, false},
		{"custom", false},
		{"", false},
	}
	for _, tc := range cases {
		got := IsWorkerComponent(tc.componentType)
		if tc.isWorker {
			assert.True(t, got, "%s should be a worker", tc.componentType)
			continue
		}
		assert.False(t, got, "%s should not be a worker", tc.componentType)
	}
}
func TestRollingUpdateContext_InProgress(t *testing.T) {
assert.False(t, RollingUpdateContext{}.InProgress())
assert.False(t, RollingUpdateContext{NewWorkerHash: "abc"}.InProgress())
assert.True(t, RollingUpdateContext{OldWorkerReplicas: map[string]int32{"w": 1}}.InProgress())
}
// TestGetDCDResourceName checks the naming rules of GetDCDResourceName: worker
// services (prefill, decode, worker) get the hash suffix appended, non-worker
// services never do, and an empty suffix leaves the base name unchanged.
func TestGetDCDResourceName(t *testing.T) {
	services := map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
		"prefill":  {ComponentType: commonconsts.ComponentTypePrefill},
		"decode":   {ComponentType: commonconsts.ComponentTypeDecode},
		"worker":   {ComponentType: commonconsts.ComponentTypeWorker},
		"frontend": {ComponentType: commonconsts.ComponentTypeFrontend},
	}
	dgd := &v1alpha1.DynamoGraphDeployment{
		ObjectMeta: metav1.ObjectMeta{Name: "my-dgd"},
		Spec:       v1alpha1.DynamoGraphDeploymentSpec{Services: services},
	}

	cases := []struct {
		service string
		suffix  string
		want    string
	}{
		// Workers get the hash suffix.
		{"prefill", "abc12345", "my-dgd-prefill-abc12345"},
		{"decode", "abc12345", "my-dgd-decode-abc12345"},
		{"worker", "abc12345", "my-dgd-worker-abc12345"},
		// Non-workers never get the hash suffix.
		{"frontend", "abc12345", "my-dgd-frontend"},
		// With an empty hash, workers don't get a suffix either.
		{"prefill", "", "my-dgd-prefill"},
	}
	for _, tc := range cases {
		assert.Equal(t, tc.want, GetDCDResourceName(dgd, tc.service, tc.suffix))
	}
}
// TestGenerateSingleDCD_RollingUpdateContext checks how a populated
// RollingUpdateContext shapes the DCDs returned by
// GenerateDynamoComponentsDeployments: the worker service gets the hash suffix
// in its name, the worker-hash label and the replica override, while the
// frontend service keeps its plain name, has no hash label and keeps its
// original replica count.
func TestGenerateSingleDCD_RollingUpdateContext(t *testing.T) {
	dgd := &v1alpha1.DynamoGraphDeployment{
		ObjectMeta: metav1.ObjectMeta{Name: "my-dgd", Namespace: "ns"},
		Spec: v1alpha1.DynamoGraphDeploymentSpec{
			Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
				"prefill":  {ComponentType: commonconsts.ComponentTypePrefill, Replicas: ptr.To(int32(4))},
				"frontend": {ComponentType: commonconsts.ComponentTypeFrontend, Replicas: ptr.To(int32(1))},
			},
		},
	}
	ctx := context.Background()
	ruCtx := RollingUpdateContext{
		NewWorkerHash:     "aabb1122",
		OldWorkerReplicas: map[string]int32{"prefill": 2},
		NewWorkerReplicas: map[string]int32{"prefill": 2},
	}

	dcds, err := GenerateDynamoComponentsDeployments(ctx, dgd, nil, &RestartState{}, nil, ruCtx)
	// assert.NoError only records the failure and keeps going; bail out here so
	// the lookups below cannot dereference a nil result and panic.
	if !assert.NoError(t, err) {
		return
	}

	// Worker DCD: hash suffix in name, hash label, replica override.
	if prefillDCD := dcds["prefill"]; assert.NotNil(t, prefillDCD, "prefill DCD should be generated") {
		assert.Equal(t, "my-dgd-prefill-aabb1122", prefillDCD.Name)
		assert.Equal(t, "aabb1122", prefillDCD.Labels[commonconsts.KubeLabelDynamoWorkerHash])
		if assert.NotNil(t, prefillDCD.Spec.Replicas, "prefill replicas should be set") {
			assert.Equal(t, int32(2), *prefillDCD.Spec.Replicas)
		}
	}

	// Non-worker DCD: no hash suffix, no hash label, original replicas.
	if frontendDCD := dcds["frontend"]; assert.NotNil(t, frontendDCD, "frontend DCD should be generated") {
		assert.Equal(t, "my-dgd-frontend", frontendDCD.Name)
		assert.Empty(t, frontendDCD.Labels[commonconsts.KubeLabelDynamoWorkerHash])
		if assert.NotNil(t, frontendDCD.Spec.Replicas, "frontend replicas should be set") {
			assert.Equal(t, int32(1), *frontendDCD.Spec.Replicas)
		}
	}
}
// TestGenerateSingleDCD_NoRollingUpdate checks that with an empty
// RollingUpdateContext a worker DCD keeps its plain name (no hash suffix), has
// an empty worker-hash label value and uses the replica count from the service
// spec.
func TestGenerateSingleDCD_NoRollingUpdate(t *testing.T) {
	dgd := &v1alpha1.DynamoGraphDeployment{
		ObjectMeta: metav1.ObjectMeta{Name: "my-dgd", Namespace: "ns"},
		Spec: v1alpha1.DynamoGraphDeploymentSpec{
			Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
				"worker": {ComponentType: commonconsts.ComponentTypeWorker, Replicas: ptr.To(int32(3))},
			},
		},
	}

	dcds, err := GenerateDynamoComponentsDeployments(context.Background(), dgd, nil, &RestartState{}, nil, RollingUpdateContext{})
	// assert.NoError only records the failure and keeps going; bail out here so
	// the lookup below cannot dereference a nil result and panic.
	if !assert.NoError(t, err) {
		return
	}

	dcd := dcds["worker"]
	if !assert.NotNil(t, dcd, "worker DCD should be generated") {
		return
	}
	assert.Equal(t, "my-dgd-worker", dcd.Name)
	assert.Empty(t, dcd.Labels[commonconsts.KubeLabelDynamoWorkerHash])
	if assert.NotNil(t, dcd.Spec.Replicas, "worker replicas should be set") {
		assert.Equal(t, int32(3), *dcd.Spec.Replicas)
	}
}
func TestGenerateComponentContext_WorkerHashSuffix(t *testing.T) {
// Worker with hash label gets WorkerHashSuffix
component := &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
Labels: map[string]string{commonconsts.KubeLabelDynamoWorkerHash: "abc123"},
}
compCtx := generateComponentContext(component, "dgd", "ns", 1, "kubernetes")
assert.Equal(t, "abc123", compCtx.WorkerHashSuffix)
// Worker without hash label
component2 := &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
}
compCtx2 := generateComponentContext(component2, "dgd", "ns", 1, "kubernetes")
assert.Empty(t, compCtx2.WorkerHashSuffix)
// Frontend never gets WorkerHashSuffix, even with the label
component3 := &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeFrontend,
Labels: map[string]string{commonconsts.KubeLabelDynamoWorkerHash: "abc123"},
}
compCtx3 := generateComponentContext(component3, "dgd", "ns", 1, "kubernetes")
assert.Empty(t, compCtx3.WorkerHashSuffix)
}
// TestWorkerDefaults_WorkerHashSuffixEnvVar checks that the worker base
// container carries the worker-suffix env var with the context's
// WorkerHashSuffix as its value, and that the variable is absent when the
// suffix is empty.
func TestWorkerDefaults_WorkerHashSuffixEnvVar(t *testing.T) {
	w := NewWorkerDefaults()

	// With suffix: the env var must be present and hold the suffix.
	withSuffix, err := w.GetBaseContainer(ComponentContext{
		DynamoNamespace:  "ns-dgd",
		ComponentType:    commonconsts.ComponentTypeWorker,
		WorkerHashSuffix: "abc123",
	})
	assert.NoError(t, err)
	suffixSeen := false
	for i := range withSuffix.Env {
		if withSuffix.Env[i].Name != commonconsts.DynamoNamespaceWorkerSuffixEnvVar {
			continue
		}
		assert.Equal(t, "abc123", withSuffix.Env[i].Value)
		suffixSeen = true
	}
	assert.True(t, suffixSeen, "DYN_NAMESPACE_WORKER_SUFFIX should be set")

	// Without suffix: the env var must NOT be present.
	withoutSuffix, err := w.GetBaseContainer(ComponentContext{
		DynamoNamespace: "ns-dgd",
		ComponentType:   commonconsts.ComponentTypeWorker,
	})
	assert.NoError(t, err)
	for i := range withoutSuffix.Env {
		assert.NotEqual(t, commonconsts.DynamoNamespaceWorkerSuffixEnvVar, withoutSuffix.Env[i].Name,
			"DYN_NAMESPACE_WORKER_SUFFIX should not be set when suffix is empty")
	}
}
// TestFrontendDefaults_NamespacePrefixEnvVar verifies that the frontend base
// container carries the namespace-prefix env var, set to the context's
// DynamoNamespace value used here.
func TestFrontendDefaults_NamespacePrefixEnvVar(t *testing.T) {
	defaults := NewFrontendDefaults()
	frontendCtx := ComponentContext{
		DynamoNamespace: "myns-mydgd",
		ComponentType:   commonconsts.ComponentTypeFrontend,
	}

	base, err := defaults.GetBaseContainer(frontendCtx)
	assert.NoError(t, err)

	prefixSeen := false
	for i := range base.Env {
		entry := base.Env[i]
		if entry.Name != commonconsts.DynamoNamespacePrefixEnvVar {
			continue
		}
		prefixSeen = true
		assert.Equal(t, "myns-mydgd", entry.Value)
	}
	assert.True(t, prefixSeen, "DYN_NAMESPACE_PREFIX should be set on frontend")
}
......@@ -126,6 +126,7 @@ func CheckPodCliqueReady(ctx context.Context, client client.Client, resourceName
serviceStatus := v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: resourceName,
ComponentNames: []string{resourceName},
Replicas: podClique.Status.Replicas,
UpdatedReplicas: podClique.Status.UpdatedReplicas,
ReadyReplicas: &readyReplicas,
......@@ -199,6 +200,7 @@ func CheckPCSGReady(ctx context.Context, client client.Client, resourceName, nam
serviceStatus := v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: resourceName,
ComponentNames: []string{resourceName},
Replicas: pcsg.Status.Replicas,
UpdatedReplicas: pcsg.Status.UpdatedReplicas,
AvailableReplicas: &availableReplicas,
......
......@@ -363,6 +363,7 @@ func TestCheckPodCliqueReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "ready-podclique",
ComponentNames: []string{"ready-podclique"},
Replicas: 3,
UpdatedReplicas: 3,
ReadyReplicas: ptr.To(int32(3)),
......@@ -392,6 +393,7 @@ func TestCheckPodCliqueReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "zero-replicas-podclique",
ComponentNames: []string{"zero-replicas-podclique"},
Replicas: 0,
UpdatedReplicas: 0,
ReadyReplicas: ptr.To(int32(0)),
......@@ -422,6 +424,7 @@ func TestCheckPodCliqueReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "stale-podclique",
ComponentNames: []string{"stale-podclique"},
Replicas: 2,
UpdatedReplicas: 2,
ReadyReplicas: ptr.To(int32(2)),
......@@ -452,6 +455,7 @@ func TestCheckPodCliqueReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "not-ready-podclique",
ComponentNames: []string{"not-ready-podclique"},
Replicas: 3,
UpdatedReplicas: 3,
ReadyReplicas: ptr.To(int32(1)),
......@@ -482,6 +486,7 @@ func TestCheckPodCliqueReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "not-updated-podclique",
ComponentNames: []string{"not-updated-podclique"},
Replicas: 3,
UpdatedReplicas: 2,
ReadyReplicas: ptr.To(int32(3)),
......@@ -512,6 +517,7 @@ func TestCheckPodCliqueReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "rolling-update-podclique",
ComponentNames: []string{"rolling-update-podclique"},
Replicas: 4,
UpdatedReplicas: 3,
ReadyReplicas: ptr.To(int32(3)),
......@@ -542,6 +548,7 @@ func TestCheckPodCliqueReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "nil-observed-gen-podclique",
ComponentNames: []string{"nil-observed-gen-podclique"},
Replicas: 2,
UpdatedReplicas: 2,
ReadyReplicas: ptr.To(int32(2)),
......@@ -628,6 +635,7 @@ func TestCheckPCSGReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "ready-pcsg",
ComponentNames: []string{"ready-pcsg"},
Replicas: 3,
UpdatedReplicas: 3,
AvailableReplicas: ptr.To(int32(3)),
......@@ -657,6 +665,7 @@ func TestCheckPCSGReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "zero-replicas-pcsg",
ComponentNames: []string{"zero-replicas-pcsg"},
Replicas: 0,
UpdatedReplicas: 0,
AvailableReplicas: ptr.To(int32(0)),
......@@ -687,6 +696,7 @@ func TestCheckPCSGReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "stale-pcsg",
ComponentNames: []string{"stale-pcsg"},
Replicas: 2,
UpdatedReplicas: 2,
AvailableReplicas: ptr.To(int32(2)),
......@@ -717,6 +727,7 @@ func TestCheckPCSGReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "not-ready-pcsg",
ComponentNames: []string{"not-ready-pcsg"},
Replicas: 3,
UpdatedReplicas: 3,
AvailableReplicas: ptr.To(int32(1)),
......@@ -747,6 +758,7 @@ func TestCheckPCSGReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "not-updated-pcsg",
ComponentNames: []string{"not-updated-pcsg"},
Replicas: 3,
UpdatedReplicas: 2,
AvailableReplicas: ptr.To(int32(3)),
......@@ -777,6 +789,7 @@ func TestCheckPCSGReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "rolling-update-pcsg",
ComponentNames: []string{"rolling-update-pcsg"},
Replicas: 4,
UpdatedReplicas: 3,
AvailableReplicas: ptr.To(int32(3)),
......@@ -807,6 +820,7 @@ func TestCheckPCSGReady(t *testing.T) {
wantServiceStatus: v1alpha1.ServiceReplicaStatus{
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "nil-observed-gen-pcsg",
ComponentNames: []string{"nil-observed-gen-pcsg"},
Replicas: 2,
UpdatedReplicas: 2,
AvailableReplicas: ptr.To(int32(2)),
......@@ -895,6 +909,7 @@ func Test_GetComponentReadinessAndServiceReplicaStatuses(t *testing.T) {
"frontend": {
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "test-dgd-0-frontend",
ComponentNames: []string{"test-dgd-0-frontend"},
Replicas: 2,
UpdatedReplicas: 2,
ReadyReplicas: ptr.To(int32(1)),
......@@ -963,6 +978,7 @@ func Test_GetComponentReadinessAndServiceReplicaStatuses(t *testing.T) {
"decode": {
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "test-dgd-0-decode",
ComponentNames: []string{"test-dgd-0-decode"},
Replicas: 2,
UpdatedReplicas: 2,
AvailableReplicas: ptr.To(int32(2)),
......@@ -970,6 +986,7 @@ func Test_GetComponentReadinessAndServiceReplicaStatuses(t *testing.T) {
"prefill": {
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "test-dgd-0-prefill",
ComponentNames: []string{"test-dgd-0-prefill"},
Replicas: 3,
UpdatedReplicas: 3,
AvailableReplicas: ptr.To(int32(3)),
......@@ -1014,6 +1031,7 @@ func Test_GetComponentReadinessAndServiceReplicaStatuses(t *testing.T) {
"worker": {
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "test-dgd-0-worker",
ComponentNames: []string{"test-dgd-0-worker"},
Replicas: 2,
UpdatedReplicas: 2,
AvailableReplicas: ptr.To(int32(1)),
......@@ -1103,6 +1121,7 @@ func Test_GetComponentReadinessAndServiceReplicaStatuses(t *testing.T) {
"frontend": {
ComponentKind: v1alpha1.ComponentKindPodClique,
ComponentName: "test-dgd-0-frontend",
ComponentNames: []string{"test-dgd-0-frontend"},
Replicas: 1,
UpdatedReplicas: 1,
ReadyReplicas: ptr.To(int32(1)),
......@@ -1110,6 +1129,7 @@ func Test_GetComponentReadinessAndServiceReplicaStatuses(t *testing.T) {
"decode": {
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "test-dgd-0-decode",
ComponentNames: []string{"test-dgd-0-decode"},
Replicas: 2,
UpdatedReplicas: 2,
AvailableReplicas: ptr.To(int32(1)),
......@@ -1117,6 +1137,7 @@ func Test_GetComponentReadinessAndServiceReplicaStatuses(t *testing.T) {
"prefill": {
ComponentKind: v1alpha1.ComponentKindPodCliqueScalingGroup,
ComponentName: "test-dgd-0-prefill",
ComponentNames: []string{"test-dgd-0-prefill"},
Replicas: 2,
UpdatedReplicas: 2,
AvailableReplicas: ptr.To(int32(2)),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment