Unverified Commit bce060d2 authored by Schwinn Saereesitthipitak's avatar Schwinn Saereesitthipitak Committed by GitHub
Browse files

fix(operator): drop Recreate override on restore-target Deployments (#8434)


Signed-off-by: default avatarSchwinn Saereesitthipitak <schwinns@nvidia.com>
parent 9321c2e5
...@@ -43,7 +43,6 @@ import ( ...@@ -43,7 +43,6 @@ import (
"github.com/ai-dynamo/dynamo/deploy/operator/internal/dra" "github.com/ai-dynamo/dynamo/deploy/operator/internal/dra"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo" "github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability" "github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1" networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
k8serrors "k8s.io/apimachinery/pkg/api/errors" k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/meta"
...@@ -978,17 +977,6 @@ func (r *DynamoComponentDeploymentReconciler) generateDeployment(ctx context.Con ...@@ -978,17 +977,6 @@ func (r *DynamoComponentDeploymentReconciler) generateDeployment(ctx context.Con
} }
} }
// Checkpoint-restore pods must avoid overlap with prior replicas.
// Enforce Recreate whenever the rendered template is a restore target so
// the old pod is terminated before the restore placeholder is started.
if podTemplateSpec != nil &&
podTemplateSpec.Labels != nil &&
podTemplateSpec.Labels[snapshotprotocol.RestoreTargetLabel] == commonconsts.KubeLabelValueTrue {
strategy = appsv1.DeploymentStrategy{
Type: appsv1.RecreateDeploymentStrategyType,
}
}
kubeDeployment.Spec = appsv1.DeploymentSpec{ kubeDeployment.Spec = appsv1.DeploymentSpec{
Replicas: opt.dynamoComponentDeployment.Spec.Replicas, Replicas: opt.dynamoComponentDeployment.Spec.Replicas,
Selector: &metav1.LabelSelector{ Selector: &metav1.LabelSelector{
......
...@@ -1681,7 +1681,11 @@ func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy( ...@@ -1681,7 +1681,11 @@ func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy(
} }
} }
t.Run("ready checkpoint forces Recreate strategy", func(t *testing.T) { t.Run("ready checkpoint keeps RollingUpdate strategy", func(t *testing.T) {
// Restore-target pods do not need a special Recreate override. The
// default RollingUpdate strategy works for failure-replacement and
// scale-up; users who specifically want Recreate on tight-GPU nodes
// can still opt in via the nvidia.com/deployment-strategy annotation.
identity := v1alpha1.DynamoCheckpointIdentity{Model: "test-model", BackendFramework: "vllm"} identity := v1alpha1.DynamoCheckpointIdentity{Model: "test-model", BackendFramework: "vllm"}
checkpointName, err := checkpoint.ComputeIdentityHash(identity) checkpointName, err := checkpoint.ComputeIdentityHash(identity)
if err != nil { if err != nil {
...@@ -1709,8 +1713,8 @@ func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy( ...@@ -1709,8 +1713,8 @@ func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy(
if toDelete { if toDelete {
t.Fatalf("expected deployment to be retained") t.Fatalf("expected deployment to be retained")
} }
if deploy.Spec.Strategy.Type != appsv1.RecreateDeploymentStrategyType { if deploy.Spec.Strategy.Type != appsv1.RollingUpdateDeploymentStrategyType {
t.Fatalf("expected Recreate strategy, got %s", deploy.Spec.Strategy.Type) t.Fatalf("expected RollingUpdate strategy, got %s", deploy.Spec.Strategy.Type)
} }
}) })
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment