Unverified Commit ee9d67e4 authored by Thomas Montfort's avatar Thomas Montfort Committed by GitHub
Browse files

fix(operator): prevent DGD deletion thrashing under foreground cascading deletion (#8207)


Signed-off-by: default avatarTyler Montfort <tmontfort@nvidia.com>
parent cabdaebd
...@@ -60,8 +60,11 @@ func HandleFinalizer[T client.Object](ctx context.Context, obj T, writer client. ...@@ -60,8 +60,11 @@ func HandleFinalizer[T client.Object](ctx context.Context, obj T, writer client.
return false, err return false, err
} }
logger.Info("Finalizer removed from object", "resourceVersion", obj.GetResourceVersion()) logger.Info("Finalizer removed from object", "resourceVersion", obj.GetResourceVersion())
return true, nil
} }
// Object is being deleted — signal the caller to skip reconciliation
// regardless of whether we just removed the finalizer or it was already
// gone (e.g., removed by a previous reconcile)
return true, nil
} }
return false, nil return false, nil
} }
...@@ -147,6 +147,25 @@ func TestHandleFinalizer(t *testing.T) { ...@@ -147,6 +147,25 @@ func TestHandleFinalizer(t *testing.T) {
want: false, want: false,
wantErr: true, wantErr: true,
}, },
{
name: "deleted object without finalizer - should return true to prevent reconciliation",
args: args{
ctx: context.Background(),
obj: &ObjectMock{
GetDeletionTimestampFunc: func() *metav1.Time {
return &metav1.Time{Time: time.Now()}
},
GetFinalizersFunc: func() []string {
return []string{}
},
GetResourceVersionFunc: func() string {
return "1"
},
},
},
want: true,
wantErr: false,
},
{ {
name: "non deleted object without finalizer - nominal case", name: "non deleted object without finalizer - nominal case",
args: args{ args: args{
......
...@@ -12,11 +12,12 @@ import ( ...@@ -12,11 +12,12 @@ import (
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
gmsruntime "github.com/ai-dynamo/dynamo/deploy/operator/internal/gms"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/intstr"
) )
var failoverLockFile = filepath.Join(gmsSharedMountPath, "failover.lock") var failoverLockFile = filepath.Join(gmsruntime.SharedMountPath, "failover.lock")
const ( const (
failoverEngineCount = 2 failoverEngineCount = 2
......
...@@ -12,6 +12,7 @@ import ( ...@@ -12,6 +12,7 @@ import (
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
gmsruntime "github.com/ai-dynamo/dynamo/deploy/operator/internal/gms"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
...@@ -35,7 +36,7 @@ func failoverPodSpec() corev1.PodSpec { ...@@ -35,7 +36,7 @@ func failoverPodSpec() corev1.PodSpec {
{Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "true"}, {Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "true"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "TMPDIR", Value: gmsSharedMountPath}, {Name: "TMPDIR", Value: gmsruntime.SharedMountPath},
}, },
Ports: []corev1.ContainerPort{ Ports: []corev1.ContainerPort{
{Name: "system", ContainerPort: 9090, Protocol: corev1.ProtocolTCP}, {Name: "system", ContainerPort: 9090, Protocol: corev1.ProtocolTCP},
...@@ -56,10 +57,10 @@ func failoverPodSpec() corev1.PodSpec { ...@@ -56,10 +57,10 @@ func failoverPodSpec() corev1.PodSpec {
}, },
}, },
Resources: corev1.ResourceRequirements{ Resources: corev1.ResourceRequirements{
Claims: []corev1.ResourceClaim{{Name: gmsDRAClaimName}}, Claims: []corev1.ResourceClaim{{Name: gmsruntime.DRAClaimName}},
}, },
VolumeMounts: []corev1.VolumeMount{ VolumeMounts: []corev1.VolumeMount{
{Name: gmsSharedVolumeName, MountPath: gmsSharedMountPath}, {Name: gmsruntime.SharedVolumeName, MountPath: gmsruntime.SharedMountPath},
}, },
}, },
{ {
...@@ -164,7 +165,7 @@ func TestBuildFailoverPod_PreservesDRAClaim(t *testing.T) { ...@@ -164,7 +165,7 @@ func TestBuildFailoverPod_PreservesDRAClaim(t *testing.T) {
for i := range 2 { for i := range 2 {
engine := ps.Containers[i] engine := ps.Containers[i]
require.Len(t, engine.Resources.Claims, 1, "engine-%d should retain DRA claim", i) require.Len(t, engine.Resources.Claims, 1, "engine-%d should retain DRA claim", i)
assert.Equal(t, gmsDRAClaimName, engine.Resources.Claims[0].Name) assert.Equal(t, gmsruntime.DRAClaimName, engine.Resources.Claims[0].Name)
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment