Unverified Commit ee9d67e4 authored by Thomas Montfort's avatar Thomas Montfort Committed by GitHub
Browse files

fix(operator): prevent DGD deletion thrashing under foreground cascading deletion (#8207)


Signed-off-by: default avatarTyler Montfort <tmontfort@nvidia.com>
parent cabdaebd
......@@ -60,8 +60,11 @@ func HandleFinalizer[T client.Object](ctx context.Context, obj T, writer client.
return false, err
}
logger.Info("Finalizer removed from object", "resourceVersion", obj.GetResourceVersion())
return true, nil
}
// Object is being deleted — signal the caller to skip reconciliation
// regardless of whether we just removed the finalizer or it was already
// gone (e.g., removed by a previous reconcile)
return true, nil
}
return false, nil
}
......@@ -147,6 +147,25 @@ func TestHandleFinalizer(t *testing.T) {
want: false,
wantErr: true,
},
{
name: "deleted object without finalizer - should return true to prevent reconciliation",
args: args{
ctx: context.Background(),
obj: &ObjectMock{
GetDeletionTimestampFunc: func() *metav1.Time {
return &metav1.Time{Time: time.Now()}
},
GetFinalizersFunc: func() []string {
return []string{}
},
GetResourceVersionFunc: func() string {
return "1"
},
},
},
want: true,
wantErr: false,
},
{
name: "non deleted object without finalizer - nominal case",
args: args{
......
......@@ -12,11 +12,12 @@ import (
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
gmsruntime "github.com/ai-dynamo/dynamo/deploy/operator/internal/gms"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)
var failoverLockFile = filepath.Join(gmsSharedMountPath, "failover.lock")
var failoverLockFile = filepath.Join(gmsruntime.SharedMountPath, "failover.lock")
const (
failoverEngineCount = 2
......
......@@ -12,6 +12,7 @@ import (
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
gmsruntime "github.com/ai-dynamo/dynamo/deploy/operator/internal/gms"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
......@@ -35,7 +36,7 @@ func failoverPodSpec() corev1.PodSpec {
{Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "true"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "TMPDIR", Value: gmsSharedMountPath},
{Name: "TMPDIR", Value: gmsruntime.SharedMountPath},
},
Ports: []corev1.ContainerPort{
{Name: "system", ContainerPort: 9090, Protocol: corev1.ProtocolTCP},
......@@ -56,10 +57,10 @@ func failoverPodSpec() corev1.PodSpec {
},
},
Resources: corev1.ResourceRequirements{
Claims: []corev1.ResourceClaim{{Name: gmsDRAClaimName}},
Claims: []corev1.ResourceClaim{{Name: gmsruntime.DRAClaimName}},
},
VolumeMounts: []corev1.VolumeMount{
{Name: gmsSharedVolumeName, MountPath: gmsSharedMountPath},
{Name: gmsruntime.SharedVolumeName, MountPath: gmsruntime.SharedMountPath},
},
},
{
......@@ -164,7 +165,7 @@ func TestBuildFailoverPod_PreservesDRAClaim(t *testing.T) {
for i := range 2 {
engine := ps.Containers[i]
require.Len(t, engine.Resources.Claims, 1, "engine-%d should retain DRA claim", i)
assert.Equal(t, gmsDRAClaimName, engine.Resources.Claims[0].Name)
assert.Equal(t, gmsruntime.DRAClaimName, engine.Resources.Claims[0].Name)
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment