"lib/runtime/vscode:/vscode.git/clone" did not exist on "3998fdcb28865547c3ce7580a46dbff2ef92095e"
controller_test.go 13.2 KB
Newer Older
1
package controller
2
3
4

import (
	"context"
5
	"errors"
6
7
8
9
10
11
	"os"
	"path/filepath"
	"testing"
	"time"

	"github.com/go-logr/logr/testr"
12
13
	batchv1 "k8s.io/api/batch/v1"
	coordinationv1 "k8s.io/api/coordination/v1"
14
15
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
16
	"k8s.io/apimachinery/pkg/runtime"
17
	"k8s.io/client-go/kubernetes/fake"
18
	clientgotesting "k8s.io/client-go/testing"
19

20
21
	"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
	snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
22
23
24
)

// Fixture identifiers shared by the controller tests in this file.
const (
	// testNodeName is the node the controller under test is configured with.
	testNodeName = "test-node"
	// testContainerID is the bare container ID used for restore fixtures.
	testContainerID = "test-container"
)
26

27
28
// makeTestController creates a NodeController with a fake k8s client and nil executors.
// The fake clientset is empty so any goroutine launched by runCheckpoint/runRestore
29
// will fail on the first annotatePod call and exit cleanly.
30
func makeTestController(t *testing.T, objs ...runtime.Object) *NodeController {
31
	t.Helper()
32
	return &NodeController{
33
34
		config: &types.AgentConfig{
			NodeName: testNodeName,
35
36
37
38
			Storage: types.StorageSpec{
				Type:     snapshotprotocol.StorageTypePVC,
				BasePath: t.TempDir(),
			},
39
		},
40
		clientset: fake.NewClientset(objs...),
41
		log:       testr.New(t),
42
		holderID:  "test-holder",
43
44
45
46
47
		inFlight:  make(map[string]struct{}),
		stopCh:    make(chan struct{}),
	}
}

48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
// makeLease returns a Lease called name in namespace that is held by holder.
// AcquireTime and RenewTime are both set to renewTime, and the lease duration
// is checkpointLeaseDuration converted to whole seconds.
func makeLease(namespace, name, holder string, renewTime time.Time) *coordinationv1.Lease {
	stamp := metav1.NewMicroTime(renewTime)
	durationSecs := int32(checkpointLeaseDuration.Seconds())

	lease := &coordinationv1.Lease{}
	lease.Name = name
	lease.Namespace = namespace
	lease.Spec.HolderIdentity = &holder
	lease.Spec.LeaseDurationSeconds = &durationSecs
	// Acquire and renew share one timestamp so callers control expiry with a single value.
	lease.Spec.AcquireTime = &stamp
	lease.Spec.RenewTime = &stamp
	return lease
}

65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
// makePod returns a Pod fixture with a single container named "main",
// scheduled on nodeName and reporting the given phase. When ready is true the
// pod carries a PodReady condition with status True; otherwise it has no
// conditions at all. labels and annotations are stored as passed (not copied).
func makePod(name, namespace, nodeName string, phase corev1.PodPhase, ready bool, labels, annotations map[string]string) *corev1.Pod {
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:        name,
			Namespace:   namespace,
			Labels:      labels,
			Annotations: annotations,
		},
		Spec: corev1.PodSpec{
			NodeName:   nodeName,
			Containers: []corev1.Container{{Name: "main"}},
		},
		Status: corev1.PodStatus{Phase: phase},
	}
	if ready {
		pod.Status.Conditions = []corev1.PodCondition{{
			Type:   corev1.PodReady,
			Status: corev1.ConditionTrue,
		}}
	}
	return pod
}

93
func TestReconcileCheckpointPod(t *testing.T) {
94
95
96
97
98
99
100
	tests := []struct {
		name       string
		nodeName   string
		phase      corev1.PodPhase
		ready      bool
		hash       string
		annotation string
101
		lease      *coordinationv1.Lease
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
		preSeed    bool // pre-populate inFlight to test deduplication
		want       bool // true = pod passes filtering and triggers checkpoint
	}{
		{
			name:     "happy path",
			nodeName: testNodeName,
			phase:    corev1.PodRunning,
			ready:    true,
			hash:     "abc123",
			want:     true,
		},
		{
			name:     "wrong node",
			nodeName: "other-node",
			phase:    corev1.PodRunning,
			ready:    true,
			hash:     "abc123",
			want:     false,
		},
		{
			name:     "not running",
			nodeName: testNodeName,
			phase:    corev1.PodPending,
			ready:    false,
			hash:     "abc123",
			want:     false,
		},
		{
			name:     "running but not ready",
			nodeName: testNodeName,
			phase:    corev1.PodRunning,
			ready:    false,
			hash:     "abc123",
			want:     false,
		},
		{
			name:     "missing hash label",
			nodeName: testNodeName,
			phase:    corev1.PodRunning,
			ready:    true,
			hash:     "",
			want:     false,
		},
		{
			name:       "already completed",
			nodeName:   testNodeName,
			phase:      corev1.PodRunning,
			ready:      true,
			hash:       "abc123",
			annotation: "completed",
			want:       false,
		},
		{
155
			name:       "already failed",
156
157
158
159
			nodeName:   testNodeName,
			phase:      corev1.PodRunning,
			ready:      true,
			hash:       "abc123",
160
			annotation: "failed",
161
162
			want:       false,
		},
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
		{
			name:     "active lease held elsewhere",
			nodeName: testNodeName,
			phase:    corev1.PodRunning,
			ready:    true,
			hash:     "abc123",
			lease:    makeLease("default", "checkpoint-job", "other-holder", time.Now()),
			want:     false,
		},
		{
			name:     "expired lease can be reclaimed",
			nodeName: testNodeName,
			phase:    corev1.PodRunning,
			ready:    true,
			hash:     "abc123",
			lease:    makeLease("default", "checkpoint-job", "other-holder", time.Now().Add(-checkpointLeaseDuration-time.Second)),
			want:     true,
		},
181
182
183
184
185
186
187
188
189
190
191
192
193
194
		{
			name:     "duplicate in-flight",
			nodeName: testNodeName,
			phase:    corev1.PodRunning,
			ready:    true,
			hash:     "abc123",
			preSeed:  true,
			want:     false,
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			labels := map[string]string{
195
196
				snapshotprotocol.CheckpointSourceLabel: "true",
				"batch.kubernetes.io/job-name":         "checkpoint-job",
197
198
			}
			if tc.hash != "" {
199
				labels[snapshotprotocol.CheckpointIDLabel] = tc.hash
200
201
			}

202
203
204
205
206
207
			job := &batchv1.Job{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "checkpoint-job",
					Namespace: "default",
				},
			}
208
			if tc.annotation != "" {
209
				job.Annotations = map[string]string{
210
					snapshotprotocol.CheckpointStatusAnnotation: tc.annotation,
211
212
213
				}
			}

214
			pod := makePod("test-pod", "default", tc.nodeName, tc.phase, tc.ready, labels, nil)
215
216
217
218
219
220
			objs := []runtime.Object{job}
			if tc.lease != nil {
				objs = append(objs, tc.lease)
			}

			w := makeTestController(t, objs...)
221
222
223
224
225
226
			ctx := context.Background()

			if tc.preSeed {
				w.inFlight["default/test-pod"] = struct{}{}
			}

227
			w.reconcileCheckpointPod(ctx, pod)
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248

			// tryAcquire adds to inFlight synchronously before launching the goroutine.
			// For filtered pods, inFlight stays at its original size.
			triggered := len(w.inFlight) > 0 && !tc.preSeed
			if tc.preSeed {
				// Duplicate: inFlight was 1 before and should remain exactly 1
				triggered = false
			}

			if triggered != tc.want {
				t.Errorf("triggered = %v, want %v (inFlight=%d, preSeed=%v)", triggered, tc.want, len(w.inFlight), tc.preSeed)
			}

			// Let the background goroutine (if any) finish before the test ends
			if tc.want {
				time.Sleep(50 * time.Millisecond)
			}
		})
	}
}

249
func TestReconcileRestorePod(t *testing.T) {
250
	tests := []struct {
251
252
253
254
255
256
257
258
259
260
		name                  string
		nodeName              string
		phase                 corev1.PodPhase
		ready                 bool
		hash                  string
		annotationStatus      string
		annotationContainerID string
		createDir             bool // whether to create the checkpoint dir on disk
		preSeed               bool
		want                  bool
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
	}{
		{
			name:      "happy path",
			nodeName:  testNodeName,
			phase:     corev1.PodRunning,
			ready:     false,
			hash:      "abc123",
			createDir: true,
			want:      true,
		},
		{
			name:      "wrong node",
			nodeName:  "other-node",
			phase:     corev1.PodRunning,
			ready:     false,
			hash:      "abc123",
			createDir: true,
			want:      false,
		},
		{
			name:      "not running",
			nodeName:  testNodeName,
			phase:     corev1.PodPending,
			ready:     false,
			hash:      "abc123",
			createDir: true,
			want:      false,
		},
		{
290
			name:      "ready placeholder still restores",
291
292
293
294
295
			nodeName:  testNodeName,
			phase:     corev1.PodRunning,
			ready:     true,
			hash:      "abc123",
			createDir: true,
296
			want:      true,
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
		},
		{
			name:     "missing hash",
			nodeName: testNodeName,
			phase:    corev1.PodRunning,
			ready:    false,
			hash:     "",
			want:     false,
		},
		{
			name:      "invalid hash with path traversal",
			nodeName:  testNodeName,
			phase:     corev1.PodRunning,
			ready:     false,
			hash:      "../bad",
			createDir: true,
			want:      false,
		},
		{
316
317
318
319
320
321
322
323
324
			name:                  "already completed for same container",
			nodeName:              testNodeName,
			phase:                 corev1.PodRunning,
			ready:                 false,
			hash:                  "abc123",
			annotationStatus:      "completed",
			annotationContainerID: testContainerID,
			createDir:             true,
			want:                  false,
325
326
		},
		{
327
328
329
330
331
332
333
334
335
			name:                  "already in progress for same container",
			nodeName:              testNodeName,
			phase:                 corev1.PodRunning,
			ready:                 false,
			hash:                  "abc123",
			annotationStatus:      "in_progress",
			annotationContainerID: testContainerID,
			createDir:             true,
			want:                  false,
336
337
		},
		{
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
			name:                  "completed for previous container retries",
			nodeName:              testNodeName,
			phase:                 corev1.PodRunning,
			ready:                 false,
			hash:                  "abc123",
			annotationStatus:      "completed",
			annotationContainerID: "old-container",
			createDir:             true,
			want:                  true,
		},
		{
			name:                  "in progress for previous container retries",
			nodeName:              testNodeName,
			phase:                 corev1.PodRunning,
			ready:                 false,
			hash:                  "abc123",
			annotationStatus:      "in_progress",
			annotationContainerID: "old-container",
			createDir:             true,
			want:                  true,
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
		},
		{
			name:      "checkpoint not on disk",
			nodeName:  testNodeName,
			phase:     corev1.PodRunning,
			ready:     false,
			hash:      "abc123",
			createDir: false,
			want:      false,
		},
		{
			name:      "duplicate in-flight",
			nodeName:  testNodeName,
			phase:     corev1.PodRunning,
			ready:     false,
			hash:      "abc123",
			createDir: true,
			preSeed:   true,
			want:      false,
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			labels := map[string]string{
383
				snapshotprotocol.RestoreTargetLabel: "true",
384
385
			}
			if tc.hash != "" {
386
				labels[snapshotprotocol.CheckpointIDLabel] = tc.hash
387
388
			}

389
			w := makeTestController(t)
390
			var annotations map[string]string
391
			if tc.annotationStatus != "" {
392
				annotations = map[string]string{
393
394
					snapshotprotocol.RestoreStatusAnnotation:      tc.annotationStatus,
					snapshotprotocol.RestoreContainerIDAnnotation: tc.annotationContainerID,
395
396
				}
			}
397
398

			pod := makePod("test-pod", "default", tc.nodeName, tc.phase, tc.ready, labels, annotations)
399
400
401
402
403
			pod.Status.ContainerStatuses = []corev1.ContainerStatus{{
				Name:        "main",
				Ready:       tc.ready,
				ContainerID: "containerd://" + testContainerID,
			}}
404
405

			if tc.createDir && tc.hash != "" {
406
				dir := filepath.Join(w.config.Storage.BasePath, tc.hash, "versions", snapshotprotocol.DefaultCheckpointArtifactVersion)
407
408
409
410
411
412
413
414
				if err := os.MkdirAll(dir, 0o755); err != nil {
					t.Fatalf("failed to create checkpoint dir: %v", err)
				}
			}

			ctx := context.Background()

			if tc.preSeed {
415
				w.inFlight["default/test-pod/"+testContainerID] = struct{}{}
416
417
			}

418
			w.reconcileRestorePod(ctx, pod)
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435

			triggered := len(w.inFlight) > 0 && !tc.preSeed
			if tc.preSeed {
				triggered = false
			}

			if triggered != tc.want {
				t.Errorf("triggered = %v, want %v (inFlight=%d, preSeed=%v)", triggered, tc.want, len(w.inFlight), tc.preSeed)
			}

			// Let the background goroutine (if any) finish before the test ends
			if tc.want {
				time.Sleep(50 * time.Millisecond)
			}
		})
	}
}
436

437
func TestRunCheckpointKeepsLeaseAndInFlightOnTerminalStatusPatchFailure(t *testing.T) {
438
439
440
441
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-pod",
			Namespace: "default",
442
443
444
445
446
447
448
449
450
			Labels: map[string]string{
				"batch.kubernetes.io/job-name": "checkpoint-job",
			},
		},
	}
	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "checkpoint-job",
			Namespace: "default",
451
452
		},
	}
453
	lease := makeLease("default", "checkpoint-job", "test-holder", time.Now())
454

455
	clientset := fake.NewClientset(pod.DeepCopy(), job, lease)
456
	patchCalls := 0
457
	clientset.PrependReactor("patch", "jobs", func(clientgotesting.Action) (bool, runtime.Object, error) {
458
459
460
461
		patchCalls++
		return true, nil, errors.New("terminal patch failed")
	})

462
	w := &NodeController{
463
464
		config: &types.AgentConfig{
			NodeName: testNodeName,
465
466
467
468
			Storage: types.StorageSpec{
				Type:     snapshotprotocol.StorageTypePVC,
				BasePath: t.TempDir(),
			},
469
470
471
		},
		clientset: clientset,
		log:       testr.New(t),
472
		holderID:  "test-holder",
473
474
475
476
477
478
		inFlight: map[string]struct{}{
			"default/test-pod": {},
		},
		stopCh: make(chan struct{}),
	}

479
	err := w.runCheckpoint(context.Background(), pod, job, "abc123", filepath.Join(t.TempDir(), "abc123"), "default/test-pod", time.Now())
480
481
482
483
484
485
	if err == nil {
		t.Fatal("expected terminal checkpoint status update to fail")
	}
	if _, ok := w.inFlight["default/test-pod"]; !ok {
		t.Fatal("checkpoint terminal status failure should keep pod in-flight")
	}
486
487
	if patchCalls != 1 {
		t.Fatalf("patchCalls = %d, want %d", patchCalls, 1)
488
489
	}

490
491
492
	remainingLease, err := clientset.CoordinationV1().Leases("default").Get(context.Background(), "checkpoint-job", metav1.GetOptions{})
	if err != nil {
		t.Fatalf("expected checkpoint lease to remain after terminal status patch failure: %v", err)
493
	}
494
495
	if remainingLease.Spec.HolderIdentity == nil || *remainingLease.Spec.HolderIdentity != "test-holder" {
		t.Fatalf("unexpected remaining lease holder: %#v", remainingLease.Spec.HolderIdentity)
496
497
	}
}