refactor: clean up checkpoint orchestration (#7309)

Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>

refactor: clean up checkpoint orchestration (#7309)
Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
38bb9d37 · Schwinn Saereesitthipitak · GitHub · 9ea3acad · 38bb9d37 · 38bb9d37
Unverified commit 38bb9d37, authored Mar 18, 2026 by Schwinn Saereesitthipitak; committed by GitHub Mar 18, 2026.
18 changed files
--- a/deploy/snapshot/Dockerfile
+++ b/deploy/snapshot/Dockerfile
@@ -16,7 +16,8 @@
 # =============================================================================
 ARG DOCKER_PROXY
 ARG GO_VERSION=1.25
-ARG CRIU_VERSION=v4.2
+ARG CRIU_REPO=https://github.com/dfeigin-nv/criu.git
+ARG CRIU_VERSION=add-aio-and-parallel-memfd
 ARG AGENT_BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.11-cuda13.0-devel-ubuntu24.04

 # For placeholder target only - this default allows agent builds to succeed,
@@ -74,6 +75,7 @@ RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s
 # =============================================================================
 FROM ubuntu:24.04 AS criu-builder

+ARG CRIU_REPO
 ARG CRIU_VERSION

 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -97,7 +99,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    uuid-dev \
    && rm -rf /var/lib/apt/lists/*

-RUN git clone --branch ${CRIU_VERSION} https://github.com/checkpoint-restore/criu.git /tmp/criu \
+RUN git clone --depth 1 --branch ${CRIU_VERSION} ${CRIU_REPO} /tmp/criu \
    && cd /tmp/criu \
    && make -j$(nproc) \
    && make DESTDIR=/criu-install install-criu install-lib install-cuda_plugin

--- a/deploy/snapshot/cmd/agent/main.go
+++ b/deploy/snapshot/cmd/agent/main.go
-// Package main provides the snapshot DaemonSet agent.
-// The agent watches for pods with checkpoint/restore labels on its node
-// and triggers operations via the orchestrators.
+// Package main provides the snapshot-agent DaemonSet entrypoint.
+// The agent runs the node-local snapshot controller and delegates CRIU/CUDA
+// execution to the snapshot executor workflows.
 package main

 import (
@@ -13,8 +13,8 @@ import (
 	"github.com/go-logr/logr"

 	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
+	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/controller"
 	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
-	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/watcher"
 )

 func main() {
@@ -43,37 +43,36 @@ func main() {

 	agentLog.Info("Starting snapshot agent",
 		"node", cfg.NodeName,
-		"checkpoint_dir", cfg.BasePath,
-		"watch_namespace", cfg.RestrictedNamespace,
+		"restricted_namespace", cfg.RestrictedNamespace,
 	)

-	podWatcher, err := watcher.NewWatcher(cfg, ctrd, rootLog.WithName("watcher"))
+	nodeController, err := controller.NewNodeController(cfg, ctrd, rootLog.WithName("controller"))
 	if err != nil {
-		fatal(agentLog, err, "Failed to create pod watcher")
+		fatal(agentLog, err, "Failed to create snapshot node controller")
 	}

-	// Run watcher in the background
-	watcherDone := make(chan error, 1)
+	// Run the node-local controller in the background.
+	controllerDone := make(chan error, 1)
 	go func() {
-		agentLog.Info("Pod watcher started")
-		watcherDone <- podWatcher.Start(ctx)
+		agentLog.Info("Snapshot node controller started")
+		controllerDone <- nodeController.Run(ctx)
 	}()

-	// Wait for signal or watcher exit
+	// Wait for signal or controller exit.
 	select {
 	case <-sigChan:
 		agentLog.Info("Shutting down")
 		cancel()
 		select {
-		case err := <-watcherDone:
+		case err := <-controllerDone:
 			if err != nil {
-				agentLog.Error(err, "Pod watcher exited with error during shutdown")
+				agentLog.Error(err, "Snapshot node controller exited with error during shutdown")
 			}
 		default:
 		}
-	case err := <-watcherDone:
+	case err := <-controllerDone:
 		if err != nil {
-			fatal(agentLog, err, "Pod watcher exited with error")
+			fatal(agentLog, err, "Snapshot node controller exited with error")
 		}
 	}


--- a/deploy/snapshot/cmd/nsrestore/main.go
+++ b/deploy/snapshot/cmd/nsrestore/main.go
@@ -8,8 +8,8 @@ import (

 	"github.com/go-logr/logr"

+	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/executor"
 	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
-	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/orchestrate"
 )

 func main() {
@@ -25,13 +25,13 @@ func main() {
 		fatal(log, nil, "--checkpoint-path is required")
 	}

-	opts := orchestrate.RestoreOptions{
+	opts := executor.RestoreOptions{
 		CheckpointPath: *checkpointPath,
 		CUDADeviceMap:  *cudaDeviceMap,
 		CgroupRoot:     *cgroupRoot,
 	}

-	restoredPID, err := orchestrate.RestoreInNamespace(context.Background(), opts, log)
+	restoredPID, err := executor.RestoreInNamespace(context.Background(), opts, log)
 	if err != nil {
 		fatal(log, err, "restore failed")
 	}

--- a/deploy/snapshot/pkg/common/overlay.go
+++ b/deploy/snapshot/pkg/common/overlay.go
@@ -77,11 +77,7 @@ func CaptureRootfsDiff(upperDir, checkpointDir string, exclusions types.OverlayS

 // buildExclusions merges exclusion lists and normalizes paths for tar --exclude patterns.
 func buildExclusions(s types.OverlaySettings) []string {
-	total := len(s.SystemDirs) + len(s.CacheDirs) + len(s.AdditionalExclusions)
-	exclusions := make([]string, 0, total)
-	exclusions = append(exclusions, s.SystemDirs...)
-	exclusions = append(exclusions, s.CacheDirs...)
-	exclusions = append(exclusions, s.AdditionalExclusions...)
+	exclusions := append([]string(nil), s.Exclusions...)
 	for i, p := range exclusions {
 		if strings.HasPrefix(p, "*") {
 			continue

--- a/deploy/snapshot/pkg/common/overlay_test.go
+++ b/deploy/snapshot/pkg/common/overlay_test.go
@@ -18,11 +18,9 @@ func TestBuildExclusions(t *testing.T) {
 		want     map[string]bool // expected entries (true = must be present)
 	}{
 		{
-			name: "merges all lists and normalizes paths",
+			name: "normalizes rooted paths",
 			settings: types.OverlaySettings{
-				SystemDirs:           []string{"/proc", "/sys"},
-				CacheDirs:            []string{"/root/.cache"},
-				AdditionalExclusions: []string{"/tmp"},
+				Exclusions: []string{"/proc", "/sys", "/root/.cache", "/tmp"},
 			},
 			want: map[string]bool{
 				"./proc":        true,
@@ -34,7 +32,7 @@ func TestBuildExclusions(t *testing.T) {
 		{
 			name: "strips leading dot and slash before prepending ./",
 			settings: types.OverlaySettings{
-				SystemDirs: []string{"./proc", "/sys", "tmp"},
+				Exclusions: []string{"./proc", "/sys", "tmp"},
 			},
 			want: map[string]bool{
 				"./proc": true,
@@ -45,9 +43,11 @@ func TestBuildExclusions(t *testing.T) {
 		{
 			name: "glob patterns starting with * are untouched",
 			settings: types.OverlaySettings{
-				AdditionalExclusions: []string{"*.pyc", "*/__pycache__"},
+				Exclusions: []string{"*/.cache/huggingface", "*/.cache/vllm/torch_compile_cache", "*.pyc", "*/__pycache__"},
 			},
 			want: map[string]bool{
+				"*/.cache/huggingface":              true,
+				"*/.cache/vllm/torch_compile_cache": true,
 				"*.pyc":                             true,
 				"*/__pycache__":                     true,
 			},

--- a/deploy/snapshot/pkg/watcher/watcher.go
+++ b/deploy/snapshot/pkg/watcher/watcher.go
-// Package watcher provides Kubernetes pod watching for automatic checkpoint/restore.
-// The watcher is the sole entry point for snapshot operations — it detects pods with
-// checkpoint/restore labels and calls the orchestrators directly.
-package watcher
+// Package controller implements the node-local control loop inside snapshot-agent.
+// It does not own CRDs or replace the operator. Instead it watches pod, job, and
+// lease state on the current node and delegates CRIU/CUDA execution to the
+// snapshot executor workflows.
+package controller

 import (
 	"context"
@@ -15,6 +16,8 @@ import (

 	"github.com/containerd/containerd"
 	"github.com/go-logr/logr"
+	"github.com/google/uuid"
+	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
@@ -24,7 +27,7 @@ import (
 	"k8s.io/client-go/tools/cache"

 	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
-	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/orchestrate"
+	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/executor"
 	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
 )

@@ -32,16 +35,21 @@ const (
 	kubeLabelIsCheckpointSource         = "nvidia.com/snapshot-is-checkpoint-source"
 	kubeLabelCheckpointHash             = "nvidia.com/snapshot-checkpoint-hash"
 	kubeLabelIsRestoreTarget            = "nvidia.com/snapshot-is-restore-target"
+	kubeAnnotationCheckpointLocation    = "nvidia.com/snapshot-checkpoint-location"
+	kubeAnnotationCheckpointStorageType = "nvidia.com/snapshot-checkpoint-storage-type"
 	kubeAnnotationCheckpointStatus      = "nvidia.com/snapshot-checkpoint-status"
 	kubeAnnotationRestoreStatus         = "nvidia.com/snapshot-restore-status"
+	kubeAnnotationRestoreContainerID    = "nvidia.com/snapshot-restore-container-id"
 )

-// Watcher watches for pods with checkpoint/restore labels and triggers operations.
-type Watcher struct {
+// NodeController watches local-node pods with checkpoint metadata and reconciles
+// snapshot execution for checkpoint and restore requests.
+type NodeController struct {
 	config     *types.AgentConfig
 	clientset  kubernetes.Interface
 	containerd *containerd.Client
 	log        logr.Logger
+	holderID   string

 	inFlight   map[string]struct{}
 	inFlightMu sync.Mutex
@@ -49,12 +57,12 @@ type Watcher struct {
 	stopCh chan struct{}
 }

-// NewWatcher creates a new pod watcher.
-func NewWatcher(
+// NewNodeController creates the node-local controller that runs inside snapshot-agent.
+func NewNodeController(
 	cfg *types.AgentConfig,
 	containerd *containerd.Client,
 	log logr.Logger,
-) (*Watcher, error) {
+) (*NodeController, error) {
 	restConfig, err := rest.InClusterConfig()
 	if err != nil {
 		return nil, fmt.Errorf("failed to get in-cluster config: %w", err)
@@ -65,19 +73,20 @@ func NewWatcher(
 		return nil, fmt.Errorf("failed to create kubernetes client: %w", err)
 	}

-	return &Watcher{
+	return &NodeController{
 		config:     cfg,
 		clientset:  clientset,
 		containerd: containerd,
 		log:        log,
+		holderID:   "snapshot-agent/" + uuid.NewString(),
 		inFlight:   make(map[string]struct{}),
 		stopCh:     make(chan struct{}),
 	}, nil
 }

-// Start begins watching for pods and processing checkpoint/restore events.
-func (w *Watcher) Start(ctx context.Context) error {
-	w.log.Info("Starting pod watcher",
+// Run starts the local pod informers and processes checkpoint/restore events.
+func (w *NodeController) Run(ctx context.Context) error {
+	w.log.Info("Starting snapshot node controller",
 		"node", w.config.NodeName,
 		"checkpoint", kubeLabelIsCheckpointSource,
 		"restore", kubeLabelIsRestoreTarget,
@@ -115,14 +124,14 @@ func (w *Watcher) Start(ctx context.Context) error {
 			if !ok {
 				return
 			}
-			w.handleCheckpointPodEvent(ctx, pod)
+			w.reconcileCheckpointPod(ctx, pod)
 		},
 		UpdateFunc: func(_, newObj interface{}) {
 			pod, ok := podFromInformerObj(newObj)
 			if !ok {
 				return
 			}
-			w.handleCheckpointPodEvent(ctx, pod)
+			w.reconcileCheckpointPod(ctx, pod)
 		},
 	}); err != nil {
 		return fmt.Errorf("failed to add checkpoint informer handler: %w", err)
@@ -152,14 +161,14 @@ func (w *Watcher) Start(ctx context.Context) error {
 			if !ok {
 				return
 			}
-			w.handleRestorePodEvent(ctx, pod)
+			w.reconcileRestorePod(ctx, pod)
 		},
 		UpdateFunc: func(_, newObj interface{}) {
 			pod, ok := podFromInformerObj(newObj)
 			if !ok {
 				return
 			}
-			w.handleRestorePodEvent(ctx, pod)
+			w.reconcileRestorePod(ctx, pod)
 		},
 	}); err != nil {
 		return fmt.Errorf("failed to add restore informer handler: %w", err)
@@ -171,13 +180,13 @@ func (w *Watcher) Start(ctx context.Context) error {
 		return fmt.Errorf("failed to sync informer caches")
 	}

-	w.log.Info("Pod watcher started and caches synced")
+	w.log.Info("Snapshot node controller started and caches synced")
 	<-ctx.Done()
 	close(w.stopCh)
 	return nil
 }

-func (w *Watcher) handleCheckpointPodEvent(ctx context.Context, pod *corev1.Pod) {
+func (w *NodeController) reconcileCheckpointPod(ctx context.Context, pod *corev1.Pod) {
 	if pod.Spec.NodeName != w.config.NodeName {
 		return
 	}
@@ -193,8 +202,14 @@ func (w *Watcher) handleCheckpointPodEvent(ctx context.Context, pod *corev1.Pod)
 		return
 	}

-	annotationStatus := pod.Annotations[kubeAnnotationCheckpointStatus]
-	if annotationStatus == "completed" || annotationStatus == "in_progress" {
+	job, err := getCheckpointJob(ctx, w.clientset, pod)
+	if err != nil {
+		w.log.Error(err, "Failed to resolve checkpoint job", "pod", podKey)
+		return
+	}
+
+	jobStatus := job.Annotations[kubeAnnotationCheckpointStatus]
+	if jobStatus == "completed" || jobStatus == "failed" {
 		return
 	}

@@ -202,19 +217,37 @@ func (w *Watcher) handleCheckpointPodEvent(ctx context.Context, pod *corev1.Pod)
 		return
 	}

+	checkpointLocation, checkpointStorageType, err := checkpointStorageFromPod(pod)
+	if err != nil {
+		w.release(podKey)
+		w.log.Error(err, "Checkpoint pod is missing storage metadata", "pod", podKey, "checkpoint_hash", checkpointHash)
+		return
+	}
+
+	acquiredLease, err := acquireCheckpointLease(ctx, w.clientset, w.log, job, w.holderID)
+	if err != nil {
+		w.release(podKey)
+		w.log.Error(err, "Failed to acquire checkpoint lease", "pod", podKey, "checkpoint_hash", checkpointHash)
+		return
+	}
+	if !acquiredLease {
+		w.release(podKey)
+		return
+	}
+
 	w.log.Info("Pod ready, triggering checkpoint", "pod", podKey, "checkpoint_hash", checkpointHash)
 	emitPodEvent(ctx, w.clientset, w.log, pod, "snapshot", corev1.EventTypeNormal, "CheckpointRequested", fmt.Sprintf("Checkpoint requested: %s", checkpointHash))

 	go func() {
-		if err := w.doCheckpoint(ctx, pod, checkpointHash, podKey); err != nil {
+		if err := w.runCheckpoint(ctx, pod, job, checkpointHash, checkpointLocation, checkpointStorageType, podKey); err != nil {
 			opLog := w.log.WithValues("pod", podKey, "checkpoint_hash", checkpointHash)
-			opLog.Error(err, "Checkpoint worker failed")
+			opLog.Error(err, "Checkpoint controller worker failed")
 			emitPodEvent(ctx, w.clientset, opLog, pod, "snapshot", corev1.EventTypeWarning, "CheckpointWorkerFailed", err.Error())
 		}
 	}()
 }

-func (w *Watcher) handleRestorePodEvent(ctx context.Context, pod *corev1.Pod) {
+func (w *NodeController) reconcileRestorePod(ctx context.Context, pod *corev1.Pod) {
 	if pod.Spec.NodeName != w.config.NodeName {
 		return
 	}
@@ -225,17 +258,10 @@ func (w *Watcher) handleRestorePodEvent(ctx context.Context, pod *corev1.Pod) {
 		return
 	}

-	annotationStatus := pod.Annotations[kubeAnnotationRestoreStatus]
-
 	if isPodReady(pod) {
 		return
 	}

-	// Restore failures require explicit intervention (new label/update) before retry.
-	if annotationStatus == "completed" || annotationStatus == "in_progress" || annotationStatus == "failed" {
-		return
-	}
-
 	checkpointHash, ok := pod.Labels[kubeLabelCheckpointHash]
 	if !ok || checkpointHash == "" {
 		w.log.Info("Restore pod has no checkpoint-hash label", "pod", podKey)
@@ -247,13 +273,43 @@ func (w *Watcher) handleRestorePodEvent(ctx context.Context, pod *corev1.Pod) {
 		return
 	}

-	checkpointDir := filepath.Join(w.config.BasePath, checkpointHash)
-	if _, err := os.Stat(checkpointDir); os.IsNotExist(err) {
-		w.log.V(1).Info("Checkpoint not ready on disk, skipping restore", "pod", podKey, "checkpoint_hash", checkpointHash)
+	checkpointLocation, checkpointStorageType, err := checkpointStorageFromPod(pod)
+	if err != nil {
+		w.log.Error(err, "Restore pod is missing storage metadata", "pod", podKey, "checkpoint_hash", checkpointHash)
+		return
+	}
+	if _, err := os.Stat(checkpointLocation); os.IsNotExist(err) {
+		w.log.V(1).Info("Checkpoint not ready on disk, skipping restore", "pod", podKey, "checkpoint_hash", checkpointHash, "checkpoint_location", checkpointLocation)
 		return
 	}

-	if !w.tryAcquire(podKey) {
+	containerName := resolveMainContainerName(pod)
+	if containerName == "" {
+		w.log.Info("Restore pod has no containers", "pod", podKey)
+		return
+	}
+
+	containerID := ""
+	for _, cs := range pod.Status.ContainerStatuses {
+		if cs.Name != containerName || cs.ContainerID == "" {
+			continue
+		}
+		containerID = strings.TrimPrefix(cs.ContainerID, "containerd://")
+		break
+	}
+	if containerID == "" {
+		w.log.V(1).Info("Restore pod has no running main container yet", "pod", podKey, "container", containerName)
+		return
+	}
+
+	annotationStatus := pod.Annotations[kubeAnnotationRestoreStatus]
+	annotationContainerID := pod.Annotations[kubeAnnotationRestoreContainerID]
+	if annotationContainerID == containerID && (annotationStatus == "completed" || annotationStatus == "in_progress") {
+		return
+	}
+
+	restoreAttemptKey := fmt.Sprintf("%s/%s", podKey, containerID)
+	if !w.tryAcquire(restoreAttemptKey) {
 		return
 	}

@@ -261,51 +317,54 @@ func (w *Watcher) handleRestorePodEvent(ctx context.Context, pod *corev1.Pod) {
 	emitPodEvent(ctx, w.clientset, w.log, pod, "snapshot", corev1.EventTypeNormal, "RestoreRequested", fmt.Sprintf("Restore requested from checkpoint %s", checkpointHash))

 	go func() {
-		if err := w.doRestore(ctx, pod, checkpointHash, podKey); err != nil {
+		if err := w.runRestore(ctx, pod, containerName, containerID, checkpointHash, checkpointLocation, checkpointStorageType, restoreAttemptKey); err != nil {
 			opLog := w.log.WithValues("pod", podKey, "checkpoint_hash", checkpointHash)
-			opLog.Error(err, "Restore worker failed")
+			opLog.Error(err, "Restore controller worker failed")
 			emitPodEvent(ctx, w.clientset, opLog, pod, "snapshot", corev1.EventTypeWarning, "RestoreWorkerFailed", err.Error())
 		}
 	}()
 }

-// doCheckpoint runs the full checkpoint workflow for a pod:
-//  1. Mark pod as in_progress
+// runCheckpoint runs the full checkpoint workflow for a pod:
+//  1. Hold and renew the checkpoint lease
 //  2. Resolve the container ID and host PID
-//  3. Call orchestrate.Checkpoint (inspect → configure → CUDA lock/checkpoint → CRIU dump → rootfs diff)
+//  3. Call executor.Checkpoint (inspect → configure → CUDA lock/checkpoint → CRIU dump → rootfs diff)
 //  4. SIGUSR1 the process on success (notify workload), SIGKILL on failure (terminate immediately)
-//  5. Mark pod as completed or failed
-func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointHash, podKey string) error {
-	releaseOnExit := true
+//  5. Mark job as completed or failed
+func (w *NodeController) runCheckpoint(ctx context.Context, pod *corev1.Pod, job *batchv1.Job, checkpointHash, checkpointLocation, checkpointStorageType, podKey string) error {
+	releasePodOnExit := true
 	defer func() {
-		if releaseOnExit {
+		if releasePodOnExit {
 			w.release(podKey)
 		}
 	}()
 	log := w.log.WithValues("pod", podKey, "checkpoint_hash", checkpointHash)
-	setCheckpointStatus := func(value string) error {
-		annotations := map[string]string{
-			kubeAnnotationCheckpointStatus: value,
-		}
+	leaseCtx, stopLease := context.WithCancelCause(ctx)
+	defer stopLease(nil)

-		if value == "failed" || value == "completed" {
-			if err := annotatePodRetry(ctx, w.clientset, log, pod, annotations); err != nil {
-				releaseOnExit = false
-				return fmt.Errorf("failed to persist terminal checkpoint status %q: %w", value, err)
+	releaseLeaseOnExit := true
+	defer func() {
+		if !releaseLeaseOnExit {
+			return
 		}
-			return nil
+		releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		if err := releaseCheckpointLease(releaseCtx, w.clientset, log, job, w.holderID); err != nil {
+			log.Error(err, "Failed to release checkpoint lease")
 		}
+	}()

-		if err := annotatePod(ctx, w.clientset, log, pod, annotations); err != nil {
-			return fmt.Errorf("failed to update checkpoint status %q: %w", value, err)
-		}
-		return nil
-	}
+	go w.renewCheckpointLease(leaseCtx, log, job, stopLease)

-	if err := annotatePod(ctx, w.clientset, log, pod, map[string]string{
-		kubeAnnotationCheckpointStatus: "in_progress",
+	setCheckpointStatus := func(value string) error {
+		if err := annotateJob(ctx, w.clientset, log, job, map[string]string{
+			kubeAnnotationCheckpointStatus: value,
 		}); err != nil {
-		return fmt.Errorf("failed to annotate pod with checkpoint in_progress: %w", err)
+			releasePodOnExit = false
+			releaseLeaseOnExit = false
+			return fmt.Errorf("failed to persist terminal checkpoint status %q: %w", value, err)
+		}
+		return nil
 	}

 	// Resolve the target container
@@ -346,16 +405,20 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
 	}

 	// Step 1: Run the checkpoint orchestrator
-	req := orchestrate.CheckpointRequest{
+	req := executor.CheckpointRequest{
 		ContainerID:           containerID,
 		ContainerName:         containerName,
 		CheckpointHash:        checkpointHash,
-		CheckpointDir:  w.config.BasePath,
+		CheckpointLocation:    checkpointLocation,
+		CheckpointStorageType: checkpointStorageType,
 		NodeName:              w.config.NodeName,
 		PodName:               pod.Name,
 		PodNamespace:          pod.Namespace,
 	}
-	if err := orchestrate.Checkpoint(ctx, w.containerd, log, req, w.config); err != nil {
+	if err := executor.Checkpoint(leaseCtx, w.containerd, log, req, w.config); err != nil {
+		if cause := context.Cause(leaseCtx); cause != nil && cause != context.Canceled {
+			err = fmt.Errorf("checkpoint lease lost: %w", cause)
+		}
 		log.Error(err, "Checkpoint failed")
 		emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error())
 		// SIGKILL on failure: process is unrecoverable (CUDA locked), terminate immediately
@@ -368,6 +431,24 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
 		return nil
 	}

+	info, err := os.Stat(checkpointLocation)
+	if err != nil || !info.IsDir() {
+		if err == nil {
+			err = fmt.Errorf("published checkpoint path %s is not a directory", checkpointLocation)
+		} else {
+			err = fmt.Errorf("published checkpoint path %s is missing: %w", checkpointLocation, err)
+		}
+		log.Error(err, "Checkpoint failed verification")
+		emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error())
+		if signalErr := common.SendSignalToPID(log, containerPID, syscall.SIGKILL, "checkpoint verification failed"); signalErr != nil {
+			log.Error(signalErr, "Failed to signal checkpoint verification failure to runtime process")
+		}
+		if statusErr := setCheckpointStatus("failed"); statusErr != nil {
+			return statusErr
+		}
+		return nil
+	}
+
 	// Step 2: SIGUSR1 on success: notify the workload that checkpoint completed
 	emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeNormal, "CheckpointSucceeded", fmt.Sprintf("Checkpoint completed: %s", checkpointHash))
 	if err := common.SendSignalToPID(log, containerPID, syscall.SIGUSR1, "checkpoint complete"); err != nil {
@@ -385,34 +466,31 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
 	return nil
 }

-// doRestore runs the full restore workflow for a pod:
-//  1. Mark pod as in_progress
-//  2. Call orchestrate.Restore (inspect placeholder → nsrestore inside namespace)
+// runRestore runs the full restore workflow for a pod:
+//  1. Mark the current container instance as in_progress
+//  2. Call executor.Restore (inspect placeholder → nsrestore inside namespace)
 //  3. SIGCONT the restored process to wake it up
 //  4. Wait for the pod to become Ready
-//  5. Mark pod as completed or failed
-func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash, podKey string) error {
+//  5. Mark the container instance as completed
+func (w *NodeController) runRestore(ctx context.Context, pod *corev1.Pod, containerName, containerID, checkpointHash, checkpointLocation, checkpointStorageType, restoreAttemptKey string) error {
 	releaseOnExit := true
 	defer func() {
 		if releaseOnExit {
-			w.release(podKey)
+			w.release(restoreAttemptKey)
 		}
 	}()
-	log := w.log.WithValues("pod", podKey, "checkpoint_hash", checkpointHash)
+	podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)
+	log := w.log.WithValues("pod", podKey, "checkpoint_hash", checkpointHash, "container_id", containerID)
 	setRestoreStatus := func(value string) error {
 		annotations := map[string]string{
 			kubeAnnotationRestoreStatus:      value,
+			kubeAnnotationRestoreContainerID: containerID,
 		}
-
-		if value == "failed" || value == "completed" {
-			if err := annotatePodRetry(ctx, w.clientset, log, pod, annotations); err != nil {
+		if err := annotatePod(ctx, w.clientset, log, pod, annotations); err != nil {
+			if value == "completed" {
 				releaseOnExit = false
 				return fmt.Errorf("failed to persist terminal restore status %q: %w", value, err)
 			}
-			return nil
-		}
-
-		if err := annotatePod(ctx, w.clientset, log, pod, annotations); err != nil {
 			return fmt.Errorf("failed to update restore status %q: %w", value, err)
 		}
 		return nil
@@ -420,36 +498,33 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash

 	if err := annotatePod(ctx, w.clientset, log, pod, map[string]string{
 		kubeAnnotationRestoreStatus:      "in_progress",
+		kubeAnnotationRestoreContainerID: containerID,
 	}); err != nil {
 		return fmt.Errorf("failed to annotate pod with restore in_progress: %w", err)
 	}

-	containerName := resolveMainContainerName(pod)
-	if containerName == "" {
-		err := fmt.Errorf("no containers found in pod spec")
-		log.Error(err, "Restore failed")
-		emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
-		if statusErr := setRestoreStatus("failed"); statusErr != nil {
-			return statusErr
-		}
-		return nil
-	}
-
 	// Step 1: Run the restore orchestrator (inspect + nsrestore)
-	req := orchestrate.RestoreRequest{
+	req := executor.RestoreRequest{
 		CheckpointHash:        checkpointHash,
-		CheckpointBase: w.config.BasePath,
+		CheckpointLocation:    checkpointLocation,
+		CheckpointStorageType: checkpointStorageType,
 		NSRestorePath:         w.config.Restore.NSRestorePath,
 		PodName:               pod.Name,
 		PodNamespace:          pod.Namespace,
 		ContainerName:         containerName,
 	}
-	restoredPID, err := orchestrate.Restore(ctx, w.containerd, log, req)
+	restoredPID, err := executor.Restore(ctx, w.containerd, log, req)
 	if err != nil {
 		log.Error(err, "External restore failed")
 		emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
-		if statusErr := setRestoreStatus("failed"); statusErr != nil {
-			return statusErr
+		placeholderHostPID, _, pidErr := common.ResolveContainerByPod(ctx, w.containerd, pod.Name, pod.Namespace, containerName)
+		if pidErr != nil {
+			releaseOnExit = false
+			return fmt.Errorf("restore failed and placeholder PID could not be resolved: %w", pidErr)
+		}
+		if killErr := common.SendSignalToPID(log, placeholderHostPID, syscall.SIGKILL, "restore failed"); killErr != nil {
+			releaseOnExit = false
+			return fmt.Errorf("restore failed and placeholder could not be killed: %w", killErr)
 		}
 		return nil
 	}
@@ -459,18 +534,17 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
 	if err != nil {
 		log.Error(err, "Failed to resolve placeholder host PID for signaling")
 		emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
-		if statusErr := setRestoreStatus("failed"); statusErr != nil {
-			return statusErr
-		}
-		return nil
+		releaseOnExit = false
+		return fmt.Errorf("failed to resolve placeholder host PID for signaling: %w", err)
 	}
 	if err := common.SendSignalViaPIDNamespace(ctx, log, placeholderHostPID, restoredPID, syscall.SIGCONT, "restore complete"); err != nil {
 		log.Error(err, "Failed to signal restored runtime process")
 		emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
-		if statusErr := setRestoreStatus("failed"); statusErr != nil {
-			return statusErr
+		if killErr := common.SendSignalToPID(log, placeholderHostPID, syscall.SIGKILL, "restore signaling failed"); killErr != nil {
+			log.Error(killErr, "Failed to kill placeholder after restore signaling failure")
 		}
-		return nil
+		releaseOnExit = false
+		return fmt.Errorf("failed to signal restored runtime process: %w", err)
 	}

 	// Step 3: Wait for the pod to become Ready
@@ -483,10 +557,11 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
 	if err := waitForPodReady(readyCtx, w.clientset, pod.Namespace, pod.Name, containerName); err != nil {
 		log.Error(err, "Restore post-signal readiness check failed")
 		emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
-		if statusErr := setRestoreStatus("failed"); statusErr != nil {
-			return statusErr
+		if killErr := common.SendSignalToPID(log, placeholderHostPID, syscall.SIGKILL, "restore readiness failed"); killErr != nil {
+			log.Error(killErr, "Failed to kill placeholder after restore readiness failure")
 		}
-		return nil
+		releaseOnExit = false
+		return fmt.Errorf("restore post-signal readiness check failed: %w", err)
 	}

 	emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeNormal, "RestoreSucceeded", fmt.Sprintf("Restore completed from checkpoint %s", checkpointHash))
@@ -496,7 +571,7 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
 	return nil
 }

-func (w *Watcher) tryAcquire(podKey string) bool {
+func (w *NodeController) tryAcquire(podKey string) bool {
 	w.inFlightMu.Lock()
 	defer w.inFlightMu.Unlock()
 	if _, held := w.inFlight[podKey]; held {
@@ -506,8 +581,25 @@ func (w *Watcher) tryAcquire(podKey string) bool {
 	return true
 }

-func (w *Watcher) release(podKey string) {
+func (w *NodeController) release(podKey string) {
 	w.inFlightMu.Lock()
 	defer w.inFlightMu.Unlock()
 	delete(w.inFlight, podKey)
 }
+
+func checkpointStorageFromPod(pod *corev1.Pod) (string, string, error) {
+	checkpointLocation := strings.TrimSpace(pod.Annotations[kubeAnnotationCheckpointLocation])
+	if checkpointLocation == "" {
+		return "", "", fmt.Errorf("missing %s annotation", kubeAnnotationCheckpointLocation)
+	}
+
+	checkpointStorageType := strings.TrimSpace(pod.Annotations[kubeAnnotationCheckpointStorageType])
+	if checkpointStorageType == "" {
+		return "", "", fmt.Errorf("missing %s annotation", kubeAnnotationCheckpointStorageType)
+	}
+	if checkpointStorageType != "pvc" {
+		return "", "", fmt.Errorf("checkpoint storage type %q is not supported", checkpointStorageType)
+	}
+
+	return checkpointLocation, checkpointStorageType, nil
+}
--- a/deploy/snapshot/pkg/watcher/watcher_test.go
+++ b/deploy/snapshot/pkg/watcher/watcher_test.go
-package watcher
+package controller

 import (
 	"context"
@@ -9,6 +9,8 @@ import (
 	"time"

 	"github.com/go-logr/logr/testr"
+	batchv1 "k8s.io/api/batch/v1"
+	coordinationv1 "k8s.io/api/coordination/v1"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
@@ -19,24 +21,42 @@ import (
 )

 const testNodeName = "test-node"
+const testContainerID = "test-container"

-// makeTestWatcher creates a Watcher with a fake k8s client and nil orchestrators.
-// The fake clientset is empty so any goroutine launched by doCheckpoint/doRestore
+// makeTestController creates a NodeController with a fake k8s client and nil executors.
+// The fake clientset is empty so any goroutine launched by runCheckpoint/runRestore
 // will fail on the first annotatePod call and exit cleanly.
+// Any objs passed in are handed to fake.NewClientset, so the clientset is only
+// empty when the caller supplies none.
-func makeTestWatcher(t *testing.T) *Watcher {
+func makeTestController(t *testing.T, objs ...runtime.Object) *NodeController {
 	t.Helper()
-	return &Watcher{
+	return &NodeController{
 		config: &types.AgentConfig{
 			NodeName: testNodeName,
-			BasePath: t.TempDir(),
 		},
-		clientset: fake.NewClientset(),
+		clientset: fake.NewClientset(objs...),
 		log:       testr.New(t),
+		holderID:  "test-holder",
 		inFlight:  make(map[string]struct{}),
 		stopCh:    make(chan struct{}),
 	}
 }

+// makeLease builds a Lease fixture named name in namespace, owned by holder.
+// Both AcquireTime and RenewTime are set to renewTime, and the duration is
+// checkpointLeaseDuration expressed in whole seconds.
+func makeLease(namespace, name, holder string, renewTime time.Time) *coordinationv1.Lease {
+	stamp := metav1.NewMicroTime(renewTime)
+	durationSeconds := int32(checkpointLeaseDuration.Seconds())
+
+	fixture := &coordinationv1.Lease{}
+	fixture.ObjectMeta = metav1.ObjectMeta{Name: name, Namespace: namespace}
+	fixture.Spec = coordinationv1.LeaseSpec{
+		HolderIdentity:       &holder,
+		LeaseDurationSeconds: &durationSeconds,
+		AcquireTime:          &stamp,
+		RenewTime:            &stamp,
+	}
+	return fixture
+}
+
 func makePod(name, namespace, nodeName string, phase corev1.PodPhase, ready bool, labels, annotations map[string]string) *corev1.Pod {
 	var conditions []corev1.PodCondition
 	if ready {
@@ -65,7 +85,7 @@ func makePod(name, namespace, nodeName string, phase corev1.PodPhase, ready bool
 	}
 }

-func TestHandleCheckpointPodEvent(t *testing.T) {
+func TestReconcileCheckpointPod(t *testing.T) {
 	tests := []struct {
 		name       string
 		nodeName   string
@@ -73,6 +93,7 @@ func TestHandleCheckpointPodEvent(t *testing.T) {
 		ready      bool
 		hash       string
 		annotation string
+		lease      *coordinationv1.Lease
 		preSeed    bool // pre-populate inFlight to test deduplication
 		want       bool // true = pod passes filtering and triggers checkpoint
 	}{
@@ -126,14 +147,32 @@ func TestHandleCheckpointPodEvent(t *testing.T) {
 			want:       false,
 		},
 		{
-			name:       "already in progress",
+			name:       "already failed",
+			nodeName:   testNodeName,
+			phase:      corev1.PodRunning,
+			ready:      true,
+			hash:       "abc123",
+			annotation: "failed",
+			want:       false,
+		},
+		{
+			name:     "active lease held elsewhere",
 			nodeName: testNodeName,
 			phase:    corev1.PodRunning,
 			ready:    true,
 			hash:     "abc123",
-			annotation: "in_progress",
+			lease:    makeLease("default", "checkpoint-job", "other-holder", time.Now()),
 			want:     false,
 		},
+		{
+			name:     "expired lease can be reclaimed",
+			nodeName: testNodeName,
+			phase:    corev1.PodRunning,
+			ready:    true,
+			hash:     "abc123",
+			lease:    makeLease("default", "checkpoint-job", "other-holder", time.Now().Add(-checkpointLeaseDuration-time.Second)),
+			want:     true,
+		},
 		{
 			name:     "duplicate in-flight",
 			nodeName: testNodeName,
@@ -149,27 +188,45 @@ func TestHandleCheckpointPodEvent(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			labels := map[string]string{
 				kubeLabelIsCheckpointSource:    "true",
+				"batch.kubernetes.io/job-name": "checkpoint-job",
 			}
 			if tc.hash != "" {
 				labels[kubeLabelCheckpointHash] = tc.hash
 			}

-			var annotations map[string]string
+			job := &batchv1.Job{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "checkpoint-job",
+					Namespace: "default",
+				},
+			}
 			if tc.annotation != "" {
-				annotations = map[string]string{
+				job.Annotations = map[string]string{
 					kubeAnnotationCheckpointStatus: tc.annotation,
 				}
 			}

+			var annotations map[string]string
+			if tc.hash != "" {
+				annotations = map[string]string{
+					kubeAnnotationCheckpointLocation:    "/checkpoints/" + tc.hash,
+					kubeAnnotationCheckpointStorageType: "pvc",
+				}
+			}
 			pod := makePod("test-pod", "default", tc.nodeName, tc.phase, tc.ready, labels, annotations)
-			w := makeTestWatcher(t)
+			objs := []runtime.Object{job}
+			if tc.lease != nil {
+				objs = append(objs, tc.lease)
+			}
+
+			w := makeTestController(t, objs...)
 			ctx := context.Background()

 			if tc.preSeed {
 				w.inFlight["default/test-pod"] = struct{}{}
 			}

-			w.handleCheckpointPodEvent(ctx, pod)
+			w.reconcileCheckpointPod(ctx, pod)

 			// tryAcquire adds to inFlight synchronously before launching the goroutine.
 			// For filtered pods, inFlight stays at its original size.
@@ -191,14 +248,15 @@ func TestHandleCheckpointPodEvent(t *testing.T) {
 	}
 }

-func TestHandleRestorePodEvent(t *testing.T) {
+func TestReconcileRestorePod(t *testing.T) {
 	tests := []struct {
 		name                  string
 		nodeName              string
 		phase                 corev1.PodPhase
 		ready                 bool
 		hash                  string
-		annotation string
+		annotationStatus      string
+		annotationContainerID string
 		createDir             bool // whether to create the checkpoint dir on disk
 		preSeed               bool
 		want                  bool
@@ -257,34 +315,48 @@ func TestHandleRestorePodEvent(t *testing.T) {
 			want:      false,
 		},
 		{
-			name:       "already completed",
+			name:                  "already completed for same container",
 			nodeName:              testNodeName,
 			phase:                 corev1.PodRunning,
 			ready:                 false,
 			hash:                  "abc123",
-			annotation: "completed",
+			annotationStatus:      "completed",
+			annotationContainerID: testContainerID,
 			createDir:             true,
 			want:                  false,
 		},
 		{
-			name:       "already in progress",
+			name:                  "already in progress for same container",
 			nodeName:              testNodeName,
 			phase:                 corev1.PodRunning,
 			ready:                 false,
 			hash:                  "abc123",
-			annotation: "in_progress",
+			annotationStatus:      "in_progress",
+			annotationContainerID: testContainerID,
 			createDir:             true,
 			want:                  false,
 		},
 		{
-			name:       "already failed",
+			name:                  "completed for previous container retries",
 			nodeName:              testNodeName,
 			phase:                 corev1.PodRunning,
 			ready:                 false,
 			hash:                  "abc123",
-			annotation: "failed",
+			annotationStatus:      "completed",
+			annotationContainerID: "old-container",
 			createDir:             true,
-			want:       false,
+			want:                  true,
+		},
+		{
+			name:                  "in progress for previous container retries",
+			nodeName:              testNodeName,
+			phase:                 corev1.PodRunning,
+			ready:                 false,
+			hash:                  "abc123",
+			annotationStatus:      "in_progress",
+			annotationContainerID: "old-container",
+			createDir:             true,
+			want:                  true,
 		},
 		{
 			name:      "checkpoint not on disk",
@@ -316,18 +388,33 @@ func TestHandleRestorePodEvent(t *testing.T) {
 				labels[kubeLabelCheckpointHash] = tc.hash
 			}

+			w := makeTestController(t)
+			checkpointDir := t.TempDir()
+
 			var annotations map[string]string
-			if tc.annotation != "" {
+			if tc.annotationStatus != "" {
 				annotations = map[string]string{
-					kubeAnnotationRestoreStatus: tc.annotation,
+					kubeAnnotationRestoreStatus:      tc.annotationStatus,
+					kubeAnnotationRestoreContainerID: tc.annotationContainerID,
+				}
+			}
+			if tc.hash != "" {
+				if annotations == nil {
+					annotations = make(map[string]string)
 				}
+				annotations[kubeAnnotationCheckpointLocation] = filepath.Join(checkpointDir, tc.hash)
+				annotations[kubeAnnotationCheckpointStorageType] = "pvc"
 			}

 			pod := makePod("test-pod", "default", tc.nodeName, tc.phase, tc.ready, labels, annotations)
-			w := makeTestWatcher(t)
+			pod.Status.ContainerStatuses = []corev1.ContainerStatus{{
+				Name:        "main",
+				Ready:       tc.ready,
+				ContainerID: "containerd://" + testContainerID,
+			}}

 			if tc.createDir && tc.hash != "" {
-				dir := filepath.Join(w.config.BasePath, tc.hash)
+				dir := filepath.Join(checkpointDir, tc.hash)
 				if err := os.MkdirAll(dir, 0o755); err != nil {
 					t.Fatalf("failed to create checkpoint dir: %v", err)
 				}
@@ -336,10 +423,10 @@ func TestHandleRestorePodEvent(t *testing.T) {
 			ctx := context.Background()

 			if tc.preSeed {
-				w.inFlight["default/test-pod"] = struct{}{}
+				w.inFlight["default/test-pod/"+testContainerID] = struct{}{}
 			}

-			w.handleRestorePodEvent(ctx, pod)
+			w.reconcileRestorePod(ctx, pod)

 			triggered := len(w.inFlight) > 0 && !tc.preSeed
 			if tc.preSeed {
@@ -358,91 +445,60 @@ func TestHandleRestorePodEvent(t *testing.T) {
 	}
 }

-func TestDoCheckpointKeepsInFlightOnTerminalStatusPatchFailure(t *testing.T) {
+func TestRunCheckpointKeepsLeaseAndInFlightOnTerminalStatusPatchFailure(t *testing.T) {
 	pod := &corev1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      "test-pod",
 			Namespace: "default",
+			Labels: map[string]string{
+				"batch.kubernetes.io/job-name": "checkpoint-job",
 			},
-	}
-
-	clientset := fake.NewClientset(pod.DeepCopy())
-	patchCalls := 0
-	clientset.PrependReactor("patch", "pods", func(clientgotesting.Action) (bool, runtime.Object, error) {
-		patchCalls++
-		if patchCalls == 1 {
-			return false, nil, nil
-		}
-		return true, nil, errors.New("terminal patch failed")
-	})
-
-	w := &Watcher{
-		config: &types.AgentConfig{
-			NodeName: testNodeName,
-			BasePath: t.TempDir(),
 		},
-		clientset: clientset,
-		log:       testr.New(t),
-		inFlight: map[string]struct{}{
-			"default/test-pod": {},
-		},
-		stopCh: make(chan struct{}),
-	}
-
-	err := w.doCheckpoint(context.Background(), pod, "abc123", "default/test-pod")
-	if err == nil {
-		t.Fatal("expected terminal checkpoint status update to fail")
-	}
-	if _, ok := w.inFlight["default/test-pod"]; !ok {
-		t.Fatal("checkpoint terminal status failure should keep pod in-flight")
-	}
-	if patchCalls != 1+terminalStatusPatchRetryAttempts {
-		t.Fatalf("patchCalls = %d, want %d", patchCalls, 1+terminalStatusPatchRetryAttempts)
 	}
-}
-
-func TestDoRestoreKeepsInFlightOnTerminalStatusPatchFailure(t *testing.T) {
-	pod := &corev1.Pod{
+	job := &batchv1.Job{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      "test-pod",
+			Name:      "checkpoint-job",
 			Namespace: "default",
 		},
-		Status: corev1.PodStatus{
-			Phase: corev1.PodRunning,
-		},
 	}
+	lease := makeLease("default", "checkpoint-job", "test-holder", time.Now())

-	clientset := fake.NewClientset(pod.DeepCopy())
+	clientset := fake.NewClientset(pod.DeepCopy(), job, lease)
 	patchCalls := 0
-	clientset.PrependReactor("patch", "pods", func(clientgotesting.Action) (bool, runtime.Object, error) {
+	clientset.PrependReactor("patch", "jobs", func(clientgotesting.Action) (bool, runtime.Object, error) {
 		patchCalls++
-		if patchCalls == 1 {
-			return false, nil, nil
-		}
 		return true, nil, errors.New("terminal patch failed")
 	})

-	w := &Watcher{
+	w := &NodeController{
 		config: &types.AgentConfig{
 			NodeName: testNodeName,
-			BasePath: t.TempDir(),
 		},
 		clientset: clientset,
 		log:       testr.New(t),
+		holderID:  "test-holder",
 		inFlight: map[string]struct{}{
 			"default/test-pod": {},
 		},
 		stopCh: make(chan struct{}),
 	}

-	err := w.doRestore(context.Background(), pod, "abc123", "default/test-pod")
+	err := w.runCheckpoint(context.Background(), pod, job, "abc123", filepath.Join(t.TempDir(), "abc123"), "pvc", "default/test-pod")
 	if err == nil {
-		t.Fatal("expected terminal restore status update to fail")
+		t.Fatal("expected terminal checkpoint status update to fail")
 	}
 	if _, ok := w.inFlight["default/test-pod"]; !ok {
-		t.Fatal("restore terminal status failure should keep pod in-flight")
+		t.Fatal("checkpoint terminal status failure should keep pod in-flight")
+	}
+	if patchCalls != 1 {
+		t.Fatalf("patchCalls = %d, want %d", patchCalls, 1)
+	}
+
+	remainingLease, err := clientset.CoordinationV1().Leases("default").Get(context.Background(), "checkpoint-job", metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("expected checkpoint lease to remain after terminal status patch failure: %v", err)
 	}
-	if patchCalls != 1+terminalStatusPatchRetryAttempts {
-		t.Fatalf("patchCalls = %d, want %d", patchCalls, 1+terminalStatusPatchRetryAttempts)
+	if remainingLease.Spec.HolderIdentity == nil || *remainingLease.Spec.HolderIdentity != "test-holder" {
+		t.Fatalf("unexpected remaining lease holder: %#v", remainingLease.Spec.HolderIdentity)
 	}
 }
--- a/deploy/snapshot/pkg/watcher/util.go
+++ b/deploy/snapshot/pkg/watcher/util.go
-package watcher
+package controller

 import (
 	"context"
@@ -7,7 +7,10 @@ import (
 	"time"

 	"github.com/go-logr/logr"
+	batchv1 "k8s.io/api/batch/v1"
+	coordinationv1 "k8s.io/api/coordination/v1"
 	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	ktypes "k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/kubernetes"
@@ -15,8 +18,8 @@ import (
 )

 const (
-	terminalStatusPatchRetryAttempts = 3
-	terminalStatusPatchRetryDelay    = 10 * time.Millisecond
+	checkpointLeaseDuration      = 30 * time.Second
+	checkpointLeaseRenewInterval = 10 * time.Second
 )

 func podFromInformerObj(obj interface{}) (*corev1.Pod, bool) {
@@ -78,30 +81,176 @@ func annotatePod(ctx context.Context, clientset kubernetes.Interface, log logr.L
 	return err
 }

-func annotatePodRetry(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, pod *corev1.Pod, annotations map[string]string) error {
-	delay := terminalStatusPatchRetryDelay
-	var lastErr error
+// getCheckpointJob looks up the batch Job named by the pod's
+// batch.kubernetes.io/job-name label, in the pod's namespace. It returns an
+// error when the label is absent or empty, or when the Job cannot be fetched.
+func getCheckpointJob(ctx context.Context, clientset kubernetes.Interface, pod *corev1.Pod) (*batchv1.Job, error) {
+	namespace := pod.Namespace
+	name := pod.Labels["batch.kubernetes.io/job-name"]
+	if name == "" {
+		return nil, fmt.Errorf("pod %s/%s has no batch.kubernetes.io/job-name label", namespace, pod.Name)
+	}
+
+	jobs := clientset.BatchV1().Jobs(namespace)
+	found, getErr := jobs.Get(ctx, name, metav1.GetOptions{})
+	if getErr != nil {
+		return nil, fmt.Errorf("failed to get checkpoint job %s/%s: %w", namespace, name, getErr)
+	}
+	return found, nil
+}
+
+// isLeaseExpired reports whether lease should be considered expired at now.
+// A nil lease, a lease without LeaseDurationSeconds, and a lease with neither
+// RenewTime nor AcquireTime are all treated as expired. Otherwise the lease is
+// expired once now is strictly after the most recent timestamp (RenewTime,
+// falling back to AcquireTime) plus the lease duration.
+func isLeaseExpired(lease *coordinationv1.Lease, now time.Time) bool {
+	if lease == nil {
+		return true
+	}
+	spec := &lease.Spec
+	if spec.LeaseDurationSeconds == nil {
+		return true
+	}
+
+	stamp := spec.RenewTime
+	if stamp == nil {
+		stamp = spec.AcquireTime
+	}
+	if stamp == nil {
+		return true
+	}
+
+	ttl := time.Duration(*spec.LeaseDurationSeconds) * time.Second
+	deadline := stamp.Time.Add(ttl)
+	return now.After(deadline)
+}

-	for attempt := 1; attempt <= terminalStatusPatchRetryAttempts; attempt++ {
-		if err := annotatePod(ctx, clientset, log, pod, annotations); err == nil {
+// acquireCheckpointLease tries to take ownership of the coordination Lease that
+// guards checkpointing for job. The Lease uses the Job's name and namespace.
+//
+// Returns:
+//   - (true, nil) when holderIdentity owns the Lease after the call, either
+//     because it was created here or because an existing Lease was updated.
+//   - (false, nil) when a different holder owns an unexpired Lease, when a
+//     concurrent Create reported AlreadyExists, or when Update reported Conflict.
+//   - (false, err) for any other API error.
+func acquireCheckpointLease(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, job *batchv1.Job, holderIdentity string) (bool, error) {
+	leaseName := job.Name
+	now := metav1.NewMicroTime(time.Now())
+	leaseDurationSeconds := int32(checkpointLeaseDuration.Seconds())
+
+	leaseClient := clientset.CoordinationV1().Leases(job.Namespace)
+	existingLease, err := leaseClient.Get(ctx, leaseName, metav1.GetOptions{})
+	if err != nil {
+		if !apierrors.IsNotFound(err) {
+			return false, fmt.Errorf("failed to get checkpoint lease %s/%s: %w", job.Namespace, leaseName, err)
+		}
+
+		// No Lease exists yet: create one owned by holderIdentity.
+		lease := &coordinationv1.Lease{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      leaseName,
+				Namespace: job.Namespace,
+			},
+			Spec: coordinationv1.LeaseSpec{
+				HolderIdentity:       &holderIdentity,
+				LeaseDurationSeconds: &leaseDurationSeconds,
+				AcquireTime:          &now,
+				RenewTime:            &now,
+			},
+		}
+
+		if _, err := leaseClient.Create(ctx, lease, metav1.CreateOptions{}); err != nil {
+			if apierrors.IsAlreadyExists(err) {
+				// Someone else created the Lease between our Get and Create.
+				return false, nil
+			}
+			return false, fmt.Errorf("failed to create checkpoint lease %s/%s: %w", job.Namespace, leaseName, err)
+		}
+		return true, nil
+	}
+
+	// Judge expiry and ownership against the Lease as stored, before any field
+	// is overwritten below. Re-checking expiry after LeaseDurationSeconds has
+	// been replaced could report a stale Lease as live and keep its old
+	// AcquireTime.
+	expired := isLeaseExpired(existingLease, now.Time)
+	previousHolder := existingLease.Spec.HolderIdentity
+	sameHolder := previousHolder != nil && *previousHolder == holderIdentity
+	if !expired && previousHolder != nil && !sameHolder {
+		return false, nil
+	}
+
+	existingLease.Spec.HolderIdentity = &holderIdentity
+	existingLease.Spec.LeaseDurationSeconds = &leaseDurationSeconds
+	// A takeover (expired Lease or change of holder) starts a new tenure; a
+	// re-acquire by the same live holder keeps the original AcquireTime.
+	if existingLease.Spec.AcquireTime == nil || expired || !sameHolder {
+		existingLease.Spec.AcquireTime = &now
+	}
+	existingLease.Spec.RenewTime = &now
+
+	if _, err := leaseClient.Update(ctx, existingLease, metav1.UpdateOptions{}); err != nil {
+		if apierrors.IsConflict(err) {
+			// A concurrent writer modified the Lease first; treat as not acquired.
+			log.V(1).Info("Checkpoint lease update conflicted", "lease", fmt.Sprintf("%s/%s", job.Namespace, leaseName))
+			return false, nil
+		}
+		return false, fmt.Errorf("failed to update checkpoint lease %s/%s: %w", job.Namespace, leaseName, err)
+	}
+
+	return true, nil
+}
+
+// renewCheckpointLease refreshes the checkpoint Lease named after job on behalf
+// of holderIdentity. It fails when the Lease cannot be fetched, when it is not
+// currently held by holderIdentity, or when the update is rejected. On success
+// RenewTime is set to the current time and LeaseDurationSeconds is reset to
+// checkpointLeaseDuration.
+func renewCheckpointLease(ctx context.Context, clientset kubernetes.Interface, job *batchv1.Job, holderIdentity string) error {
+	namespace, name := job.Namespace, job.Name
+	leases := clientset.CoordinationV1().Leases(namespace)
+
+	current, err := leases.Get(ctx, name, metav1.GetOptions{})
+	if err != nil {
+		return fmt.Errorf("failed to get checkpoint lease %s/%s for renewal: %w", namespace, name, err)
+	}
+	if owner := current.Spec.HolderIdentity; owner == nil || *owner != holderIdentity {
+		return fmt.Errorf("checkpoint lease %s/%s is no longer held by %q", namespace, name, holderIdentity)
+	}
+
+	renewedAt := metav1.NewMicroTime(time.Now())
+	durationSeconds := int32(checkpointLeaseDuration.Seconds())
+	current.Spec.RenewTime = &renewedAt
+	current.Spec.LeaseDurationSeconds = &durationSeconds
+
+	if _, err = leases.Update(ctx, current, metav1.UpdateOptions{}); err != nil {
+		return fmt.Errorf("failed to renew checkpoint lease %s/%s: %w", namespace, name, err)
+	}
+	return nil
+}
+
+// releaseCheckpointLease deletes the checkpoint Lease named after job, but only
+// when it is currently held by holderIdentity. A Lease that is already gone, or
+// that is held by someone else (or by nobody), is left alone and nil is
+// returned. Errors from fetching or deleting the Lease are wrapped and returned.
+func releaseCheckpointLease(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, job *batchv1.Job, holderIdentity string) error {
+	leaseName := job.Name
+	leaseClient := clientset.CoordinationV1().Leases(job.Namespace)
+	lease, err := leaseClient.Get(ctx, leaseName, metav1.GetOptions{})
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			return nil
+		}
+		return fmt.Errorf("failed to get checkpoint lease %s/%s for release: %w", job.Namespace, leaseName, err)
+	}
+
+	// Never delete a Lease this caller does not own. Note that the "holder"
+	// key logged here is the caller's identity, not the Lease's current holder.
+	if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != holderIdentity {
+		log.V(1).Info("Skipping checkpoint lease release because another holder owns it",
+			"lease", fmt.Sprintf("%s/%s", job.Namespace, leaseName),
+			"holder", holderIdentity,
+		)
 		return nil
-		} else {
-			lastErr = err
 	}

-		if attempt == terminalStatusPatchRetryAttempts {
-			break
+	if err := leaseClient.Delete(ctx, leaseName, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) {
+		return fmt.Errorf("failed to delete checkpoint lease %s/%s: %w", job.Namespace, leaseName, err)
 	}
+	return nil
+}
+
+// renewCheckpointLease renews the checkpoint Lease for job once every
+// checkpointLeaseRenewInterval until ctx is done. On the first renewal failure
+// it logs the error, calls stopLease with a wrapped cause, and returns.
+func (w *NodeController) renewCheckpointLease(ctx context.Context, log logr.Logger, job *batchv1.Job, stopLease context.CancelCauseFunc) {
+	ticker := time.NewTicker(checkpointLeaseRenewInterval)
+	defer ticker.Stop()

+	for {
 		select {
 		case <-ctx.Done():
-			return fmt.Errorf("pod annotation retry interrupted: %w", ctx.Err())
-		case <-time.After(delay):
+			return
+		case <-ticker.C:
+			if err := renewCheckpointLease(ctx, w.clientset, job, w.holderID); err != nil {
+				log.Error(err, "Failed to renew checkpoint lease")
+				stopLease(fmt.Errorf("checkpoint lease renewal failed: %w", err))
+				return
+			}
 		}
-		delay *= 2
 	}
+}

-	return fmt.Errorf("failed to annotate pod after %d attempts: %w", terminalStatusPatchRetryAttempts, lastErr)
+// annotateJob applies annotations to job with a JSON merge patch on
+// metadata.annotations. A patch failure is logged and the error is returned
+// unwrapped; a failure to marshal the payload is returned wrapped.
+func annotateJob(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, job *batchv1.Job, annotations map[string]string) error {
+	patchBytes, err := json.Marshal(map[string]any{
+		"metadata": map[string]any{
+			"annotations": annotations,
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to build job annotation patch payload: %w", err)
+	}
+
+	_, err = clientset.BatchV1().Jobs(job.Namespace).Patch(
+		ctx, job.Name, ktypes.MergePatchType, patchBytes, metav1.PatchOptions{},
+	)
+	if err != nil {
+		log.Error(err, "Failed to annotate checkpoint job",
+			"job", fmt.Sprintf("%s/%s", job.Namespace, job.Name),
+			"annotations", annotations,
+		)
+	}
+	return err
 }

 func waitForPodReady(ctx context.Context, clientset kubernetes.Interface, namespace, podName, containerName string) error {

--- a/deploy/snapshot/pkg/criu/restore.go
+++ b/deploy/snapshot/pkg/criu/restore.go
@@ -15,7 +15,7 @@ import (
 	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
 )

-// RestoreLogFilename is the CRIU restore log filename (also used by orchestrate/restore.go).
+// RestoreLogFilename is the CRIU restore log filename (also used by executor/restore.go).
 const RestoreLogFilename = "restore.log"

 const (

--- a/deploy/snapshot/pkg/criu/util.go
+++ b/deploy/snapshot/pkg/criu/util.go
@@ -43,9 +43,14 @@ func shouldSetCgroupRoot(cgMode criurpc.CriuCgMode) bool {

 // applyCommonSettings sets CRIU options shared between dump and restore.
 func applyCommonSettings(opts *criurpc.CriuOpts, settings *types.CRIUSettings) error {
+	if settings.TcpClose && settings.TcpEstablished {
+		return fmt.Errorf("tcpClose and tcpEstablished cannot both be true")
+	}
+
 	opts.LogLevel = proto.Int32(settings.LogLevel)
 	opts.ShellJob = proto.Bool(settings.ShellJob)
 	opts.TcpClose = proto.Bool(settings.TcpClose)
+	opts.TcpEstablished = proto.Bool(settings.TcpEstablished)
 	opts.FileLocks = proto.Bool(settings.FileLocks)
 	opts.ExtUnixSk = proto.Bool(settings.ExtUnixSk)
 	opts.LinkRemap = proto.Bool(settings.LinkRemap)

--- a/deploy/snapshot/pkg/criu/util_test.go
+++ b/deploy/snapshot/pkg/criu/util_test.go
@@ -53,7 +53,7 @@ func TestApplyCommonSettings(t *testing.T) {
 		settings := &types.CRIUSettings{
 			LogLevel:          4,
 			ShellJob:          true,
-			TcpClose:          true,
+			TcpEstablished:    true,
 			FileLocks:         true,
 			ExtUnixSk:         true,
 			LinkRemap:         true,
@@ -70,8 +70,11 @@ func TestApplyCommonSettings(t *testing.T) {
 		if !opts.GetShellJob() {
 			t.Error("ShellJob should be true")
 		}
-		if !opts.GetTcpClose() {
-			t.Error("TcpClose should be true")
+		if !opts.GetTcpEstablished() {
+			t.Error("TcpEstablished should be true")
+		}
+		if opts.GetTcpClose() {
+			t.Error("TcpClose should be false")
 		}
 		if !opts.GetFileLocks() {
 			t.Error("FileLocks should be true")
@@ -97,6 +100,17 @@ func TestApplyCommonSettings(t *testing.T) {
 			t.Error("expected error for invalid ManageCgroupsMode")
 		}
 	})
+
+	t.Run("conflicting tcp settings return error", func(t *testing.T) {
+		opts := &criurpc.CriuOpts{}
+		settings := &types.CRIUSettings{
+			TcpClose:       true,
+			TcpEstablished: true,
+		}
+		if err := applyCommonSettings(opts, settings); err == nil {
+			t.Error("expected error for conflicting tcp settings")
+		}
+	})
 }

 func TestBuildRestoreExtMounts(t *testing.T) {

--- a/deploy/snapshot/pkg/orchestrate/checkpoint.go
+++ b/deploy/snapshot/pkg/orchestrate/checkpoint.go
-// Package orchestrate provides the top-level checkpoint and restore orchestrators.
+// Package executor provides the top-level checkpoint and restore executors.
 // These wire together the lib packages (criu, cuda, etc.) into multi-step workflows.
-package orchestrate
+package executor

 import (
 	"context"
@@ -12,6 +12,7 @@ import (
 	criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
 	"github.com/containerd/containerd"
 	"github.com/go-logr/logr"
+	"github.com/google/uuid"

 	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
 	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu"
@@ -24,7 +25,8 @@ type CheckpointRequest struct {
 	ContainerID           string
 	ContainerName         string
 	CheckpointHash        string
-	CheckpointDir  string
+	CheckpointLocation    string
+	CheckpointStorageType string
 	NodeName              string
 	PodName               string
 	PodNamespace          string
@@ -33,21 +35,31 @@ type CheckpointRequest struct {
 // Checkpoint performs a CRIU dump of a container.
 // The operation has three phases: inspect, configure, capture.
 //
-// The checkpoint directory is staged under tmp/<hash> during the operation.
-// On success, it is atomically renamed to <hash> at the base path root.
+// The checkpoint directory is staged under tmp/<uuid> during the operation.
+// On success, the previous checkpoint is removed and the staged directory is
+// renamed into place at the base path root.
 func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req CheckpointRequest, cfg *types.AgentConfig) error {
 	checkpointStart := time.Now()
 	log.Info("=== Starting checkpoint operation ===")

-	finalDir := filepath.Join(req.CheckpointDir, req.CheckpointHash)
-	tmpDir := filepath.Join(req.CheckpointDir, "tmp", req.CheckpointHash)
-	if err := os.RemoveAll(tmpDir); err != nil {
-		return fmt.Errorf("failed to clean checkpoint staging directory: %w", err)
+	if req.CheckpointStorageType != "pvc" {
+		return fmt.Errorf("checkpoint storage type %q is not supported", req.CheckpointStorageType)
 	}
-	if err := os.MkdirAll(tmpDir, 0700); err != nil {
-		return fmt.Errorf("failed to create checkpoint directory: %w", err)
+	if req.CheckpointLocation == "" {
+		return fmt.Errorf("checkpoint location is required")
 	}

+	finalDir := req.CheckpointLocation
+	tmpRoot := filepath.Join(filepath.Dir(finalDir), "tmp")
+	if err := os.MkdirAll(tmpRoot, 0700); err != nil {
+		return fmt.Errorf("failed to create checkpoint staging root: %w", err)
+	}
+	tmpDir := filepath.Join(tmpRoot, uuid.NewString())
+	if err := os.Mkdir(tmpDir, 0700); err != nil {
+		return fmt.Errorf("failed to create checkpoint staging directory: %w", err)
+	}
+	defer os.RemoveAll(tmpDir)
+
 	// Phase 1: Inspect container state
 	state, err := inspectContainer(ctx, ctrd, log, req)
 	if err != nil {
@@ -67,7 +79,9 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r
 	}

 	// Remove any previous checkpoint with the same identity hash before finalizing
-	os.RemoveAll(finalDir)
+	if err := os.RemoveAll(finalDir); err != nil {
+		return fmt.Errorf("failed to remove previous checkpoint directory: %w", err)
+	}
 	if err := os.Rename(tmpDir, finalDir); err != nil {
 		return fmt.Errorf("failed to finalize checkpoint directory: %w", err)
 	}

--- a/deploy/snapshot/pkg/orchestrate/nsrestore.go
+++ b/deploy/snapshot/pkg/orchestrate/nsrestore.go
-package orchestrate
+package executor

 import (
 	"context"

--- a/deploy/snapshot/pkg/orchestrate/restore.go
+++ b/deploy/snapshot/pkg/orchestrate/restore.go
-package orchestrate
+package executor

 import (
 	"bytes"
@@ -25,7 +25,8 @@ import (
 // RestoreRequest holds the parameters for a restore operation.
 type RestoreRequest struct {
 	CheckpointHash        string
-	CheckpointBase string
+	CheckpointLocation    string
+	CheckpointStorageType string
 	NSRestorePath         string
 	PodName               string
 	PodNamespace          string
@@ -72,8 +73,15 @@ func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req
 }

 func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req RestoreRequest) (*types.RestoreContainerSnapshot, error) {
-	checkpointPath := filepath.Join(req.CheckpointBase, req.CheckpointHash)
-	baseAbs, err := filepath.Abs(req.CheckpointBase)
+	if req.CheckpointStorageType != "pvc" {
+		return nil, fmt.Errorf("checkpoint storage type %q is not supported", req.CheckpointStorageType)
+	}
+	if req.CheckpointLocation == "" {
+		return nil, fmt.Errorf("checkpoint location is required")
+	}
+
+	checkpointPath := req.CheckpointLocation
+	baseAbs, err := filepath.Abs(filepath.Dir(checkpointPath))
 	if err != nil {
 		return nil, fmt.Errorf("failed to resolve checkpoint base path: %w", err)
 	}

--- a/deploy/snapshot/pkg/types/config.go
+++ b/deploy/snapshot/pkg/types/config.go
@@ -4,7 +4,6 @@ package types
 import (
 	"fmt"
 	"os"
-	"strings"
 	"time"
 )

@@ -13,7 +12,6 @@ import (
 type AgentConfig struct {
 	NodeName            string          `yaml:"-"`
 	RestrictedNamespace string          `yaml:"-"`
-	BasePath            string          `yaml:"basePath"`
 	Overlay             OverlaySettings `yaml:"overlay"`
 	Restore             RestoreSpec     `yaml:"restore"`
 	CRIU                CRIUSettings    `yaml:"criu"`
@@ -29,8 +27,11 @@ func (c *AgentConfig) LoadEnvOverrides() {
 }

+// Validate returns a ConfigError when the CRIU settings enable both tcpClose and
+// tcpEstablished; otherwise it returns the result of validating the Restore spec.
 func (c *AgentConfig) Validate() error {
-	if strings.TrimSpace(c.BasePath) == "" {
-		return &ConfigError{Field: "basePath", Message: "basePath is required"}
+	if c.CRIU.TcpClose && c.CRIU.TcpEstablished {
+		return &ConfigError{
+			Field:   "criu",
+			Message: "tcpClose and tcpEstablished cannot both be true",
+		}
 	}
 	return c.Restore.Validate()
 }
@@ -65,6 +66,7 @@ type CRIUSettings struct {
 	LeaveRunning      bool   `yaml:"leaveRunning"`
 	ShellJob          bool   `yaml:"shellJob"`
 	TcpClose          bool   `yaml:"tcpClose"`
+	TcpEstablished    bool   `yaml:"tcpEstablished"`
 	FileLocks         bool   `yaml:"fileLocks"`
 	OrphanPtsMaster   bool   `yaml:"orphanPtsMaster"`
 	ExtUnixSk         bool   `yaml:"extUnixSk"`
@@ -83,9 +85,7 @@ type CRIUSettings struct {

 // OverlaySettings is the static config for rootfs exclusions.
 type OverlaySettings struct {
-	SystemDirs           []string `yaml:"systemDirs"`
-	CacheDirs            []string `yaml:"cacheDirs"`
-	AdditionalExclusions []string `yaml:"additionalExclusions"`
+	Exclusions []string `yaml:"exclusions"`
 }

 // ConfigError represents a configuration validation error.

--- a/deploy/snapshot/pkg/types/manifest_test.go
+++ b/deploy/snapshot/pkg/types/manifest_test.go
@@ -24,7 +24,7 @@ func TestManifestRoundTrip(t *testing.T) {
 		},
 		NewSourcePodManifest("ctr-abc", 42, "node-1", "my-pod", "default", []string{"pipe:[111]", "pipe:[222]", "pipe:[333]"}),
 		OverlayManifest{
-			Exclusions:     OverlaySettings{SystemDirs: []string{"/proc", "/sys"}},
+			Exclusions:     OverlaySettings{Exclusions: []string{"/proc", "/sys"}},
 			UpperDir:       "/var/lib/containerd/upper",
 			ExternalPaths:  []string{"/proc/acpi"},
 			BindMountDests: []string{"/data"},

--- a/docs/kubernetes/api-reference.md
+++ b/docs/kubernetes/api-reference.md
@@ -262,9 +262,10 @@ _Appears in:_
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
 | `podTemplateSpec` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | PodTemplateSpec allows customizing the checkpoint Job pod<br />This should include the container that runs the workload to be checkpointed |  | Required: \{\} <br /> |
-| `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Optional: \{\} <br /> |
-| `backoffLimit` _integer_ | BackoffLimit specifies the number of retries before marking the Job failed | 3 | Optional: \{\} <br /> |
-| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Optional: \{\} <br /> |
+| `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm for the checkpoint Job pod.<br />When omitted, checkpoint Jobs use the same default 8Gi tmpfs as Dynamo components. |  | Optional: \{\} <br /> |
+| `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Minimum: 1 <br />Optional: \{\} <br /> |
+| `backoffLimit` _integer_ | Deprecated: BackoffLimit is ignored. Checkpoint Jobs never retry. |  | Minimum: 0 <br />Optional: \{\} <br /> |
+| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Minimum: 0 <br />Optional: \{\} <br /> |


 #### DynamoCheckpointPhase
@@ -324,7 +325,7 @@ _Appears in:_
 | `jobName` _string_ | JobName is the name of the checkpoint creation Job |  | Optional: \{\} <br /> |
 | `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint tar was created |  | Optional: \{\} <br /> |
 | `message` _string_ | Message provides additional information about the current state |  | Optional: \{\} <br /> |
-| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represent the latest available observations of the checkpoint's state |  | Optional: \{\} <br /> |
+| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | DEPRECATED: Conditions are deprecated. Use status.phase instead. |  | Optional: \{\} <br /> |


 #### DynamoCheckpointStorageType
@@ -1155,7 +1156,7 @@ _Appears in:_
 | --- | --- | --- | --- |
 | `enabled` _boolean_ | Enabled indicates whether checkpointing is enabled for this service | false | Optional: \{\} <br /> |
 | `mode` _[CheckpointMode](#checkpointmode)_ | Mode defines how checkpoint creation is handled<br />- Auto: DGD controller creates Checkpoint CR automatically<br />- Manual: User must create Checkpoint CR | Auto | Enum: [Auto Manual] <br />Optional: \{\} <br /> |
-| `checkpointRef` _string_ | CheckpointRef references an existing Checkpoint CR to use<br />If specified, Identity is ignored and this checkpoint is used directly |  | Optional: \{\} <br /> |
+| `checkpointRef` _string_ | CheckpointRef references an existing DynamoCheckpoint CR by metadata.name.<br />If specified, this service's Identity is ignored and the referenced checkpoint is used directly. |  | Optional: \{\} <br /> |
 | `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the checkpoint identity for hash computation<br />Used when Mode is Auto or when looking up existing checkpoints<br />Required when checkpointRef is not specified |  | Optional: \{\} <br /> |


@@ -1174,7 +1175,7 @@ _Appears in:_
 | --- | --- | --- | --- |
 | `checkpointName` _string_ | CheckpointName is the name of the associated Checkpoint CR |  | Optional: \{\} <br /> |
 | `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity |  | Optional: \{\} <br /> |
-| `ready` _boolean_ | Ready indicates if the checkpoint is ready for use |  | Optional: \{\} <br /> |
+| `ready` _boolean_ | Ready indicates if the checkpoint was visible to the worker at startup |  | Optional: \{\} <br /> |


 #### ServiceReplicaStatus
@@ -1208,6 +1209,7 @@ _Appears in:_


 _Appears in:_
+- [DynamoCheckpointJobConfig](#dynamocheckpointjobconfig)
 - [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
 - [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)

@@ -2349,17 +2351,6 @@ These are injected into all components when the corresponding infrastructure ser
 | --- | --- | --- | --- | --- |
 | `OMPI_MCA_orte_keep_fqdn_hostnames` | Instructs OpenMPI to preserve FQDN hostnames for inter-node communication | `1` | `string` | Multinode deployments only |

-### Checkpoint / Restore
-
-These environment variables are injected when checkpoint/restore is enabled for a component.
-
-| Variable | Purpose | Default | Type | Condition |
-| --- | --- | --- | --- | --- |
-| `DYN_CHECKPOINT_PATH` | Base directory where checkpoint data is stored | From operator checkpoint config `storage.pvc.basePath` | `string` | PVC storage type |
-| `DYN_CHECKPOINT_LOCATION` | Full checkpoint URI (for non-PVC backends) | — | `string` | S3 or OCI storage type |
-| `DYN_CHECKPOINT_HASH` | Identity hash that uniquely identifies the checkpoint | — | `string` | Always set when checkpoint is enabled |
-| `SKIP_WAIT_FOR_CHECKPOINT` | Skips the checkpoint readiness polling loop; checks once and proceeds | — | `string` | Set on restored and DGD pods |
-
 ## Service Accounts

 The following component types automatically receive dedicated service accounts:

--- a/docs/kubernetes/snapshot.md
+++ b/docs/kubernetes/snapshot.md
@@ -11,7 +11,7 @@ title: Snapshot
 | Startup Type | Time | What Happens |
 |--------------|------|--------------|
 | **Cold Start** | ~1 min | Download model, load to GPU, initialize engine |
-| **Warm Start** (restore from checkpoint) | ~ 10 sec | Restore from checkpoint tar |
+| **Warm Start** (restore from checkpoint) | ~10 sec | Restore from a ready checkpoint directory |

 > ⚠️ Restore time may vary depending on cluster configuration (storage bandwidth, GPU model, etc.)

@@ -146,34 +146,13 @@ spec:
          args:
            - --model
            - Qwen/Qwen3-0.6B
-            - --disable-custom-all-reduce
          env:
-            - name: GLOO_SOCKET_IFNAME
-              value: lo
-            - name: NCCL_SOCKET_IFNAME
-              value: lo
            - name: NCCL_DEBUG
              value: ERROR
            - name: TORCH_CPP_LOG_LEVEL
              value: ERROR
            - name: TORCH_DISTRIBUTED_DEBUG
              value: "OFF"
-            - name: CUDA_ERROR_LEVEL
-              value: "10"
-            - name: NCCL_CUMEM_ENABLE
-              value: "0"
-            - name: NCCL_CUMEM_HOST_ENABLE
-              value: "0"
-            - name: NCCL_NVLS_ENABLE
-              value: "0"
-            - name: NCCL_P2P_DISABLE
-              value: "0"
-            - name: NCCL_SHM_DISABLE
-              value: "1"
-            - name: NCCL_IB_DISABLE
-              value: "1"
-            - name: TORCH_NCCL_ENABLE_MONITORING
-              value: "0"
 ```

 For SGLang, use `dynamo.sglang`, an SGLang placeholder image, `backendFramework: sglang`, and the matching CLI flags.
@@ -184,24 +163,26 @@ Apply the manifest:
 kubectl apply -f vllm-snapshot-demo.yaml -n ${NAMESPACE}
 ```

-On the first rollout, the worker cold-starts, the operator creates a `DynamoCheckpoint`, and the checkpoint Job writes data into `snapshot-pvc`.
+On the first rollout, the worker cold-starts, the operator resolves the checkpoint identity hash, and the checkpoint Job writes a new checkpoint directory into `snapshot-pvc`.

 ### 5. Wait for the checkpoint to become ready

-Capture the checkpoint name from DGD status, then wait for the `DynamoCheckpoint` phase to become `Ready`:
+Auto mode resolves checkpoints by identity hash. It may create `checkpoint-<hash>` or reuse an existing checkpoint with a different CR name. For the sample identity above, the hash is `73e74442beb109ed`:

 ```bash
-CHECKPOINT_NAME=$(kubectl get dgd vllm-snapshot-demo -n ${NAMESPACE} \
-  -o jsonpath='{.status.checkpoints.VllmDecodeWorker.checkpointName}')
+kubectl get dckpt -n ${NAMESPACE}

+CKPT_NAME=$(kubectl get dckpt -n ${NAMESPACE} \
+  -l nvidia.com/snapshot-checkpoint-hash=73e74442beb109ed \
+  -o jsonpath='{.items[0].metadata.name}')
 kubectl wait \
  --for=jsonpath='{.status.phase}'=Ready \
-  "dynamocheckpoint/${CHECKPOINT_NAME}" \
+  "dynamocheckpoint/${CKPT_NAME}" \
  -n ${NAMESPACE} \
-  --timeout=30m
+  --timeout=5m
 ```

-The DGD status also reports the computed checkpoint hash at `.status.checkpoints.VllmDecodeWorker.identityHash`.
+If you change the checkpoint identity, the hash changes and so does the checkpoint selected by Auto mode.

 ### 6. Trigger restore

@@ -218,7 +199,7 @@ New worker pods for `VllmDecodeWorker` will restore from the ready checkpoint au

 ### Auto Mode (Recommended)

-The operator computes the checkpoint identity hash, looks for an existing `DynamoCheckpoint` with a matching `nvidia.com/snapshot-checkpoint-hash` label, and creates one if it does not find one:
+The operator computes the checkpoint identity hash, looks up an existing `DynamoCheckpoint` by that hash, and creates a new `DynamoCheckpoint` only when no matching checkpoint already exists:

 ```yaml
 checkpoint:
@@ -232,7 +213,12 @@ checkpoint:
    maxModelLen: 4096
 ```

-When a service uses checkpointing, DGD status reports the resolved `checkpointName`, `identityHash`, and `ready` fields under `.status.checkpoints.<service-name>`.
+The `DynamoGraphDeployment` mirrors checkpoint resolution state under `.status.checkpoints`, including the resolved checkpoint CR name, identity hash, and whether the checkpoint was visible to the worker when it started:
+
+```bash
+kubectl get dgd vllm-snapshot-demo -n ${NAMESPACE} \
+  -o jsonpath='{.status.checkpoints.VllmDecodeWorker.checkpointName}{"\n"}{.status.checkpoints.VllmDecodeWorker.identityHash}{"\n"}'
+```

 ### Manual Management and `checkpointRef`

@@ -241,26 +227,26 @@ Use `checkpointRef` when you want a service to restore from a specific `DynamoCh
 ```yaml
 checkpoint:
  enabled: true
-  checkpointRef: "qwen3-06b-vllm-prewarm"
+  checkpointRef: "qwen3-06b-bf16"
 ```

 This is useful when:
 - You want to **pre-warm checkpoints** before creating DGDs
 - You want **explicit control** over which checkpoint to use

-`checkpointRef` resolves by `DynamoCheckpoint.metadata.name`, not by `status.identityHash`. A manual checkpoint can use any valid Kubernetes resource name.
+`checkpointRef` resolves by `DynamoCheckpoint.metadata.name`. Use a readable CR name when you want an explicit checkpoint that users can reference directly.

 If you are managing checkpoint CRs yourself, set `mode: Manual` on the service to prevent the operator from creating a new `DynamoCheckpoint` when identity-based lookup does not find one.

 ```bash
 # Check checkpoint status by CR name
-kubectl get dynamocheckpoint qwen3-06b-vllm-prewarm -n ${NAMESPACE}
+kubectl get dynamocheckpoint qwen3-06b-bf16 -n ${NAMESPACE}

 # Now create DGD referencing it
 kubectl apply -f my-dgd.yaml -n ${NAMESPACE}
 ```

-If you want `mode: Auto` DGDs to discover a manually created checkpoint by identity, add the label `nvidia.com/snapshot-checkpoint-hash=<identity-hash>` to that `DynamoCheckpoint`. Auto-created checkpoints already use that label, and currently use the same hash as the CR name.
+`mode: Auto` still resolves checkpoints by identity hash. The operator backfills `status.identityHash` and the `nvidia.com/snapshot-checkpoint-hash` label on each `DynamoCheckpoint` so auto lookup and uniqueness checks do not depend on the CR name.

 ## Checkpoint Identity

@@ -309,7 +295,8 @@ The `DynamoCheckpoint` (shortname: `dckpt`) is a Kubernetes Custom Resource that
 - **Pre-warming:** Create checkpoints before deploying DGDs for instant startup
 - **Explicit control:** Manage checkpoint lifecycle independently from DGDs

-The operator requires `spec.identity` and `spec.job.podTemplateSpec`. The pod template should match the worker container you want checkpointed, including image, command, args, secrets, volumes, and resource limits. You do not need to set the checkpoint environment variables manually; the operator injects them for checkpoint jobs and restored pods.
+The operator requires `spec.identity` and `spec.job.podTemplateSpec`. The pod template should match the worker container you want checkpointed, including image, command, args, secrets, volumes, and resource limits. You do not need to set checkpoint-control plumbing manually; the operator injects the checkpoint-ready signal path for checkpoint Jobs and adds the restore metadata consumed by restored pods and the node-local controller inside the `snapshot-agent` DaemonSet.
+`spec.job.backoffLimit` is deprecated and ignored. Checkpoint Jobs are always single-attempt.

 **Create a checkpoint:**

@@ -317,9 +304,7 @@ The operator requires `spec.identity` and `spec.job.podTemplateSpec`. The pod te
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoCheckpoint
 metadata:
-  name: qwen3-06b-vllm-prewarm
-  labels:
-    nvidia.com/snapshot-checkpoint-hash: "e5962d34ba272638"  # Add this if Auto-mode identity lookup should find the CR
+  name: qwen3-06b-bf16
 spec:
  identity:
    model: Qwen/Qwen3-0.6B
@@ -330,7 +315,6 @@ spec:

  job:
    activeDeadlineSeconds: 3600
-    backoffLimit: 3
    ttlSecondsAfterFinished: 300
    podTemplateSpec:
      spec:
@@ -345,18 +329,19 @@ spec:
            args:
              - --model
              - Qwen/Qwen3-0.6B
-              - --disable-custom-all-reduce
            env:
-              - name: GLOO_SOCKET_IFNAME
-                value: lo
-              - name: NCCL_SOCKET_IFNAME
-                value: lo
+              - name: NCCL_DEBUG
+                value: ERROR
+              - name: TORCH_CPP_LOG_LEVEL
+                value: ERROR
+              - name: TORCH_DISTRIBUTED_DEBUG
+                value: "OFF"
            resources:
              limits:
                nvidia.com/gpu: "1"
 ```

-You can name the CR however you want if you plan to use `checkpointRef`. If you want `mode: Auto` identity lookup to find a manual CR, set the `nvidia.com/snapshot-checkpoint-hash` label to the computed 16-character identity hash. Using the hash as the CR name is a convenient convention, but it is not required.
+For this example identity, the operator computes a deterministic identity hash and stores it in `status.identityHash`. Auto mode uses that hash, not the CR name, when it decides whether to reuse or create a checkpoint.

 **Check status:**

@@ -367,8 +352,8 @@ kubectl get dynamocheckpoint -n ${NAMESPACE}
 kubectl get dckpt -n ${NAMESPACE}

 NAME               MODEL                                BACKEND  PHASE     HASH              AGE
-qwen3-06b-vllm-prewarm Qwen/Qwen3-0.6B            vllm     Ready    e5962d34ba272638  5m
-llama3-8b-vllm-prewarm meta-llama/Llama-3-8B      vllm     Creating 7ab4f89c12de3456  2m
+qwen3-06b-bf16     Qwen/Qwen3-0.6B                      vllm     Ready     3bff874d069f0ed5  5m
+llama3-8b-bf16     meta-llama/Meta-Llama-3-8B-Instruct  vllm     Creating  9be4f5574b5a285d  2m
 ```

 **Phases:**
@@ -380,45 +365,33 @@ llama3-8b-vllm-prewarm meta-llama/Llama-3-8B      vllm     Creating 7ab4f89c12de
 | `Ready` | Checkpoint available for use |
 | `Failed` | Checkpoint creation failed |

-`Ready` is a value in `status.phase`, not a Kubernetes condition. The `conditions` array tracks job lifecycle events:
-
-| Condition Type | Meaning |
-|----------------|---------|
-| `JobCreated` | The checkpoint Job has been created |
-| `JobCompleted` | The checkpoint Job has completed successfully or failed |
-
 Other useful status fields are:

 | Field | Meaning |
 |-------|---------|
+| `status.identityHash` | Deterministic hash of `spec.identity` used for auto lookup and reuse |
 | `status.jobName` | Name of the checkpoint Job |
-| `status.identityHash` | Computed 16-character hash for the checkpoint identity |
 | `status.location` | Checkpoint location in the configured storage backend |
 | `status.storageType` | Storage backend type (`pvc`, `s3`, or `oci`) |
 | `status.createdAt` | Timestamp recorded when the checkpoint becomes ready |
 | `status.message` | Failure or progress message when available |

+`status.conditions` is deprecated for `DynamoCheckpoint`. The legacy condition types `JobCreated` and `JobCompleted` are kept for compatibility only. Prefer `status.phase`, `status.jobName`, and `status.message` when checking checkpoint progress.
+
 **Detailed status:**

 ```bash
-kubectl describe dckpt qwen3-06b-vllm-prewarm -n ${NAMESPACE}
+kubectl describe dckpt qwen3-06b-bf16 -n ${NAMESPACE}
 ```

 ```yaml
 Status:
  Phase: Ready
-  IdentityHash: e5962d34ba272638
-  JobName: checkpoint-qwen3-06b-vllm-prewarm
-  Location: /checkpoints/e5962d34ba272638.tar
+  IdentityHash: 3bff874d069f0ed5
+  JobName: checkpoint-job-3bff874d069f0ed5
+  Location: /checkpoints/3bff874d069f0ed5
  StorageType: pvc
  CreatedAt: 2026-01-29T10:05:00Z
-  Conditions:
-    - Type: JobCreated
-      Status: "True"
-      Reason: JobCreated
-    - Type: JobCompleted
-      Status: "True"
-      Reason: JobSucceeded
 ```

 **Reference from DGD:**
@@ -431,16 +404,16 @@ spec:
    VllmDecodeWorker:
      checkpoint:
        enabled: true
-        checkpointRef: "qwen3-06b-vllm-prewarm"
+        checkpointRef: "qwen3-06b-bf16"
 ```

-Or use `mode: Auto` with the same identity and snapshot-hash label, and the operator will reuse it automatically.
+Or use `mode: Auto` with the same identity, and the operator will automatically reuse the existing checkpoint whose identity hash matches.

 ## Limitations

 - **LLM workers only**: Checkpoint/restore supports LLM decode and prefill workers. Specialized workers (multimodal, embedding, diffusion) are not supported.
 - **Single-GPU only**: Multi-GPU configurations may work in very basic hardware configurations, but are not officially supported yet.
-- **Network state**: No active TCP connections can be checkpointed
+- **Network state**: Restore is sensitive to live TCP socket state. Loopback bootstrap/control sockets can work with the supported CRIU TCP policies, but non-loopback or pod-IP-bound connections can still break restore.
 - **Security**: Dynamo Snapshot runs as a **privileged DaemonSet** which is required to run CRIU and cuda-checkpoint. However, workload pods do not need to be privileged.

 ## Troubleshooting
@@ -451,7 +424,10 @@ Or use `mode: Auto` with the same identity and snapshot-hash label, and the oper
   ```bash
   kubectl get dckpt -n ${NAMESPACE}
   kubectl describe dckpt <checkpoint-name> -n ${NAMESPACE}
-   kubectl logs job/$(kubectl get dckpt <checkpoint-name> -n ${NAMESPACE} -o jsonpath='{.status.jobName}') -n ${NAMESPACE}
+   JOB_NAME=$(kubectl get dckpt <checkpoint-name> -n ${NAMESPACE} -o jsonpath='{.status.jobName}')
+   if [ -n "${JOB_NAME}" ]; then
+     kubectl logs job/"${JOB_NAME}" -n ${NAMESPACE}
+   fi
   ```

 2. Check the DaemonSet: