refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)

Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)
Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
43e810a4 · Schwinn Saereesitthipitak · GitHub · 23144df5 · 43e810a4 · 43e810a4
Unverified Commit 43e810a4 authored Apr 03, 2026 by Schwinn Saereesitthipitak Committed by GitHub Apr 03, 2026
14 changed files
--- a/deploy/snapshot/pkg/common/process.go
+++ b/deploy/snapshot/pkg/common/process.go
-package common
+package runtime

 import (
 	"fmt"

--- a/deploy/snapshot/pkg/common/process_test.go
+++ b/deploy/snapshot/pkg/common/process_test.go
-package common
+package runtime

 import (
 	"os"

--- a/deploy/snapshot/pkg/types/config.go
+++ b/deploy/snapshot/pkg/types/config.go
@@ -4,6 +4,7 @@ package types
 import (
 	"fmt"
 	"os"
+	"strings"
 	"time"
 )

@@ -12,6 +13,7 @@ import (
 type AgentConfig struct {
 	NodeName            string          `yaml:"-"`
 	RestrictedNamespace string          `yaml:"-"`
+	Storage             StorageSpec     `yaml:"storage"`
 	Overlay             OverlaySettings `yaml:"overlay"`
 	Restore             RestoreSpec     `yaml:"restore"`
 	CRIU                CRIUSettings    `yaml:"criu"`
@@ -27,6 +29,16 @@ func (c *AgentConfig) LoadEnvOverrides() {
 }

 func (c *AgentConfig) Validate() error {
+	storageType := strings.TrimSpace(c.Storage.Type)
+	if storageType == "" {
+		storageType = "pvc"
+	}
+	if storageType != "pvc" {
+		return &ConfigError{Field: "storage.type", Message: fmt.Sprintf("unsupported storage type %q; only pvc is implemented today", storageType)}
+	}
+	if strings.TrimSpace(c.Storage.BasePath) == "" {
+		return &ConfigError{Field: "storage.basePath", Message: "storage.basePath is required"}
+	}
 	if c.CRIU.TcpClose && c.CRIU.TcpEstablished {
 		return &ConfigError{
 			Field:   "criu",
@@ -36,6 +48,12 @@ func (c *AgentConfig) Validate() error {
 	return c.Restore.Validate()
 }

+// StorageSpec holds snapshot storage settings that are local to the agent deployment.
+type StorageSpec struct {
+	Type     string `yaml:"type"`     // storage backend; AgentConfig.Validate treats blank as "pvc" and rejects any other value
+	BasePath string `yaml:"basePath"` // must be non-blank, otherwise AgentConfig.Validate returns a ConfigError
+}
+
 // RestoreSpec holds settings for the CRIU restore process.
 type RestoreSpec struct {
 	NSRestorePath              string `yaml:"nsRestorePath"`

--- a/deploy/snapshot/pkg/types/inspect.go
+++ b/deploy/snapshot/pkg/types/inspect.go
--- a/deploy/snapshot/pkg/types/manifest.go
+++ b/deploy/snapshot/pkg/types/manifest.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"strings"
 	"time"

 	criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
@@ -15,8 +16,8 @@ const manifestFilename = "manifest.yaml"

 // CheckpointManifest is saved as manifest.yaml at checkpoint time and loaded at restore.
 type CheckpointManifest struct {
-	CheckpointHash string    `yaml:"checkpointHash"`
-	CreatedAt      time.Time `yaml:"createdAt"`
+	CheckpointID string    `yaml:"checkpointId"`
+	CreatedAt    time.Time `yaml:"createdAt"`

 	CRIUDump CRIUDumpManifest  `yaml:"criuDump"`
 	K8s      SourcePodManifest `yaml:"k8s"`
@@ -25,17 +26,17 @@ type CheckpointManifest struct {
 }

 func NewCheckpointManifest(
-	checkpointHash string,
+	checkpointID string,
 	criuDump CRIUDumpManifest,
 	k8s SourcePodManifest,
 	overlay OverlayManifest,
 ) *CheckpointManifest {
 	return &CheckpointManifest{
-		CheckpointHash: checkpointHash,
-		CreatedAt:      time.Now().UTC(),
-		CRIUDump:       criuDump,
-		K8s:            k8s,
-		Overlay:        overlay,
+		CheckpointID: checkpointID,
+		CreatedAt:    time.Now().UTC(),
+		CRIUDump:     criuDump,
+		K8s:          k8s,
+		Overlay:      overlay,
 	}
 }

@@ -140,6 +141,13 @@ func (m CUDAManifest) IsEmpty() bool {

 // WriteManifest writes a checkpoint manifest file in the checkpoint directory.
 func WriteManifest(checkpointDir string, data *CheckpointManifest) error {
+	if data == nil {
+		return fmt.Errorf("checkpoint manifest is required")
+	}
+	if strings.TrimSpace(data.CheckpointID) == "" {
+		return fmt.Errorf("checkpoint manifest is missing checkpointId")
+	}
+
 	content, err := yaml.Marshal(data)
 	if err != nil {
 		return fmt.Errorf("failed to marshal checkpoint manifest: %w", err)
@@ -166,6 +174,9 @@ func ReadManifest(checkpointDir string) (*CheckpointManifest, error) {
 	if err := yaml.Unmarshal(content, &data); err != nil {
 		return nil, fmt.Errorf("failed to unmarshal checkpoint manifest: %w", err)
 	}
+	if strings.TrimSpace(data.CheckpointID) == "" {
+		return nil, fmt.Errorf("checkpoint manifest is missing checkpointId")
+	}

 	return &data, nil
 }
--- a/deploy/snapshot/pkg/types/manifest_test.go
+++ b/deploy/snapshot/pkg/types/manifest_test.go
 package types

 import (
+	"os"
+	"path/filepath"
 	"testing"

 	criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
@@ -42,8 +44,8 @@ func TestManifestRoundTrip(t *testing.T) {
 	}

 	// Verify key fields survived the round-trip
-	if loaded.CheckpointHash != original.CheckpointHash {
-		t.Errorf("CheckpointHash = %q, want %q", loaded.CheckpointHash, original.CheckpointHash)
+	if loaded.CheckpointID != original.CheckpointID {
+		t.Errorf("CheckpointID = %q, want %q", loaded.CheckpointID, original.CheckpointID)
 	}
 	if loaded.CRIUDump.CRIU.LogLevel != 4 {
 		t.Errorf("CRIU.LogLevel = %d, want 4", loaded.CRIUDump.CRIU.LogLevel)
@@ -138,3 +140,26 @@ func TestNewCRIUDumpManifest(t *testing.T) {
 		}
 	})
 }
+
+// TestWriteManifestRejectsMissingCheckpointID expects WriteManifest to refuse a manifest with no CheckpointID.
+func TestWriteManifestRejectsMissingCheckpointID(t *testing.T) {
+	const want = "checkpoint manifest is missing checkpointId"
+	err := WriteManifest(t.TempDir(), &CheckpointManifest{})
+	if err == nil || err.Error() != want {
+		t.Fatalf("expected missing checkpointId error, got %v", err)
+	}
+}
+
+// TestReadManifestRejectsMissingCheckpointID expects ReadManifest to reject a manifest file that has no checkpointId.
+func TestReadManifestRejectsMissingCheckpointID(t *testing.T) {
+	const want = "checkpoint manifest is missing checkpointId"
+	dir := t.TempDir()
+	manifestPath := filepath.Join(dir, manifestFilename)
+	if err := os.WriteFile(manifestPath, []byte("createdAt: 2026-03-31T00:00:00Z\n"), 0o600); err != nil {
+		t.Fatalf("WriteFile: %v", err)
+	}
+
+	if _, err := ReadManifest(dir); err == nil || err.Error() != want {
+		t.Fatalf("expected missing checkpointId error, got %v", err)
+	}
+}
--- a/deploy/snapshot/protocol/checkpoint.go
+++ b/deploy/snapshot/protocol/checkpoint.go
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package protocol
+
+import (
+	"fmt"
+
+	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
+)
+
+// CheckpointJobOptions carries the settings NewCheckpointJob applies when it
+// turns a pod template into a checkpoint Job.
+type CheckpointJobOptions struct {
+	Namespace             string // namespace set on the generated Job
+	CheckpointID          string // written to the Job's CheckpointIDLabel and passed on to the pod template metadata
+	ArtifactVersion       string // passed to applyCheckpointSourceMetadata for the template's artifact version annotation
+	SeccompProfile        string // when non-empty, installed as the pod-level Localhost seccomp profile
+	Name                  string // name set on the generated Job
+	ActiveDeadlineSeconds *int64 // copied to JobSpec.ActiveDeadlineSeconds as-is (may be nil)
+	TTLSecondsAfterFinish *int32 // copied to JobSpec.TTLSecondsAfterFinished as-is (may be nil)
+	WrapLaunchJob         bool   // when true, the first container is rewritten to run under "cuda-checkpoint --launch-job"
+}
+
+// NewCheckpointJob builds a batch/v1 Job that runs podTemplate as a checkpoint
+// source.
+//
+// The template is deep-copied before any change, so the caller's value is left
+// untouched. On the copy, checkpoint-source labels and annotations are applied,
+// the restart policy is forced to Never, the Localhost seccomp profile is
+// installed when opts.SeccompProfile is non-empty, and, when
+// opts.WrapLaunchJob is set, the first container's command and args are
+// rewritten to run under "cuda-checkpoint --launch-job". The Job itself never
+// retries (BackoffLimit is 0).
+//
+// An error is returned when podTemplate is nil, or when opts.WrapLaunchJob is
+// set and the template has no containers or its first container has no command.
+func NewCheckpointJob(podTemplate *corev1.PodTemplateSpec, opts CheckpointJobOptions) (*batchv1.Job, error) {
+	// Fail with an error instead of panicking on the field accesses below.
+	if podTemplate == nil {
+		return nil, fmt.Errorf("checkpoint job requires a pod template")
+	}
+
+	podTemplate = podTemplate.DeepCopy()
+	if podTemplate.Labels == nil {
+		podTemplate.Labels = map[string]string{}
+	}
+	if podTemplate.Annotations == nil {
+		podTemplate.Annotations = map[string]string{}
+	}
+	applyCheckpointSourceMetadata(podTemplate.Labels, podTemplate.Annotations, opts.CheckpointID, opts.ArtifactVersion)
+	podTemplate.Spec.RestartPolicy = corev1.RestartPolicyNever
+	if opts.SeccompProfile != "" {
+		EnsureLocalhostSeccompProfile(&podTemplate.Spec, opts.SeccompProfile)
+	}
+	if opts.WrapLaunchJob {
+		if len(podTemplate.Spec.Containers) == 0 {
+			return nil, fmt.Errorf("checkpoint job requires one worker container")
+		}
+		// Only the first container is treated as the worker to wrap.
+		worker := &podTemplate.Spec.Containers[0]
+		if len(worker.Command) == 0 {
+			return nil, fmt.Errorf("checkpoint job requires container.command when cuda-checkpoint launch-job wrapping is enabled")
+		}
+		worker.Command, worker.Args = wrapWithCudaCheckpointLaunchJob(worker.Command, worker.Args)
+	}
+
+	return &batchv1.Job{
+		TypeMeta: metav1.TypeMeta{APIVersion: "batch/v1", Kind: "Job"},
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      opts.Name,
+			Namespace: opts.Namespace,
+			Labels: map[string]string{
+				CheckpointIDLabel: opts.CheckpointID,
+			},
+		},
+		Spec: batchv1.JobSpec{
+			ActiveDeadlineSeconds:   opts.ActiveDeadlineSeconds,
+			BackoffLimit:            ptr.To[int32](0),
+			TTLSecondsAfterFinished: opts.TTLSecondsAfterFinish,
+			Template:                *podTemplate,
+		},
+	}, nil
+}
+
+// EnsureLocalhostSeccompProfile points the pod-level seccomp profile at the
+// given Localhost profile, creating the pod security context first when it is
+// missing. A seccomp profile already present on the security context is
+// replaced.
+func EnsureLocalhostSeccompProfile(podSpec *corev1.PodSpec, profile string) {
+	securityContext := podSpec.SecurityContext
+	if securityContext == nil {
+		securityContext = &corev1.PodSecurityContext{}
+		podSpec.SecurityContext = securityContext
+	}
+
+	localhost := corev1.SeccompProfile{Type: corev1.SeccompProfileTypeLocalhost}
+	localhost.LocalhostProfile = &profile
+	securityContext.SeccompProfile = &localhost
+}
+
+// wrapWithCudaCheckpointLaunchJob rewrites a container entrypoint so it runs
+// under cuda-checkpoint: the returned command is just "cuda-checkpoint" and the
+// returned args are "--launch-job" followed by the original command and args.
+// The input slices are not modified.
+func wrapWithCudaCheckpointLaunchJob(command []string, args []string) ([]string, []string) {
+	launchArgs := append(make([]string, 0, 1+len(command)+len(args)), "--launch-job")
+	for _, original := range [][]string{command, args} {
+		launchArgs = append(launchArgs, original...)
+	}
+	return []string{"cuda-checkpoint"}, launchArgs
+}
--- a/deploy/snapshot/protocol/checkpoint_job_test.go
+++ b/deploy/snapshot/protocol/checkpoint_job_test.go
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package protocol
+
+import (
+	"testing"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
+)
+
+// TestNewCheckpointJob builds a checkpoint Job from a single-container template
+// with launch-job wrapping enabled and checks the Job identity, the template
+// metadata, the wrapped entrypoint and the Job-level limits.
+func TestNewCheckpointJob(t *testing.T) {
+	source := &corev1.PodTemplateSpec{
+		ObjectMeta: metav1.ObjectMeta{
+			Labels:      map[string]string{"existing": "label"},
+			Annotations: map[string]string{"existing": "annotation"},
+		},
+		Spec: corev1.PodSpec{
+			RestartPolicy: corev1.RestartPolicyAlways,
+			Containers: []corev1.Container{{
+				Name:    "main",
+				Image:   "test:latest",
+				Command: []string{"python3", "-m", "dynamo.vllm"},
+				Args:    []string{"--model", "Qwen"},
+			}},
+		},
+	}
+	opts := CheckpointJobOptions{
+		Namespace:             "test-ns",
+		CheckpointID:          "hash",
+		ArtifactVersion:       "2",
+		SeccompProfile:        DefaultSeccompLocalhostProfile,
+		Name:                  "test-job",
+		ActiveDeadlineSeconds: ptr.To(int64(60)),
+		TTLSecondsAfterFinish: ptr.To(int32(300)),
+		WrapLaunchJob:         true,
+	}
+
+	job, err := NewCheckpointJob(source, opts)
+	if err != nil {
+		t.Fatalf("expected checkpoint job, got error: %v", err)
+	}
+
+	// Job identity and labels.
+	if job.Name != "test-job" || job.Namespace != "test-ns" {
+		t.Fatalf("unexpected job identity: %#v", job.ObjectMeta)
+	}
+	if job.Labels[CheckpointIDLabel] != "hash" {
+		t.Fatalf("expected checkpoint hash label on job: %#v", job.Labels)
+	}
+
+	// Pod template metadata and pod-level settings.
+	template := job.Spec.Template
+	if template.Labels[CheckpointSourceLabel] != "true" {
+		t.Fatalf("expected checkpoint source label on template: %#v", template.Labels)
+	}
+	if template.Annotations[CheckpointArtifactVersionAnnotation] != "2" {
+		t.Fatalf("expected checkpoint artifact version annotation on template: %#v", template.Annotations)
+	}
+	if len(template.Spec.Volumes) != 0 {
+		t.Fatalf("expected no checkpoint volume, got %#v", template.Spec.Volumes)
+	}
+	worker := template.Spec.Containers[0]
+	if len(worker.VolumeMounts) != 0 {
+		t.Fatalf("expected no checkpoint volume mount, got %#v", worker.VolumeMounts)
+	}
+	if template.Spec.RestartPolicy != corev1.RestartPolicyNever {
+		t.Fatalf("expected restartPolicy Never, got %#v", template.Spec.RestartPolicy)
+	}
+	if sc := template.Spec.SecurityContext; sc == nil || sc.SeccompProfile == nil {
+		t.Fatalf("expected seccomp profile to be injected: %#v", sc)
+	}
+
+	// Wrapped entrypoint.
+	if len(worker.Command) != 1 || worker.Command[0] != "cuda-checkpoint" {
+		t.Fatalf("expected cuda-checkpoint wrapper command: %#v", worker.Command)
+	}
+	expectedArgs := []string{"--launch-job", "python3", "-m", "dynamo.vllm", "--model", "Qwen"}
+	if len(worker.Args) != len(expectedArgs) {
+		t.Fatalf("expected launch-job args %#v, got %#v", expectedArgs, worker.Args)
+	}
+	for i, want := range expectedArgs {
+		if worker.Args[i] != want {
+			t.Fatalf("expected launch-job args %#v, got %#v", expectedArgs, worker.Args)
+		}
+	}
+
+	// Job-level limits.
+	if limit := job.Spec.BackoffLimit; limit == nil || *limit != 0 {
+		t.Fatalf("expected backoffLimit 0, got %#v", limit)
+	}
+	if deadline := job.Spec.ActiveDeadlineSeconds; deadline == nil || *deadline != 60 {
+		t.Fatalf("unexpected activeDeadlineSeconds: %#v", deadline)
+	}
+	if ttl := job.Spec.TTLSecondsAfterFinished; ttl == nil || *ttl != 300 {
+		t.Fatalf("unexpected ttlSecondsAfterFinished: %#v", ttl)
+	}
+}
--- a/deploy/snapshot/protocol/common.go
+++ b/deploy/snapshot/protocol/common.go
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package protocol
+
+import (
+	"fmt"
+	"strings"
+)
+
+// Shared identifiers for the snapshot checkpoint/restore protocol.
+const (
+	// Label and annotation keys, well-known names, and default values.
+	CheckpointSourceLabel               = "nvidia.com/snapshot-is-checkpoint-source"
+	CheckpointIDLabel                   = "nvidia.com/snapshot-checkpoint-id"
+	RestoreTargetLabel                  = "nvidia.com/snapshot-is-restore-target"
+	CheckpointArtifactVersionAnnotation = "nvidia.com/snapshot-artifact-version"
+	CheckpointStatusAnnotation          = "nvidia.com/snapshot-checkpoint-status"
+	RestoreStatusAnnotation             = "nvidia.com/snapshot-restore-status"
+	RestoreContainerIDAnnotation        = "nvidia.com/snapshot-restore-container-id"
+	CheckpointVolumeName                = "checkpoint-storage"
+	DefaultCheckpointArtifactVersion    = "1"
+	DefaultCheckpointJobTTLSeconds      = int32(300)
+	DefaultSeccompLocalhostProfile      = "profiles/block-iouring.json"
+	StorageTypePVC                      = "pvc"
+
+	// Status strings. NOTE(review): presumably the values written to
+	// CheckpointStatusAnnotation and RestoreStatusAnnotation — no writer is
+	// visible in this file; confirm against the agent code.
+	CheckpointStatusCompleted = "completed"
+	CheckpointStatusFailed    = "failed"
+	RestoreStatusInProgress   = "in_progress"
+	RestoreStatusCompleted    = "completed"
+	RestoreStatusFailed       = "failed"
+)
+
+// Storage describes where checkpoint artifacts live. The Resolve* functions in
+// this file take a partially filled value and return a normalized copy.
+type Storage struct {
+	Type     string // storage backend; resolveStorageConfig treats blank as StorageTypePVC and rejects anything else
+	Location string // set by ResolveCheckpointStorage / ResolveRestoreStorage; not copied from the input
+	PVCName  string // PVC claim name; whitespace is trimmed by resolveStorageConfig
+	BasePath string // required; resolveStorageConfig trims whitespace and trailing slashes
+}
+
+// ArtifactVersion normalizes an artifact version string: surrounding
+// whitespace is dropped, and a blank value falls back to
+// DefaultCheckpointArtifactVersion.
+func ArtifactVersion(version string) string {
+	if trimmed := strings.TrimSpace(version); trimmed != "" {
+		return trimmed
+	}
+	return DefaultCheckpointArtifactVersion
+}
+
+// ResolveCheckpointStorage validates storage and returns a normalized copy
+// whose Location is "<base path>/<checkpointID>/versions/<artifact version>".
+// The version goes through ArtifactVersion, so a blank version uses the
+// default. checkpointID is inserted as given. Errors from the storage
+// validation are returned unchanged.
+func ResolveCheckpointStorage(checkpointID string, version string, storage Storage) (Storage, error) {
+	normalized, err := resolveStorageConfig(storage)
+	if err != nil {
+		return Storage{}, err
+	}
+
+	root := strings.TrimRight(normalized.BasePath, "/")
+	artifact := ArtifactVersion(version)
+	normalized.Location = root + "/" + checkpointID + "/versions/" + artifact
+	return normalized, nil
+}
+
+// ResolveRestoreStorage validates storage and returns a normalized copy for a
+// restore. A non-blank location (after trimming whitespace) is used as the
+// Location verbatim; otherwise the Location is derived from checkpointID and
+// version exactly as ResolveCheckpointStorage does.
+func ResolveRestoreStorage(checkpointID string, version string, location string, storage Storage) (Storage, error) {
+	normalized, err := resolveStorageConfig(storage)
+	if err != nil {
+		return Storage{}, err
+	}
+
+	if explicit := strings.TrimSpace(location); explicit != "" {
+		normalized.Location = explicit
+		return normalized, nil
+	}
+	return ResolveCheckpointStorage(checkpointID, version, storage)
+}
+
+// ApplyRestoreTargetMetadata resets the snapshot labels and annotations on a
+// workload and, when enabled is true, marks it as a restore target.
+//
+// All snapshot-owned keys (source/target/ID labels; artifact version, status
+// and container-ID annotations) are removed first. With enabled set, the
+// restore target label is written, the checkpoint ID label is written when
+// checkpointID is non-empty, and the artifact version annotation is set to
+// ArtifactVersion(artifactVersion). Both maps are modified in place and must be
+// non-nil when enabled is true.
+func ApplyRestoreTargetMetadata(labels map[string]string, annotations map[string]string, enabled bool, checkpointID string, artifactVersion string) {
+	for _, key := range []string{CheckpointSourceLabel, RestoreTargetLabel, CheckpointIDLabel} {
+		delete(labels, key)
+	}
+	for _, key := range []string{
+		CheckpointArtifactVersionAnnotation,
+		CheckpointStatusAnnotation,
+		RestoreStatusAnnotation,
+		RestoreContainerIDAnnotation,
+	} {
+		delete(annotations, key)
+	}
+
+	if enabled {
+		labels[RestoreTargetLabel] = "true"
+		if checkpointID != "" {
+			labels[CheckpointIDLabel] = checkpointID
+		}
+		annotations[CheckpointArtifactVersionAnnotation] = ArtifactVersion(artifactVersion)
+	}
+}
+
+// applyCheckpointSourceMetadata marks a workload as a checkpoint source. It
+// drops the restore target label and any previous checkpoint ID label and
+// artifact version annotation, then writes the checkpoint source label, the
+// checkpoint ID label (only when checkpointID is non-empty) and the normalized
+// artifact version annotation. Both maps are modified in place and must be
+// non-nil.
+func applyCheckpointSourceMetadata(labels map[string]string, annotations map[string]string, checkpointID string, artifactVersion string) {
+	for _, stale := range []string{RestoreTargetLabel, CheckpointIDLabel} {
+		delete(labels, stale)
+	}
+	delete(annotations, CheckpointArtifactVersionAnnotation)
+
+	labels[CheckpointSourceLabel] = "true"
+	if id := checkpointID; id != "" {
+		labels[CheckpointIDLabel] = id
+	}
+	annotations[CheckpointArtifactVersionAnnotation] = ArtifactVersion(artifactVersion)
+}
+
+// resolveStorageConfig validates a Storage value and returns a normalized
+// copy: a blank type becomes StorageTypePVC, any other type is rejected, the
+// base path must be non-blank, and the PVC name and base path are trimmed
+// (the base path also loses trailing slashes). Location is not carried over.
+func resolveStorageConfig(storage Storage) (Storage, error) {
+	kind := strings.TrimSpace(storage.Type)
+	switch kind {
+	case "":
+		kind = StorageTypePVC
+	case StorageTypePVC:
+		// Supported as-is.
+	default:
+		return Storage{}, fmt.Errorf("checkpoint storage type %q is not supported", kind)
+	}
+
+	root := strings.TrimSpace(storage.BasePath)
+	if root == "" {
+		return Storage{}, fmt.Errorf("checkpoint base path is required")
+	}
+
+	normalized := Storage{Type: kind}
+	normalized.PVCName = strings.TrimSpace(storage.PVCName)
+	normalized.BasePath = strings.TrimRight(root, "/")
+	return normalized, nil
+}
--- a/deploy/snapshot/protocol/constants_test.go
+++ b/deploy/snapshot/protocol/constants_test.go
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package protocol
+
+import "testing"
+
+// TestApplyRestoreTargetMetadata starts from checkpoint-source metadata with
+// stale status annotations and checks that enabling restore writes the restore
+// markers and clears everything left over.
+func TestApplyRestoreTargetMetadata(t *testing.T) {
+	labels := map[string]string{
+		CheckpointSourceLabel: "true",
+		CheckpointIDLabel:     "old",
+	}
+	annotations := map[string]string{
+		CheckpointArtifactVersionAnnotation: "old",
+		CheckpointStatusAnnotation:          "completed",
+		RestoreStatusAnnotation:             "failed",
+		RestoreContainerIDAnnotation:        "dead-container",
+	}
+
+	ApplyRestoreTargetMetadata(labels, annotations, true, "hash", "2")
+
+	if got := labels[RestoreTargetLabel]; got != "true" {
+		t.Fatalf("expected restore target label, got %#v", labels)
+	}
+	if got := labels[CheckpointIDLabel]; got != "hash" {
+		t.Fatalf("expected checkpoint hash label, got %#v", labels)
+	}
+	if _, present := labels[CheckpointSourceLabel]; present {
+		t.Fatalf("checkpoint source label was not cleared: %#v", labels)
+	}
+	if got := annotations[CheckpointArtifactVersionAnnotation]; got != "2" {
+		t.Fatalf("expected checkpoint artifact version annotation, got %#v", annotations)
+	}
+
+	// Stale annotations, checked in the same order as they are listed here.
+	stale := []struct {
+		key    string
+		format string
+	}{
+		{CheckpointStatusAnnotation, "checkpoint status annotation was not cleared: %#v"},
+		{RestoreStatusAnnotation, "restore status annotation was not cleared: %#v"},
+		{RestoreContainerIDAnnotation, "restore container id annotation was not cleared: %#v"},
+	}
+	for _, c := range stale {
+		if _, present := annotations[c.key]; present {
+			t.Fatalf(c.format, annotations)
+		}
+	}
+}
+
+// TestApplyRestoreTargetMetadataDisabledClearsState checks that calling
+// ApplyRestoreTargetMetadata with enabled=false removes every snapshot label
+// and annotation and writes nothing back.
+func TestApplyRestoreTargetMetadataDisabledClearsState(t *testing.T) {
+	labels := map[string]string{
+		RestoreTargetLabel: "true",
+		CheckpointIDLabel:  "hash",
+	}
+	annotations := map[string]string{
+		CheckpointArtifactVersionAnnotation: "2",
+		CheckpointStatusAnnotation:          "completed",
+		RestoreStatusAnnotation:             "failed",
+		RestoreContainerIDAnnotation:        "dead-container",
+	}
+
+	ApplyRestoreTargetMetadata(labels, annotations, false, "", "")
+
+	// Each entry names the map to inspect, the key that must be gone, and the
+	// failure message to report when it is still there.
+	mustBeGone := []struct {
+		in     map[string]string
+		key    string
+		format string
+	}{
+		{labels, RestoreTargetLabel, "restore target label was not cleared: %#v"},
+		{labels, CheckpointIDLabel, "checkpoint hash label was not cleared: %#v"},
+		{annotations, CheckpointArtifactVersionAnnotation, "checkpoint artifact version annotation was not cleared: %#v"},
+		{annotations, CheckpointStatusAnnotation, "checkpoint status annotation was not cleared: %#v"},
+		{annotations, RestoreStatusAnnotation, "restore status annotation was not cleared: %#v"},
+		{annotations, RestoreContainerIDAnnotation, "restore container id annotation was not cleared: %#v"},
+	}
+	for _, c := range mustBeGone {
+		if _, present := c.in[c.key]; present {
+			t.Fatalf(c.format, c.in)
+		}
+	}
+}
--- a/deploy/snapshot/protocol/restore.go
+++ b/deploy/snapshot/protocol/restore.go
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package protocol
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// Identifiers used to find the snapshot-agent DaemonSet and the checkpoint
+// volume it mounts.
+const (
+	SnapshotAgentLabelKey      = "app.kubernetes.io/component" // label key matched when listing agent DaemonSets
+	SnapshotAgentLabelValue    = "snapshot-agent" // label value matched when listing agent DaemonSets
+	SnapshotAgentContainerName = "agent" // container whose volume mounts DiscoverStorageFromDaemonSets reads
+	SnapshotAgentVolumeName    = "checkpoints" // DaemonSet volume that must be PVC-backed for discovery to succeed
+	SnapshotAgentLabelSelector = SnapshotAgentLabelKey + "=" + SnapshotAgentLabelValue // "key=value" string form of the label match
+)
+
+// PodOptions carries the settings NewRestorePod applies to the pod it copies.
+type PodOptions struct {
+	Namespace       string // namespace set on the returned pod
+	CheckpointID    string // written to the checkpoint ID label when non-empty
+	ArtifactVersion string // normalized via ArtifactVersion and written as the artifact version annotation
+	Storage         Storage // PVCName and BasePath drive the injected checkpoint volume and mount
+	SeccompProfile  string // Localhost seccomp profile passed to PrepareRestorePodSpec
+}
+
+// NewRestorePod returns a deep copy of pod prepared as a restore target: the
+// restore metadata is applied, the first container is prepared through
+// PrepareRestorePodSpec with isCheckpointReady=true, the namespace is set from
+// opts, and the restart policy is forced to Never. The input pod is not
+// modified.
+//
+// pod is dereferenced without a nil check and its first container is indexed
+// directly, so callers must pass a pod with at least one container.
+func NewRestorePod(pod *corev1.Pod, opts PodOptions) *corev1.Pod {
+	restored := pod.DeepCopy()
+	if restored.Labels == nil {
+		restored.Labels = make(map[string]string)
+	}
+	if restored.Annotations == nil {
+		restored.Annotations = make(map[string]string)
+	}
+	ApplyRestoreTargetMetadata(restored.Labels, restored.Annotations, true, opts.CheckpointID, opts.ArtifactVersion)
+
+	workload := &restored.Spec.Containers[0]
+	PrepareRestorePodSpec(&restored.Spec, workload, opts.Storage, opts.SeccompProfile, true)
+
+	restored.Namespace = opts.Namespace
+	restored.Spec.RestartPolicy = corev1.RestartPolicyNever
+	return restored
+}
+
+// PrepareRestorePodSpec mutates podSpec and container in place so the pod can
+// host a restored checkpoint.
+//
+//   - seccompProfile, when non-empty, is installed as the pod-level Localhost
+//     seccomp profile. An empty value leaves the security context untouched,
+//     matching NewCheckpointJob and ValidateRestorePodSpec, which both treat
+//     "" as "no profile requested"; a Localhost profile with an empty path is
+//     never written.
+//   - storage.PVCName, when non-empty, adds the checkpoint volume unless a
+//     volume with that name already exists.
+//   - storage.BasePath, when non-empty, adds the checkpoint volume mount unless
+//     the container already has a mount with that name.
+//   - isCheckpointReady replaces the container entrypoint with the
+//     "sleep infinity" placeholder and removes its args and all three probes.
+//
+// Calling it repeatedly with the same arguments produces the same result.
+func PrepareRestorePodSpec(
+	podSpec *corev1.PodSpec,
+	container *corev1.Container,
+	storage Storage,
+	seccompProfile string,
+	isCheckpointReady bool,
+) {
+	if seccompProfile != "" {
+		EnsureLocalhostSeccompProfile(podSpec, seccompProfile)
+	}
+	if storage.PVCName != "" {
+		injectCheckpointVolume(podSpec, storage.PVCName)
+	}
+	if storage.BasePath != "" {
+		injectCheckpointVolumeMount(container, storage.BasePath)
+	}
+	if isCheckpointReady {
+		// The container only needs to stay alive; the original entrypoint
+		// and its probes do not apply to the placeholder process.
+		container.Command = []string{"sleep", "infinity"}
+		container.Args = nil
+		container.StartupProbe = nil
+		container.LivenessProbe = nil
+		container.ReadinessProbe = nil
+	}
+}
+
+// ValidateRestorePodSpec checks that podSpec already has the shape
+// PrepareRestorePodSpec produces. It requires exactly one container; when
+// storage.PVCName is set, a checkpoint volume backed by that claim; when
+// storage.BasePath is set, a checkpoint mount at that path on the container;
+// and, when seccompProfile is non-empty, a pod-level Localhost seccomp profile
+// with exactly that value. The first failed requirement is returned as an
+// error; nil means the spec is valid.
+func ValidateRestorePodSpec(
+	podSpec *corev1.PodSpec,
+	storage Storage,
+	seccompProfile string,
+) error {
+	if podSpec == nil {
+		return fmt.Errorf("pod spec is nil")
+	}
+	if count := len(podSpec.Containers); count != 1 {
+		return fmt.Errorf("restore target must have exactly one container, got %d", count)
+	}
+
+	if claim := storage.PVCName; claim != "" {
+		found := false
+		for i := range podSpec.Volumes {
+			candidate := &podSpec.Volumes[i]
+			if candidate.Name != CheckpointVolumeName || candidate.PersistentVolumeClaim == nil {
+				continue
+			}
+			if candidate.PersistentVolumeClaim.ClaimName == claim {
+				found = true
+				break
+			}
+		}
+		if !found {
+			return fmt.Errorf("missing %s volume for PVC %s", CheckpointVolumeName, claim)
+		}
+	}
+
+	if mountPath := storage.BasePath; mountPath != "" {
+		found := false
+		for _, candidate := range podSpec.Containers[0].VolumeMounts {
+			if candidate.Name == CheckpointVolumeName && candidate.MountPath == mountPath {
+				found = true
+				break
+			}
+		}
+		if !found {
+			return fmt.Errorf("missing %s mount at %s", CheckpointVolumeName, mountPath)
+		}
+	}
+
+	// An empty profile means no seccomp requirement.
+	if seccompProfile == "" {
+		return nil
+	}
+	securityContext := podSpec.SecurityContext
+	if securityContext == nil || securityContext.SeccompProfile == nil {
+		return fmt.Errorf("missing localhost seccomp profile")
+	}
+	actual := securityContext.SeccompProfile
+	matches := actual.Type == corev1.SeccompProfileTypeLocalhost &&
+		actual.LocalhostProfile != nil &&
+		*actual.LocalhostProfile == seccompProfile
+	if !matches {
+		return fmt.Errorf("expected localhost seccomp profile %q", seccompProfile)
+	}
+	return nil
+}
+
+// DiscoverStorageFromDaemonSets derives the checkpoint Storage from the
+// snapshot-agent DaemonSets of a namespace.
+//
+// DaemonSets are examined in order. For each one, the volume mounts of the
+// container named SnapshotAgentContainerName are collected, and the first
+// volume named SnapshotAgentVolumeName that is PVC-backed, has a non-blank
+// claim name, and is mounted by the agent at a non-empty path wins: its claim
+// name and mount path (without trailing slashes) become PVCName and BasePath.
+// An error is returned when daemonSets is empty or when no DaemonSet has such
+// a volume; namespace is only used in the error messages.
+func DiscoverStorageFromDaemonSets(namespace string, daemonSets []appsv1.DaemonSet) (Storage, error) {
+	if len(daemonSets) == 0 {
+		return Storage{}, fmt.Errorf("no snapshot-agent daemonset found in namespace %s", namespace)
+	}
+
+	inspected := make([]string, 0, len(daemonSets))
+	for i := range daemonSets {
+		agentSet := &daemonSets[i]
+		inspected = append(inspected, agentSet.Name)
+		podSpec := &agentSet.Spec.Template.Spec
+
+		// Volume name -> mount path, taken from the agent container only.
+		agentMounts := map[string]string{}
+		for c := range podSpec.Containers {
+			agent := &podSpec.Containers[c]
+			if agent.Name != SnapshotAgentContainerName {
+				continue
+			}
+			for _, mount := range agent.VolumeMounts {
+				if strings.TrimSpace(mount.MountPath) != "" {
+					agentMounts[mount.Name] = strings.TrimRight(mount.MountPath, "/")
+				}
+			}
+		}
+
+		for v := range podSpec.Volumes {
+			candidate := &podSpec.Volumes[v]
+			if candidate.Name != SnapshotAgentVolumeName || candidate.PersistentVolumeClaim == nil {
+				continue
+			}
+
+			// A volume the agent does not mount reads back as "" here.
+			mountPath := agentMounts[candidate.Name]
+			if mountPath == "" {
+				continue
+			}
+			claim := strings.TrimSpace(candidate.PersistentVolumeClaim.ClaimName)
+			if claim == "" {
+				continue
+			}
+
+			return Storage{
+				Type:     StorageTypePVC,
+				PVCName:  claim,
+				BasePath: mountPath,
+			}, nil
+		}
+	}
+
+	return Storage{}, fmt.Errorf(
+		"snapshot-agent daemonset in %s does not mount a PVC-backed checkpoint volume (%s)",
+		namespace,
+		strings.Join(inspected, ", "),
+	)
+}
+
+// PrepareRestorePodSpecForCheckpoint discovers the checkpoint storage from the
+// snapshot-agent DaemonSets in namespace and then prepares podSpec and
+// container for restore via PrepareRestorePodSpec.
+//
+// It lists DaemonSets labeled SnapshotAgentLabelKey=SnapshotAgentLabelValue
+// through reader, derives the storage with DiscoverStorageFromDaemonSets, and
+// resolves it for checkpointID and artifactVersion. An error is returned when
+// reader is nil, the list call fails, or discovery or resolution fails; in
+// those cases podSpec and container are left untouched.
+func PrepareRestorePodSpecForCheckpoint(
+	ctx context.Context,
+	reader ctrlclient.Reader,
+	namespace string,
+	podSpec *corev1.PodSpec,
+	container *corev1.Container,
+	checkpointID string,
+	artifactVersion string,
+	seccompProfile string,
+	isCheckpointReady bool,
+) error {
+	if reader == nil {
+		return fmt.Errorf("snapshot client is required")
+	}
+
+	var agents appsv1.DaemonSetList
+	listOpts := []ctrlclient.ListOption{
+		ctrlclient.InNamespace(namespace),
+		ctrlclient.MatchingLabels{SnapshotAgentLabelKey: SnapshotAgentLabelValue},
+	}
+	if err := reader.List(ctx, &agents, listOpts...); err != nil {
+		return fmt.Errorf("list snapshot-agent daemonsets in %s: %w", namespace, err)
+	}
+
+	discovered, err := DiscoverStorageFromDaemonSets(namespace, agents.Items)
+	if err != nil {
+		return err
+	}
+	resolved, err := ResolveCheckpointStorage(checkpointID, artifactVersion, discovered)
+	if err != nil {
+		return err
+	}
+
+	PrepareRestorePodSpec(podSpec, container, resolved, seccompProfile, isCheckpointReady)
+	return nil
+}
+
+// injectCheckpointVolume appends a PVC-backed volume named
+// CheckpointVolumeName to podSpec. When a volume with that name is already
+// present, whatever its source, podSpec is left unchanged.
+func injectCheckpointVolume(podSpec *corev1.PodSpec, pvcName string) {
+	for i := range podSpec.Volumes {
+		if podSpec.Volumes[i].Name == CheckpointVolumeName {
+			return
+		}
+	}
+
+	checkpointVolume := corev1.Volume{Name: CheckpointVolumeName}
+	checkpointVolume.PersistentVolumeClaim = &corev1.PersistentVolumeClaimVolumeSource{
+		ClaimName: pvcName,
+	}
+	podSpec.Volumes = append(podSpec.Volumes, checkpointVolume)
+}
+
+// injectCheckpointVolumeMount appends a mount of the CheckpointVolumeName
+// volume at basePath to container. When the container already mounts a volume
+// with that name, at any path, it is left unchanged.
+func injectCheckpointVolumeMount(container *corev1.Container, basePath string) {
+	for i := range container.VolumeMounts {
+		if container.VolumeMounts[i].Name == CheckpointVolumeName {
+			return
+		}
+	}
+
+	checkpointMount := corev1.VolumeMount{Name: CheckpointVolumeName, MountPath: basePath}
+	container.VolumeMounts = append(container.VolumeMounts, checkpointMount)
+}
--- a/deploy/snapshot/protocol/restore_test.go
+++ b/deploy/snapshot/protocol/restore_test.go
+package protocol
+
+import (
+	"testing"
+
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// TestNewRestorePod builds a restore pod from a single-container worker pod
+// and checks the identity, restore metadata, placeholder entrypoint, cleared
+// probes, seccomp profile and injected checkpoint volume and mount.
+func TestNewRestorePod(t *testing.T) {
+	source := &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:        "worker",
+			Labels:      map[string]string{"existing": "label"},
+			Annotations: map[string]string{"existing": "annotation"},
+		},
+		Spec: corev1.PodSpec{
+			RestartPolicy: corev1.RestartPolicyAlways,
+			Containers: []corev1.Container{{
+				Name:           "main",
+				Image:          "test:latest",
+				Command:        []string{"python3", "-m", "dynamo.vllm"},
+				Args:           []string{"--model", "Qwen"},
+				ReadinessProbe: &corev1.Probe{},
+				LivenessProbe:  &corev1.Probe{},
+				StartupProbe:   &corev1.Probe{},
+			}},
+		},
+	}
+	opts := PodOptions{
+		Namespace:       "test-ns",
+		CheckpointID:    "hash",
+		ArtifactVersion: "2",
+		Storage: Storage{
+			Type:     StorageTypePVC,
+			PVCName:  "snapshot-pvc",
+			BasePath: "/checkpoints",
+		},
+		SeccompProfile: DefaultSeccompLocalhostProfile,
+	}
+
+	restorePod := NewRestorePod(source, opts)
+
+	// Identity and restore metadata.
+	if restorePod.Name != "worker" || restorePod.Namespace != "test-ns" {
+		t.Fatalf("unexpected restore pod identity: %#v", restorePod.ObjectMeta)
+	}
+	if restorePod.Labels[RestoreTargetLabel] != "true" {
+		t.Fatalf("expected restore target label: %#v", restorePod.Labels)
+	}
+	if restorePod.Labels[CheckpointIDLabel] != "hash" {
+		t.Fatalf("expected checkpoint id label: %#v", restorePod.Labels)
+	}
+	if restorePod.Annotations[CheckpointArtifactVersionAnnotation] != "2" {
+		t.Fatalf("expected checkpoint artifact version annotation: %#v", restorePod.Annotations)
+	}
+	if restorePod.Spec.RestartPolicy != corev1.RestartPolicyNever {
+		t.Fatalf("expected restartPolicy Never, got %#v", restorePod.Spec.RestartPolicy)
+	}
+
+	// Placeholder entrypoint and cleared probes on the workload container.
+	worker := restorePod.Spec.Containers[0]
+	if len(worker.Command) != 2 || worker.Command[0] != "sleep" || worker.Command[1] != "infinity" {
+		t.Fatalf("expected placeholder command, got %#v", worker.Command)
+	}
+	if worker.Args != nil {
+		t.Fatalf("expected restore args to be cleared: %#v", worker.Args)
+	}
+	if worker.ReadinessProbe != nil {
+		t.Fatalf("expected readiness probe to be cleared: %#v", worker.ReadinessProbe)
+	}
+	if worker.LivenessProbe != nil {
+		t.Fatalf("expected liveness probe to be cleared: %#v", worker.LivenessProbe)
+	}
+	if worker.StartupProbe != nil {
+		t.Fatalf("expected startup probe to be cleared: %#v", worker.StartupProbe)
+	}
+
+	// Pod-level seccomp profile and checkpoint storage wiring.
+	if sc := restorePod.Spec.SecurityContext; sc == nil || sc.SeccompProfile == nil {
+		t.Fatalf("expected seccomp profile to be injected: %#v", sc)
+	}
+	if len(restorePod.Spec.Volumes) != 1 {
+		t.Fatalf("expected checkpoint volume, got %#v", restorePod.Spec.Volumes)
+	}
+	if len(worker.VolumeMounts) != 1 {
+		t.Fatalf("expected checkpoint mount, got %#v", worker.VolumeMounts)
+	}
+}
+
+// TestPrepareRestorePodSpec applies PrepareRestorePodSpec twice to the same
+// pod spec and container and checks that the result is what a single call
+// would produce: one volume, one mount, the placeholder entrypoint, and no
+// args or probes.
+func TestPrepareRestorePodSpec(t *testing.T) {
+	var podSpec corev1.PodSpec
+	container := corev1.Container{
+		Command:        []string{"python3", "-m", "dynamo.vllm"},
+		Args:           []string{"--model", "Qwen"},
+		ReadinessProbe: &corev1.Probe{},
+		LivenessProbe:  &corev1.Probe{},
+		StartupProbe:   &corev1.Probe{},
+	}
+	storage := Storage{
+		Type:     StorageTypePVC,
+		PVCName:  "snapshot-pvc",
+		BasePath: "/checkpoints",
+	}
+
+	// The second pass must not add a duplicate volume or mount.
+	for pass := 0; pass < 2; pass++ {
+		PrepareRestorePodSpec(&podSpec, &container, storage, DefaultSeccompLocalhostProfile, true)
+	}
+
+	if sc := podSpec.SecurityContext; sc == nil || sc.SeccompProfile == nil {
+		t.Fatalf("expected seccomp profile to be injected: %#v", sc)
+	}
+	if len(podSpec.Volumes) != 1 {
+		t.Fatalf("expected checkpoint volume, got %#v", podSpec.Volumes)
+	}
+	if len(container.VolumeMounts) != 1 {
+		t.Fatalf("expected checkpoint mount, got %#v", container.VolumeMounts)
+	}
+	if cmd := container.Command; len(cmd) != 2 || cmd[0] != "sleep" || cmd[1] != "infinity" {
+		t.Fatalf("expected placeholder command, got %#v", cmd)
+	}
+	if container.Args != nil {
+		t.Fatalf("expected restore args to be cleared: %#v", container.Args)
+	}
+	probesCleared := container.ReadinessProbe == nil && container.LivenessProbe == nil && container.StartupProbe == nil
+	if !probesCleared {
+		t.Fatalf("expected probes to be cleared: %#v %#v %#v", container.ReadinessProbe, container.LivenessProbe, container.StartupProbe)
+	}
+}
+
+func TestValidateRestorePodSpec(t *testing.T) { // Checks that ValidateRestorePodSpec accepts a complete spec and returns a specific error when the volume, the mount, or the seccomp profile is missing.
+	profile := DefaultSeccompLocalhostProfile // Local copy whose address is taken for LocalhostProfile below.
+	podSpec := &corev1.PodSpec{
+		SecurityContext: &corev1.PodSecurityContext{
+			SeccompProfile: &corev1.SeccompProfile{
+				Type:             corev1.SeccompProfileTypeLocalhost,
+				LocalhostProfile: &profile,
+			},
+		},
+		Volumes: []corev1.Volume{{
+			Name: CheckpointVolumeName,
+			VolumeSource: corev1.VolumeSource{
+				PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
+					ClaimName: "snapshot-pvc",
+				},
+			},
+		}},
+		Containers: []corev1.Container{{
+			Name: "main",
+			VolumeMounts: []corev1.VolumeMount{{
+				Name:      CheckpointVolumeName,
+				MountPath: "/checkpoints",
+			}},
+		}},
+	}
+	storage := Storage{
+		Type:     StorageTypePVC,
+		PVCName:  "snapshot-pvc",
+		BasePath: "/checkpoints",
+	}
+
+	if err := ValidateRestorePodSpec(podSpec, storage, DefaultSeccompLocalhostProfile); err != nil { // Baseline: the fully populated spec must validate.
+		t.Fatalf("expected restore pod spec to be valid, got %v", err)
+	}
+
+	badSpec := podSpec.DeepCopy() // Each negative case mutates a fresh deep copy, so podSpec itself stays valid.
+	badSpec.Volumes = nil
+	if err := ValidateRestorePodSpec(badSpec, storage, DefaultSeccompLocalhostProfile); err == nil || err.Error() != "missing checkpoint-storage volume for PVC snapshot-pvc" { // Compares the exact error text.
+		t.Fatalf("expected missing volume error, got %v", err)
+	}
+
+	badSpec = podSpec.DeepCopy()
+	badSpec.Containers[0].VolumeMounts = nil
+	if err := ValidateRestorePodSpec(badSpec, storage, DefaultSeccompLocalhostProfile); err == nil || err.Error() != "missing checkpoint-storage mount at /checkpoints" { // Compares the exact error text.
+		t.Fatalf("expected missing mount error, got %v", err)
+	}
+
+	badSpec = podSpec.DeepCopy()
+	badSpec.SecurityContext = nil
+	if err := ValidateRestorePodSpec(badSpec, storage, DefaultSeccompLocalhostProfile); err == nil || err.Error() != "missing localhost seccomp profile" { // Compares the exact error text.
+		t.Fatalf("expected missing seccomp error, got %v", err)
+	}
+}
+
+func TestValidateRestorePodSpecRequiresExactlyOneContainer(t *testing.T) { // Checks that ValidateRestorePodSpec rejects a pod spec that has two containers.
+	profile := DefaultSeccompLocalhostProfile // Local copy whose address is taken for LocalhostProfile below.
+	podSpec := &corev1.PodSpec{
+		SecurityContext: &corev1.PodSecurityContext{
+			SeccompProfile: &corev1.SeccompProfile{
+				Type:             corev1.SeccompProfileTypeLocalhost,
+				LocalhostProfile: &profile,
+			},
+		},
+		Volumes: []corev1.Volume{{
+			Name: CheckpointVolumeName,
+			VolumeSource: corev1.VolumeSource{
+				PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
+					ClaimName: "snapshot-pvc",
+				},
+			},
+		}},
+		Containers: []corev1.Container{
+			{
+				Name: "worker",
+				VolumeMounts: []corev1.VolumeMount{{
+					Name:      CheckpointVolumeName,
+					MountPath: "/checkpoints",
+				}},
+			},
+			{Name: "sidecar"}, // Second container; the expected error below reports "got 2".
+		},
+	}
+
+	storage := Storage{
+		Type:     StorageTypePVC,
+		PVCName:  "snapshot-pvc",
+		BasePath: "/checkpoints",
+	}
+
+	if err := ValidateRestorePodSpec(podSpec, storage, DefaultSeccompLocalhostProfile); err == nil || err.Error() != "restore target must have exactly one container, got 2" { // Compares the exact error text.
+		t.Fatalf("expected multi-container restore target to be rejected, got %v", err)
+	}
+}
+
+func TestDiscoverStorageFromDaemonSetsUsesCheckpointsVolume(t *testing.T) { // Checks that DiscoverStorageFromDaemonSets reports the PVC and mount path of the SnapshotAgentVolumeName volume rather than those of the other PVC-backed volume.
+	daemonSet := appsv1.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{Name: "snapshot-agent", Namespace: "test-ns"},
+		Spec: appsv1.DaemonSetSpec{
+			Template: corev1.PodTemplateSpec{
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{{
+						Name: SnapshotAgentContainerName,
+						VolumeMounts: []corev1.VolumeMount{
+							{Name: "cache", MountPath: "/cache"}, // Unrelated mount listed first; the result must not come from it.
+							{Name: SnapshotAgentVolumeName, MountPath: "/checkpoints"},
+						},
+					}},
+					Volumes: []corev1.Volume{
+						{
+							Name: "cache",
+							VolumeSource: corev1.VolumeSource{
+								PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: "cache-pvc"}, // Also PVC-backed, so the volume type alone cannot identify the snapshot volume.
+							},
+						},
+						{
+							Name: SnapshotAgentVolumeName,
+							VolumeSource: corev1.VolumeSource{
+								PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: "snapshot-pvc"},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	storage, err := DiscoverStorageFromDaemonSets("test-ns", []appsv1.DaemonSet{daemonSet})
+	if err != nil {
+		t.Fatalf("expected daemonset storage discovery to succeed, got %v", err)
+	}
+	if storage.PVCName != "snapshot-pvc" || storage.BasePath != "/checkpoints" { // Must match the snapshot volume's claim name and mount path, not cache-pvc or /cache.
+		t.Fatalf("expected snapshot PVC discovery, got %#v", storage)
+	}
+}
--- a/docs/kubernetes/api-reference.md
+++ b/docs/kubernetes/api-reference.md
@@ -265,7 +265,7 @@ _Appears in:_
 | `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm for the checkpoint Job pod.<br />When omitted, checkpoint Jobs use the same default 8Gi tmpfs as Dynamo components. |  | Optional: \{\} <br /> |
 | `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Minimum: 1 <br />Optional: \{\} <br /> |
 | `backoffLimit` _integer_ | Deprecated: BackoffLimit is ignored. Checkpoint Jobs never retry. |  | Minimum: 0 <br />Optional: \{\} <br /> |
-| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Minimum: 0 <br />Optional: \{\} <br /> |
+| `ttlSecondsAfterFinished` _integer_ | Deprecated: TTLSecondsAfterFinished is ignored. Checkpoint Jobs use a fixed<br />300 second TTL. |  | Minimum: 0 <br />Optional: \{\} <br /> |


 #### DynamoCheckpointPhase
@@ -284,7 +284,7 @@ _Appears in:_
 | --- | --- |
 | `Pending` | DynamoCheckpointPhasePending indicates the checkpoint CR has been created but the Job has not started<br /> |
 | `Creating` | DynamoCheckpointPhaseCreating indicates the checkpoint Job is running<br /> |
-| `Ready` | DynamoCheckpointPhaseReady indicates the checkpoint tar file is available on the PVC<br /> |
+| `Ready` | DynamoCheckpointPhaseReady indicates the checkpoint artifact is available<br /> |
 | `Failed` | DynamoCheckpointPhaseFailed indicates the checkpoint creation failed<br /> |


@@ -320,10 +320,10 @@ _Appears in:_
 | --- | --- | --- | --- |
 | `phase` _[DynamoCheckpointPhase](#dynamocheckpointphase)_ | Phase represents the current phase of the checkpoint lifecycle |  | Enum: [Pending Creating Ready Failed] <br />Optional: \{\} <br /> |
 | `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity<br />This hash is used to identify equivalent checkpoints |  | Optional: \{\} <br /> |
-| `location` _string_ | Location is the full URI/path to the checkpoint in the storage backend<br />For PVC: same as TarPath (e.g., /checkpoints/\{hash\}.tar)<br />For S3: s3://bucket/prefix/\{hash\}.tar<br />For OCI: oci://registry/repo:\{hash\} |  | Optional: \{\} <br /> |
-| `storageType` _[DynamoCheckpointStorageType](#dynamocheckpointstoragetype)_ | StorageType indicates the storage backend type used for this checkpoint |  | Enum: [pvc s3 oci] <br />Optional: \{\} <br /> |
+| `location` _string_ | Deprecated: Location is ignored and no longer populated. It is retained<br />only so older objects continue to validate. |  | Optional: \{\} <br /> |
+| `storageType` _[DynamoCheckpointStorageType](#dynamocheckpointstoragetype)_ | Deprecated: StorageType is ignored and no longer populated. It is retained<br />only so older objects continue to validate. |  | Enum: [pvc s3 oci] <br />Optional: \{\} <br /> |
 | `jobName` _string_ | JobName is the name of the checkpoint creation Job |  | Optional: \{\} <br /> |
-| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint tar was created |  | Optional: \{\} <br /> |
+| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint became ready |  | Optional: \{\} <br /> |
 | `message` _string_ | Message provides additional information about the current state |  | Optional: \{\} <br /> |
 | `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | DEPRECATED: Conditions are deprecated. Use status.phase instead. |  | Optional: \{\} <br /> |

@@ -332,7 +332,9 @@ _Appears in:_

 _Underlying type:_ _string_

-DynamoCheckpointStorageType defines the supported storage backends for checkpoints
+Deprecated: StorageType is retained for compatibility with older
+DynamoCheckpoint status consumers. The current checkpoint flow publishes
+PVC-backed artifacts discovered from the snapshot-agent DaemonSet.

 _Validation:_
 - Enum: [pvc s3 oci]
@@ -1716,14 +1718,15 @@ _Appears in:_
 | --- | --- | --- | --- |
 | `enabled` _boolean_ | Enabled indicates if checkpoint functionality is enabled |  |  |
 | `readyForCheckpointFilePath` _string_ | ReadyForCheckpointFilePath signals model readiness for checkpoint jobs | /tmp/ready-for-checkpoint |  |
-| `storage` _[CheckpointStorageConfiguration](#checkpointstorageconfiguration)_ | Storage holds storage backend configuration |  |  |
+| `storage` _[CheckpointStorageConfiguration](#checkpointstorageconfiguration)_ | Deprecated: Storage is retained for compatibility and ignored by the<br />current snapshot flow. Snapshot storage is discovered from the<br />snapshot-agent DaemonSet instead. |  |  |


 #### CheckpointOCIConfig



-CheckpointOCIConfig holds OCI registry storage configuration.
+Deprecated: CheckpointOCIConfig is retained for compatibility and ignored by
+the current snapshot flow.



@@ -1732,15 +1735,16 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `uri` _string_ | URI is the OCI URI (oci://registry/repository) |  |  |
-| `credentialsSecretRef` _string_ | CredentialsSecretRef is the name of the docker config secret |  |  |
+| `uri` _string_ | URI is the legacy OCI URI (oci://registry/repository). |  |  |
+| `credentialsSecretRef` _string_ | CredentialsSecretRef is the legacy docker config secret name. |  |  |


 #### CheckpointPVCConfig



-CheckpointPVCConfig holds PVC storage configuration.
+Deprecated: CheckpointPVCConfig is retained for compatibility and ignored by
+the current snapshot flow.



@@ -1749,15 +1753,16 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `pvcName` _string_ | PVCName is the name of the PVC | snapshot-pvc |  |
-| `basePath` _string_ | BasePath is the base directory within the PVC | /checkpoints |  |
+| `pvcName` _string_ | PVCName is the legacy PVC name. |  |  |
+| `basePath` _string_ | BasePath is the legacy base directory within the PVC. |  |  |


 #### CheckpointS3Config



-CheckpointS3Config holds S3 storage configuration.
+Deprecated: CheckpointS3Config is retained for compatibility and ignored by
+the current snapshot flow.



@@ -1766,15 +1771,16 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `uri` _string_ | URI is the S3 URI (s3://[endpoint/]bucket/prefix) |  |  |
-| `credentialsSecretRef` _string_ | CredentialsSecretRef is the name of the credentials secret |  |  |
+| `uri` _string_ | URI is the legacy S3 URI (s3://[endpoint/]bucket/prefix). |  |  |
+| `credentialsSecretRef` _string_ | CredentialsSecretRef is the legacy credentials secret name. |  |  |


 #### CheckpointStorageConfiguration



-CheckpointStorageConfiguration holds storage backend configuration for checkpoints.
+Deprecated: CheckpointStorageConfiguration is retained for compatibility and
+ignored by the current snapshot flow.



@@ -1783,10 +1789,10 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `type` _string_ | Type is the storage backend type: pvc, s3, or oci | pvc |  |
-| `pvc` _[CheckpointPVCConfig](#checkpointpvcconfig)_ | PVC configuration (used when Type=pvc) |  |  |
-| `s3` _[CheckpointS3Config](#checkpoints3config)_ | S3 configuration (used when Type=s3) |  |  |
-| `oci` _[CheckpointOCIConfig](#checkpointociconfig)_ | OCI configuration (used when Type=oci) |  |  |
+| `type` _string_ | Type is the legacy storage backend type: pvc, s3, or oci. |  |  |
+| `pvc` _[CheckpointPVCConfig](#checkpointpvcconfig)_ | PVC configuration for legacy pvc-based settings. |  |  |
+| `s3` _[CheckpointS3Config](#checkpoints3config)_ | S3 configuration for legacy s3-based settings. |  |  |
+| `oci` _[CheckpointOCIConfig](#checkpointociconfig)_ | OCI configuration for legacy oci-based settings. |  |  |


 #### DiscoveryBackend

--- a/docs/kubernetes/snapshot.md
+++ b/docs/kubernetes/snapshot.md
@@ -4,32 +4,38 @@
 title: Snapshot
 ---

-> ⚠️ **Experimental Feature**: Dynamo Snapshot is currently in **preview** and may only be functional in some k8s cluster setups. The Dynamo Snapshot DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.
+> ⚠️ **Experimental Feature**: Dynamo Snapshot is currently in preview and may only be functional in some cluster setups. The `snapshot-agent` DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.

-**Dynamo Snapshot** is an experimental infrastructure for fast-starting GPU applications in Kubernetes using CRIU (Checkpoint/Restore in User-space) and NVIDIA's cuda-checkpoint utility. Dynamo Snapshot dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on-demand.
+**Dynamo Snapshot** is infrastructure for fast-starting GPU applications in Kubernetes using CRIU (Checkpoint/Restore in Userspace) and NVIDIA's `cuda-checkpoint` utility. The usual flow is:
+
+1. start a worker once and checkpoint its initialized state
+2. store that checkpoint on a namespace-local snapshot volume
+3. restore later workers from that checkpoint instead of cold-starting again

 | Startup Type | Time | What Happens |
 |--------------|------|--------------|
 | **Cold Start** | ~1 min | Download model, load to GPU, initialize engine |
-| **Warm Start** (restore from checkpoint) | ~ 10 sec | Restore from a ready checkpoint directory |
+| **Warm Start** (restore from checkpoint) | ~10 sec | Restore from a ready checkpoint directory |

-> ⚠️ Restore time may vary depending on cluster configuration (storage bandwidth, GPU model, etc.)
+> ⚠️ Restore time depends on storage bandwidth, GPU model, and whether the restore stays on the same node.

 ## Prerequisites

- Dynamo Platform/Operator installed on a k8s cluster with **x86_64 (amd64)** GPU nodes
- NVIDIA driver 580.xx or newer on the target GPU nodes
- `ReadWriteMany` storage if you need cross-node restore
- vLLM or SGLang backend (TensorRT-LLM is not supported yet)
- Security clearance to run a privileged DaemonSet
+- x86_64 (`amd64`) GPU nodes
+- NVIDIA driver 580.xx or newer on the target GPU nodes (590.xx or newer if testing multi-GPU snapshots)
+- vLLM or SGLang backend (other backends are not supported yet)
+- `ReadWriteMany` storage for cross-node restore

-## Quick Start
+## Quick Start via `DynamoCheckpoint` CR

-This guide assumes a normal Dynamo deployment workflow is already present on your Kubernetes cluster.
+1. Build a placeholder image
+2. Enable checkpointing in the platform and install the snapshot chart
+3. Create a `DynamoCheckpoint` and wait for it to become ready
+4. Deploy a `DynamoGraphDeployment` that restores from the corresponding `checkpointRef`

 ### 1. Build and push a placeholder image

-Snapshot-enabled workers must use a placeholder image that wraps the normal runtime image with the restore tooling. If you do not already have one, build it with the snapshot placeholder target and push it to a registry your cluster can pull from:
+Snapshot-enabled workers must use a placeholder image that wraps the normal runtime image with restore tooling. If you do not already have one, build it and push it to a registry your cluster can pull from:

 ```bash
 export RUNTIME_IMAGE=registry.example.com/dynamo/vllm-runtime:1.0.0
@@ -45,7 +51,7 @@ make docker-push-placeholder \
  PLACEHOLDER_IMG="${PLACEHOLDER_IMAGE}"
 ```

-This flow is defined in [deploy/snapshot/Makefile](https://github.com/ai-dynamo/dynamo/blob/main/deploy/snapshot/Makefile) and [deploy/snapshot/Dockerfile](https://github.com/ai-dynamo/dynamo/blob/main/deploy/snapshot/Dockerfile). The placeholder image preserves the base runtime entrypoint and command contract, and adds the CRIU, `cuda-checkpoint`, and `nsrestore` tooling needed for restore.
+The placeholder image preserves the normal runtime entrypoint/command contract and adds the `criu`, `cuda-checkpoint`, and `nsrestore` tooling needed for checkpoint and restore.

 To build either snapshot image against a custom CRIU fork or ref, pass
 `CRIU_REPO` and `CRIU_REF` through `make`. If they are unset, the Dockerfile
@@ -66,17 +72,12 @@ make docker-build-placeholder \

 ### 2. Enable checkpointing in the platform and verify it

-Whether you are installing or upgrading `dynamo-platform`, the operator must have checkpointing enabled and must point at the same storage that the snapshot chart will use:
+Whether you are installing or upgrading `dynamo-platform`, the operator only needs checkpointing enabled:

 ```yaml
 dynamo-operator:
  checkpoint:
    enabled: true
-    storage:
-      type: pvc
-      pvc:
-        pvcName: snapshot-pvc
-        basePath: /checkpoints
 ```

 If the platform is already installed, verify that the operator config contains the checkpoint block:
@@ -90,11 +91,9 @@ kubectl get configmap "${OPERATOR_CONFIG}" -n "${PLATFORM_NAMESPACE}" \
  -o jsonpath='{.data.config\.yaml}' | sed -n '/^checkpoint:/,/^[^[:space:]]/p'
 ```

-Verify that the rendered config includes `enabled: true` and the same PVC name and base path you plan to use for the snapshot chart.
-
-For the full platform/operator configuration surface, see [deploy/helm/charts/platform/README.md](https://github.com/ai-dynamo/dynamo/blob/main/deploy/helm/charts/platform/README.md) and [deploy/helm/charts/platform/components/operator/values.yaml](https://github.com/ai-dynamo/dynamo/blob/main/deploy/helm/charts/platform/components/operator/values.yaml).
+Verify that the rendered config includes `enabled: true`.

-### 3. Install the snapshot chart
+### 3. Install the snapshot chart in the workload namespace

 ```bash
 helm upgrade --install snapshot ./deploy/helm/charts/snapshot \
@@ -103,28 +102,84 @@ helm upgrade --install snapshot ./deploy/helm/charts/snapshot \
  --set storage.pvc.create=true
 ```

-Cross-node restore requires `ReadWriteMany` storage. The chart defaults to that mode.
+Cross-node restore requires shared `ReadWriteMany` storage. The chart defaults to that mode. If your cluster does not have a default storage class, also set `storage.pvc.storageClass`.

-For better restore times, use a fast `ReadWriteMany` StorageClass for the checkpoint PVC. If you are reusing an existing checkpoint PVC, do not set `storage.pvc.create=true`; install the chart with `storage.pvc.create=false` and point `storage.pvc.name` at the existing PVC instead.
+If you are reusing an existing checkpoint PVC, do not set `storage.pvc.create=true`; install the chart with `storage.pvc.create=false` and set `storage.pvc.name` instead.

 Verify that the PVC and DaemonSet are ready:

 ```bash
 kubectl get pvc snapshot-pvc -n ${NAMESPACE}
 kubectl rollout status daemonset/snapshot-agent -n ${NAMESPACE}
+kubectl get pods -n ${NAMESPACE} -l app.kubernetes.io/component=snapshot-agent -o wide
+```
+
+### 4. Create a `DynamoCheckpoint`
+
+The checkpoint Job pod template should match the worker container you want to checkpoint. For the snapshot flow, the important parts are the checkpoint identity, the first container in `spec.containers`, and the placeholder image; the rest of the pod template should mirror your normal worker config.
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoCheckpoint
+metadata:
+  name: qwen3-06b-bf16
+spec:
+  identity:
+    model: Qwen/Qwen3-0.6B
+    backendFramework: vllm
+    tensorParallelSize: 1
+    dtype: bfloat16
+    maxModelLen: 2048
+
+  job:
+    activeDeadlineSeconds: 3600
+    podTemplateSpec:
+      spec:
+        ...
+        containers:
+          - name: worker
+            image: registry.example.com/dynamo/vllm-placeholder:1.0.0
+            ...
+```
+
+For a full working example, see [deploy/operator/config/samples/nvidia.com_v1alpha1_dynamocheckpoint.yaml](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/config/samples/nvidia.com_v1alpha1_dynamocheckpoint.yaml).
+
+Apply it:
+
+```bash
+kubectl apply -f qwen3-checkpoint.yaml -n ${NAMESPACE}
+```
+
+### 5. Wait for the checkpoint to become ready
+
+```bash
+kubectl get dckpt -n ${NAMESPACE} \
+  -o custom-columns=NAME:.metadata.name,HASH:.status.identityHash,PHASE:.status.phase
+
+kubectl wait \
+  --for=jsonpath='{.status.phase}'=Ready \
+  dynamocheckpoint/qwen3-06b-bf16 \
+  -n ${NAMESPACE} \
+  --timeout=30m
 ```

-For the full snapshot chart configuration surface, see [deploy/helm/charts/snapshot/README.md](https://github.com/ai-dynamo/dynamo/blob/main/deploy/helm/charts/snapshot/README.md) and [deploy/helm/charts/snapshot/values.yaml](https://github.com/ai-dynamo/dynamo/blob/main/deploy/helm/charts/snapshot/values.yaml).
+The useful status fields are:

-### 4. Apply a snapshot-compatible `DynamoGraphDeployment`
+- `status.phase`: high-level lifecycle (`Pending`, `Creating`, `Ready`, `Failed`)
+- `status.identityHash`: deterministic hash of `spec.identity`
+- `status.jobName`: checkpoint Job name
+- `status.createdAt`: timestamp recorded when the checkpoint became ready
+- `status.message`: progress or failure detail when available

-This example is adapted from [examples/backends/vllm/deploy/agg.yaml](https://github.com/ai-dynamo/dynamo/blob/main/examples/backends/vllm/deploy/agg.yaml). The worker must use the placeholder image from step 1, and the checkpoint identity must describe the runtime state you want to reuse.
+### 6. Deploy a `DynamoGraphDeployment` that restores from `checkpointRef`
+
+Once the checkpoint is `Ready`, restore a worker from it explicitly:

 ```yaml
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
-  name: vllm-snapshot-demo
+  name: vllm-checkpointref-demo
 spec:
  services:
    Frontend:
@@ -137,133 +192,139 @@ spec:
    VllmDecodeWorker:
      componentType: worker
      replicas: 1
-      resources:
-        limits:
-          gpu: "1"
-      readinessProbe:
-        httpGet:
-          path: /live
-          port: system
-        periodSeconds: 1
-        timeoutSeconds: 4
-        failureThreshold: 3
      checkpoint:
        enabled: true
-        mode: Auto
-        identity:
-          model: Qwen/Qwen3-0.6B
-          backendFramework: vllm
+        checkpointRef: qwen3-06b-bf16
      extraPodSpec:
        mainContainer:
          image: registry.example.com/dynamo/vllm-placeholder:1.0.0
-          command:
-            - python3
-            - -m
-            - dynamo.vllm
-          args:
-            - --model
-            - Qwen/Qwen3-0.6B
-          env:
-            - name: NCCL_DEBUG
-              value: ERROR
-            - name: TORCH_CPP_LOG_LEVEL
-              value: ERROR
-            - name: TORCH_DISTRIBUTED_DEBUG
-              value: "OFF"
+          ...
+        ...
 ```

-For SGLang, use `dynamo.sglang`, an SGLang placeholder image, `backendFramework: sglang`, and the matching CLI flags.
-
-Apply the manifest:
+Apply it:

 ```bash
-kubectl apply -f vllm-snapshot-demo.yaml -n ${NAMESPACE}
+kubectl apply -f vllm-checkpointref-demo.yaml -n ${NAMESPACE}
+kubectl get pods -n ${NAMESPACE} -w
 ```

-On the first rollout, the worker cold-starts, the operator resolves the checkpoint identity hash, and the checkpoint Job writes a new checkpoint directory into `snapshot-pvc`.
+The `VllmDecodeWorker` pod should restore from the ready checkpoint instead of creating a new one.

-### 5. Wait for the checkpoint to become ready
+## DGD Auto Flow

-Auto mode resolves checkpoints by identity hash. It may create `checkpoint-<hash>` or reuse an existing checkpoint with a different CR name. For the sample identity above, the hash is `73e74442beb109ed`:
+`checkpointRef` is the most explicit path. `mode: Auto` is the higher-level path: the operator computes the checkpoint identity hash, looks for an equivalent `DynamoCheckpoint`, and creates one only when no matching checkpoint exists. If a `DynamoCheckpoint` already exists with the same identity, Auto mode reuses it. If no matching checkpoint exists yet, the first worker cold-starts and the operator creates the checkpoint in the background.

-```bash
-kubectl get dckpt -n ${NAMESPACE}
+```yaml
+checkpoint:
+  enabled: true
+  mode: Auto
+  identity:
+    model: Qwen/Qwen3-0.6B
+    backendFramework: vllm
+    tensorParallelSize: 1
+    dtype: bfloat16
+    maxModelLen: 2048
+```

-CKPT_NAME=$(kubectl get dckpt -n ${NAMESPACE} \
-  -l nvidia.com/snapshot-checkpoint-hash=73e74442beb109ed \
-  -o jsonpath='{.items[0].metadata.name}')
-kubectl wait \
-  --for=jsonpath='{.status.phase}'=Ready \
-  "dynamocheckpoint/${CKPT_NAME}" \
-  -n ${NAMESPACE} \
-  --timeout=5m
+Inside a `DynamoGraphDeployment`, it looks like this:
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: vllm-auto-demo
+spec:
+  services:
+    Frontend:
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: registry.example.com/dynamo/vllm-runtime:1.0.0
+
+    VllmDecodeWorker:
+      componentType: worker
+      replicas: 1
+      checkpoint:
+        enabled: true
+        mode: Auto
+        identity:
+          model: Qwen/Qwen3-0.6B
+          backendFramework: vllm
+          tensorParallelSize: 1
+          dtype: bfloat16
+          maxModelLen: 2048
+      extraPodSpec:
+        mainContainer:
+          image: registry.example.com/dynamo/vllm-placeholder:1.0.0
+          ...
+        ...
 ```

-If you change the checkpoint identity, the hash changes and so does the checkpoint selected by Auto mode.
+Useful inspection commands:
+
+```bash
+kubectl get dgd vllm-auto-demo -n ${NAMESPACE} \
+  -o jsonpath='{.status.checkpoints.VllmDecodeWorker.checkpointName}{"\n"}{.status.checkpoints.VllmDecodeWorker.identityHash}{"\n"}{.status.checkpoints.VllmDecodeWorker.ready}{"\n"}'

-### 6. Trigger restore
+kubectl get dckpt -n ${NAMESPACE}
+```

-Once the checkpoint is ready, scale the worker replicas from `1` to `2`:
+If you want to force a new restore after the checkpoint becomes ready, scale the worker:

 ```bash
-kubectl patch dgd vllm-snapshot-demo -n ${NAMESPACE} --type=merge \
+kubectl patch dgd vllm-auto-demo -n ${NAMESPACE} --type=merge \
  -p '{"spec":{"services":{"VllmDecodeWorker":{"replicas":2}}}}'
 ```

-New worker pods for `VllmDecodeWorker` will restore from the ready checkpoint automatically.
+## Lower-Level Testing With `snapshotctl`

-## Checkpoint Configuration
+You can checkpoint and restore pods without the Dynamo operator by using the lower-level `snapshotctl` utility. The snapshot Helm chart must still be installed in the namespace, with a running `snapshot-agent` DaemonSet that has the checkpoint PVC mounted.

-### Auto Mode (Recommended)
+`snapshotctl` is intended for lower-level debugging and validation workflows, not as the primary user-facing checkpoint interface. For command details and manifest requirements, see [deploy/snapshot/cmd/snapshotctl/README.md](../../deploy/snapshot/cmd/snapshotctl/README.md).

-The operator computes the checkpoint identity hash, looks up an existing `DynamoCheckpoint` by that hash, and creates a new `DynamoCheckpoint` only when no matching checkpoint already exists:
+### Checkpoint from a worker pod manifest

-```yaml
-checkpoint:
-  enabled: true
-  mode: Auto
-  identity:
-    model: "meta-llama/Llama-3-8B"
-    backendFramework: "vllm"  # or "sglang"
-    tensorParallelSize: 1
-    dtype: "bfloat16"
-    maxModelLen: 4096
+```bash
+snapshotctl checkpoint \
+  --manifest ./worker-pod.yaml \
+  --namespace ${NAMESPACE}
 ```

-The `DynamoGraphDeployment` mirrors checkpoint resolution state under `.status.checkpoints`, including the resolved checkpoint CR name, identity hash, and whether the checkpoint was visible to the worker when it started:
+The checkpoint manifest must be for a pod, contain exactly one worker container, and use a placeholder image.
+If you do not pass `--checkpoint-id`, `snapshotctl` generates one and prints it:

-```bash
-kubectl get dgd vllm-snapshot-demo -n ${NAMESPACE} \
-  -o jsonpath='{.status.checkpoints.VllmDecodeWorker.checkpointName}{"\n"}{.status.checkpoints.VllmDecodeWorker.identityHash}{"\n"}'
+```text
+status=completed
+namespace=...
+name=...
+checkpoint_job=...
+checkpoint_id=manual-snapshot-...
+checkpoint_location=/checkpoints/...
 ```

-### Manual Management and `checkpointRef`
+### Restore from a worker pod manifest

-Use `checkpointRef` when you want a service to restore from a specific `DynamoCheckpoint` CR:
-
-```yaml
-checkpoint:
-  enabled: true
-  checkpointRef: "qwen3-06b-bf16"
+```bash
+snapshotctl restore \
+  --manifest ./worker-pod.yaml \
+  --namespace ${NAMESPACE} \
+  --checkpoint-id manual-snapshot-...
 ```

-This is useful when:
- You want to **pre-warm checkpoints** before creating DGDs
- You want **explicit control** over which checkpoint to use
-
-`checkpointRef` resolves by `DynamoCheckpoint.metadata.name`. Use a readable CR name when you want an explicit checkpoint that operators can reference directly.
+This creates a new restore pod from the manifest and waits for the restore annotation to reach `completed`.

-If you are managing checkpoint CRs yourself, set `mode: Manual` on the service to prevent the operator from creating a new `DynamoCheckpoint` when identity-based lookup does not find one.
+### Restore an existing pod in place

 ```bash
-# Check checkpoint status by CR name
-kubectl get dynamocheckpoint qwen3-06b-bf16 -n ${NAMESPACE}
-
-# Now create DGD referencing it
-kubectl apply -f my-dgd.yaml -n ${NAMESPACE}
+snapshotctl restore \
+  --pod existing-restore-target \
+  --namespace ${NAMESPACE} \
+  --checkpoint-id manual-snapshot-...
 ```

-`mode: Auto` still resolves checkpoints by identity hash. The operator backfills `status.identityHash` and the `nvidia.com/snapshot-checkpoint-hash` label on each `DynamoCheckpoint` so auto lookup and uniqueness checks do not depend on the CR name.
+This patches restore metadata onto an existing pod that is already snapshot-compatible.

 ## Checkpoint Identity

@@ -274,215 +335,110 @@ Checkpoints are uniquely identified by a **16-character SHA256 hash** (64 bits)
 | `model` | ✓ | ✓ | `meta-llama/Llama-3-8B` |
 | `backendFramework` | ✓ | ✓ | `sglang`, `vllm` |
 | `dynamoVersion` | | ✓ | `0.9.0`, `1.0.0` |
-| `tensorParallelSize` | | ✓ | `1`, `2`, `4`, `8` (default: 1) |
-| `pipelineParallelSize` | | ✓ | `1`, `2` (default: 1) |
+| `tensorParallelSize` | | ✓ | `1`, `2`, `4`, `8` |
+| `pipelineParallelSize` | | ✓ | `1`, `2` |
 | `dtype` | | ✓ | `float16`, `bfloat16`, `fp8` |
 | `maxModelLen` | | ✓ | `4096`, `8192` |
-| `extraParameters` | | ✓ | Custom key-value pairs |
-
-**Not included in hash** (don't invalidate checkpoint):
- `replicas`
- `nodeSelector`, `affinity`, `tolerations`
- `resources` (requests/limits)
- Logging/observability config
+| `extraParameters` | | ✓ | custom key-value pairs |

-**Example with all fields:**
-```yaml
-checkpoint:
-  enabled: true
-  mode: Auto
-  identity:
-    model: "meta-llama/Llama-3-8B"
-    backendFramework: "vllm"
-    dynamoVersion: "1.0.0"
-    tensorParallelSize: 1
-    pipelineParallelSize: 1
-    dtype: "bfloat16"
-    maxModelLen: 8192
-    extraParameters:
-      enableChunkedPrefill: "true"
-      quantization: "awq"
-```
+Fields that do **not** change the checkpoint hash include:

-## DynamoCheckpoint CRD
+- replica count
+- node placement (`nodeSelector`, `affinity`, `tolerations`)
+- resource requests/limits
+- logging or observability configuration

-The `DynamoCheckpoint` (shortname: `dckpt`) is a Kubernetes Custom Resource that manages checkpoint lifecycle.
+## `DynamoCheckpoint` CRD

-**When to create a DynamoCheckpoint directly:**
- **Pre-warming:** Create checkpoints before deploying DGDs for instant startup
- **Explicit control:** Manage checkpoint lifecycle independently from DGDs
+The `DynamoCheckpoint` (shortname: `dckpt`) is the operator-managed resource for checkpoint lifecycle.

-The operator requires `spec.identity` and `spec.job.podTemplateSpec`. The pod template should match the worker container you want checkpointed, including image, command, args, secrets, volumes, and resource limits. You do not need to set checkpoint-control plumbing manually; the operator injects the checkpoint-ready signal path for checkpoint Jobs and adds the restore metadata consumed by restored pods and the node-local controller inside the `snapshot-agent` DaemonSet.
-`spec.job.backoffLimit` is deprecated and ignored. Checkpoint Jobs are always single-attempt.
+Use it when you want:

-**Create a checkpoint:**
+- pre-warmed checkpoints before any `DynamoGraphDeployment` exists
+- explicit lifecycle control independent from a DGD
+- a stable human-readable name that services can reference with `checkpointRef`

-```yaml
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoCheckpoint
-metadata:
-  name: qwen3-06b-bf16
-spec:
-  identity:
-    model: Qwen/Qwen3-0.6B
-    backendFramework: vllm
-    tensorParallelSize: 1
-    dtype: bfloat16
-    maxModelLen: 4096
+The operator requires:

-  job:
-    activeDeadlineSeconds: 3600
-    ttlSecondsAfterFinished: 300
-    podTemplateSpec:
-      spec:
-        restartPolicy: Never
-        containers:
-          - name: main
-            image: registry.example.com/dynamo/vllm-placeholder:1.0.0
-            command:
-              - python3
-              - -m
-              - dynamo.vllm
-            args:
-              - --model
-              - Qwen/Qwen3-0.6B
-            env:
-              - name: NCCL_DEBUG
-                value: ERROR
-              - name: TORCH_CPP_LOG_LEVEL
-                value: ERROR
-              - name: TORCH_DISTRIBUTED_DEBUG
-                value: "OFF"
-            resources:
-              limits:
-                nvidia.com/gpu: "1"
-```
+- `spec.identity`
+- `spec.job.podTemplateSpec`

-For this example identity, the operator computes a deterministic identity hash and stores it in `status.identityHash`. Auto mode uses that hash, not the CR name, when it decides whether to reuse or create a checkpoint.
+`spec.job.backoffLimit` is deprecated and ignored. Checkpoint Jobs are always single-attempt.

-**Check status:**
+Check status with:

 ```bash
-# List all checkpoints
-kubectl get dynamocheckpoint -n ${NAMESPACE}
-# Or use shortname
 kubectl get dckpt -n ${NAMESPACE}
-
-NAME               MODEL                                BACKEND  PHASE     HASH              AGE
-qwen3-06b-bf16     Qwen/Qwen3-0.6B                      vllm     Ready     3bff874d069f0ed5  5m
-llama3-8b-bf16     meta-llama/Meta-Llama-3-8B-Instruct  vllm     Creating  9be4f5574b5a285d  2m
+kubectl describe dckpt qwen3-06b-bf16 -n ${NAMESPACE}
+kubectl get dckpt qwen3-06b-bf16 -n ${NAMESPACE} -o yaml
 ```

-**Phases:**
+The `status` block looks like:
+
+```yaml
+status:
+  phase: Ready
+  identityHash: 3bff874d069f0ed5
+  jobName: checkpoint-job-3bff874d069f0ed5-1
+  createdAt: "2026-01-29T10:05:00Z"
+  message: ""
+```

-| Phase | Description |
-|-------|-------------|
-| `Pending` | CR created, waiting for job to start |
-| `Creating` | Checkpoint job is running |
-| `Ready` | Checkpoint available for use |
-| `Failed` | Checkpoint creation failed |
+## Limitations

-Other useful status fields are:
+- **LLM workers only**: checkpoint/restore supports LLM decode and prefill workers. Specialized workers such as multimodal, embedding, and diffusion are not supported.
+- **Multi-GPU remains in preview**: tensor-parallel configurations are exercised in internal testing, but they are not yet a broadly supported production path across clusters.
+- **Network state is sensitive**: restore is sensitive to live TCP socket state. Loopback bootstrap/control sockets are the most reliable path today.
+- **Privileged DaemonSet required**: `snapshot-agent` must run privileged to execute CRIU and `cuda-checkpoint`. Workload pods do not need to be privileged.

-| Field | Meaning |
-|-------|---------|
-| `status.identityHash` | Deterministic hash of `spec.identity` used for auto lookup and reuse |
-| `status.jobName` | Name of the checkpoint Job |
-| `status.location` | Checkpoint location in the configured storage backend |
-| `status.storageType` | Storage backend type (`pvc`, `s3`, or `oci`) |
-| `status.createdAt` | Timestamp recorded when the checkpoint becomes ready |
-| `status.message` | Failure or progress message when available |
+## Troubleshooting

-`status.conditions` is deprecated for `DynamoCheckpoint`. The legacy condition types `JobCreated` and `JobCompleted` are kept for compatibility only. Prefer `status.phase`, `status.jobName`, and `status.message` when checking checkpoint progress.
+### Checkpoint Job finishes but the checkpoint never becomes `Ready`

-**Detailed status:**
+A checkpoint only becomes `Ready` after `snapshot-agent` confirms the checkpoint contents. A completed Job is not enough by itself.

 ```bash
-kubectl describe dckpt qwen3-06b-bf16 -n ${NAMESPACE}
-```
+kubectl get dckpt <checkpoint-name> -n ${NAMESPACE} \
+  -o custom-columns=NAME:.metadata.name,PHASE:.status.phase,MESSAGE:.status.message,JOB:.status.jobName

-```yaml
-Status:
-  Phase: Ready
-  IdentityHash: 3bff874d069f0ed5
-  JobName: checkpoint-job-3bff874d069f0ed5
-  Location: /checkpoints/3bff874d069f0ed5
-  StorageType: pvc
-  CreatedAt: 2026-01-29T10:05:00Z
+JOB_NAME=$(kubectl get dckpt <checkpoint-name> -n ${NAMESPACE} -o jsonpath='{.status.jobName}')
+if [ -n "${JOB_NAME}" ]; then
+  kubectl logs job/"${JOB_NAME}" -n ${NAMESPACE}
+fi
+
+kubectl logs daemonset/snapshot-agent -n ${NAMESPACE} --all-containers
 ```

-**Reference from DGD:**
+If the worker template is at fault, the most common mistakes are using the raw runtime image instead of the placeholder image, or leaving out the mounts and secrets that the worker needs to start.

-Once the checkpoint is `Ready`, you can reference it by CR name:
+### Restore cannot find or mount checkpoint storage

-```yaml
-spec:
-  services:
-    VllmDecodeWorker:
-      checkpoint:
-        enabled: true
-        checkpointRef: "qwen3-06b-bf16"
-```
+Restore discovers checkpoint storage from the `snapshot-agent` DaemonSet in the same namespace. That DaemonSet must be ready and must mount the checkpoint PVC.

-Or use `mode: Auto` with the same identity, and the operator will reuse the same deterministic checkpoint object automatically.
+```bash
+kubectl rollout status daemonset/snapshot-agent -n ${NAMESPACE}
+kubectl get daemonset -n ${NAMESPACE} -l app.kubernetes.io/component=snapshot-agent -o wide
+kubectl get pvc -n ${NAMESPACE}
+```

-## Limitations
+`snapshotctl` uses this same discovery mechanism when it resolves checkpoint storage.

- **LLM workers only**: Checkpoint/restore supports LLM decode and prefill workers. Specialized workers (multimodal, embedding, diffusion) are not supported.
- **Single-GPU only**: Multi-GPU configurations may work in very basic hardware configurations, but are not officially supported yet.
- **Network state**: Restore is sensitive to live TCP socket state. Loopback bootstrap/control sockets can work with the supported CRIU TCP policies, but non-loopback or pod-IP-bound connections can still break restore.
- **Security**: Dynamo Snapshot runs as a **privileged DaemonSet** which is required to run CRIU and cuda-checkpoint. However, workload pods do not need to be privileged.
+### `snapshotctl` manifest is rejected or the restore target is wrong

-## Troubleshooting
+`snapshotctl` only accepts a single-container `Pod` manifest.

-### Checkpoint Not Ready
-
-1. Check the checkpoint job:
-   ```bash
-   kubectl get dckpt -n ${NAMESPACE}
-   kubectl describe dckpt <checkpoint-name> -n ${NAMESPACE}
-   JOB_NAME=$(kubectl get dckpt <checkpoint-name> -n ${NAMESPACE} -o jsonpath='{.status.jobName}')
-   if [ -n "${JOB_NAME}" ]; then
-     kubectl logs job/"${JOB_NAME}" -n ${NAMESPACE}
-   fi
-   ```
-
-2. Check the DaemonSet:
-   ```bash
-   kubectl logs daemonset/snapshot-agent -n ${NAMESPACE} --all-containers
-   ```
-
-3. Verify that platform and chart storage settings match:
-   ```bash
-   kubectl get dckpt <checkpoint-name> -n ${NAMESPACE} -o yaml
-   ```
-
-### Restore Failing
-
-1. Check pod logs:
-   ```bash
-   kubectl logs <worker-pod> -n ${NAMESPACE}
-   ```
-
-2. Describe the restore target pod:
-   ```bash
-   kubectl describe pod <worker-pod> -n ${NAMESPACE}
-   ```
-
-3. Confirm the referenced checkpoint is still `Ready`:
-   ```bash
-   kubectl get dckpt <checkpoint-name> -n ${NAMESPACE}
-   ```
+```bash
+snapshotctl checkpoint --manifest ./worker-pod.yaml --namespace ${NAMESPACE}
+snapshotctl restore --manifest ./worker-pod.yaml --namespace ${NAMESPACE} --checkpoint-id <checkpoint-id>
+```

 ## Planned Features

- TensorRT-LLM backend support
- S3/MinIO storage backend
- OCI registry storage backend
- Multi-GPU checkpoints
+- Stabilize multi-GPU support
+- TensorRT-LLM support
+- Alternative storage backends

 ## Related Documentation

- [Dynamo Snapshot Helm Chart README](https://github.com/ai-dynamo/dynamo/blob/main/deploy/helm/charts/snapshot/README.md) - Chart configuration
- [Installation Guide](installation-guide.md) - Platform installation
- [API Reference](api-reference.md) - Complete CRD specifications
+- [Installation Guide](installation-guide.md)
+- [API Reference](api-reference.md)