Unverified commit 43e810a4, authored by Schwinn Saereesitthipitak and committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
......@@ -4,6 +4,7 @@ package types
import (
"fmt"
"os"
"strings"
"time"
)
......@@ -12,6 +13,7 @@ import (
type AgentConfig struct {
NodeName string `yaml:"-"`
RestrictedNamespace string `yaml:"-"`
Storage StorageSpec `yaml:"storage"`
Overlay OverlaySettings `yaml:"overlay"`
Restore RestoreSpec `yaml:"restore"`
CRIU CRIUSettings `yaml:"criu"`
......@@ -27,6 +29,16 @@ func (c *AgentConfig) LoadEnvOverrides() {
}
func (c *AgentConfig) Validate() error {
storageType := strings.TrimSpace(c.Storage.Type)
if storageType == "" {
storageType = "pvc"
}
if storageType != "pvc" {
return &ConfigError{Field: "storage.type", Message: fmt.Sprintf("unsupported storage type %q; only pvc is implemented today", storageType)}
}
if strings.TrimSpace(c.Storage.BasePath) == "" {
return &ConfigError{Field: "storage.basePath", Message: "storage.basePath is required"}
}
if c.CRIU.TcpClose && c.CRIU.TcpEstablished {
return &ConfigError{
Field: "criu",
......@@ -36,6 +48,12 @@ func (c *AgentConfig) Validate() error {
return c.Restore.Validate()
}
// StorageSpec holds snapshot storage settings that are local to the agent deployment.
// It is decoded from the "storage" key of the agent configuration.
type StorageSpec struct {
// Type selects the storage backend. AgentConfig.Validate treats a blank value
// as "pvc" and rejects every other value.
Type string `yaml:"type"`
// BasePath must be non-blank; AgentConfig.Validate rejects an empty value.
// NOTE(review): presumably the directory checkpoints are stored under — confirm against the agent.
BasePath string `yaml:"basePath"`
}
// RestoreSpec holds settings for the CRIU restore process.
type RestoreSpec struct {
NSRestorePath string `yaml:"nsRestorePath"`
......
......@@ -4,6 +4,7 @@ import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
......@@ -15,8 +16,8 @@ const manifestFilename = "manifest.yaml"
// CheckpointManifest is saved as manifest.yaml at checkpoint time and loaded at restore.
type CheckpointManifest struct {
CheckpointHash string `yaml:"checkpointHash"`
CreatedAt time.Time `yaml:"createdAt"`
CheckpointID string `yaml:"checkpointId"`
CreatedAt time.Time `yaml:"createdAt"`
CRIUDump CRIUDumpManifest `yaml:"criuDump"`
K8s SourcePodManifest `yaml:"k8s"`
......@@ -25,17 +26,17 @@ type CheckpointManifest struct {
}
func NewCheckpointManifest(
checkpointHash string,
checkpointID string,
criuDump CRIUDumpManifest,
k8s SourcePodManifest,
overlay OverlayManifest,
) *CheckpointManifest {
return &CheckpointManifest{
CheckpointHash: checkpointHash,
CreatedAt: time.Now().UTC(),
CRIUDump: criuDump,
K8s: k8s,
Overlay: overlay,
CheckpointID: checkpointID,
CreatedAt: time.Now().UTC(),
CRIUDump: criuDump,
K8s: k8s,
Overlay: overlay,
}
}
......@@ -140,6 +141,13 @@ func (m CUDAManifest) IsEmpty() bool {
// WriteManifest writes a checkpoint manifest file in the checkpoint directory.
func WriteManifest(checkpointDir string, data *CheckpointManifest) error {
if data == nil {
return fmt.Errorf("checkpoint manifest is required")
}
if strings.TrimSpace(data.CheckpointID) == "" {
return fmt.Errorf("checkpoint manifest is missing checkpointId")
}
content, err := yaml.Marshal(data)
if err != nil {
return fmt.Errorf("failed to marshal checkpoint manifest: %w", err)
......@@ -166,6 +174,9 @@ func ReadManifest(checkpointDir string) (*CheckpointManifest, error) {
if err := yaml.Unmarshal(content, &data); err != nil {
return nil, fmt.Errorf("failed to unmarshal checkpoint manifest: %w", err)
}
if strings.TrimSpace(data.CheckpointID) == "" {
return nil, fmt.Errorf("checkpoint manifest is missing checkpointId")
}
return &data, nil
}
package types
import (
"os"
"path/filepath"
"testing"
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
......@@ -42,8 +44,8 @@ func TestManifestRoundTrip(t *testing.T) {
}
// Verify key fields survived the round-trip
if loaded.CheckpointHash != original.CheckpointHash {
t.Errorf("CheckpointHash = %q, want %q", loaded.CheckpointHash, original.CheckpointHash)
if loaded.CheckpointID != original.CheckpointID {
t.Errorf("CheckpointID = %q, want %q", loaded.CheckpointID, original.CheckpointID)
}
if loaded.CRIUDump.CRIU.LogLevel != 4 {
t.Errorf("CRIU.LogLevel = %d, want 4", loaded.CRIUDump.CRIU.LogLevel)
......@@ -138,3 +140,26 @@ func TestNewCRIUDumpManifest(t *testing.T) {
}
})
}
// TestWriteManifestRejectsMissingCheckpointID checks that WriteManifest refuses
// a manifest whose CheckpointID is unset and reports the expected message.
func TestWriteManifestRejectsMissingCheckpointID(t *testing.T) {
	const wantMsg = "checkpoint manifest is missing checkpointId"

	emptyManifest := &CheckpointManifest{}
	err := WriteManifest(t.TempDir(), emptyManifest)
	if err != nil && err.Error() == wantMsg {
		return
	}
	t.Fatalf("expected missing checkpointId error, got %v", err)
}
// TestReadManifestRejectsMissingCheckpointID checks that ReadManifest rejects an
// on-disk manifest that parses as YAML but carries no checkpointId.
func TestReadManifestRejectsMissingCheckpointID(t *testing.T) {
	const wantMsg = "checkpoint manifest is missing checkpointId"

	checkpointDir := t.TempDir()
	manifestPath := filepath.Join(checkpointDir, manifestFilename)
	// Only createdAt is present, so the checkpointId field decodes as empty.
	writeErr := os.WriteFile(manifestPath, []byte("createdAt: 2026-03-31T00:00:00Z\n"), 0o600)
	if writeErr != nil {
		t.Fatalf("WriteFile: %v", writeErr)
	}

	_, err := ReadManifest(checkpointDir)
	if err != nil && err.Error() == wantMsg {
		return
	}
	t.Fatalf("expected missing checkpointId error, got %v", err)
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package protocol
import (
"fmt"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
)
// CheckpointJobOptions carries the inputs NewCheckpointJob needs besides the pod template.
type CheckpointJobOptions struct {
// Namespace is set as the Job's namespace.
Namespace string
// CheckpointID is written to the Job's checkpoint-id label and, when non-empty,
// to the pod template's checkpoint-id label.
CheckpointID string
// ArtifactVersion is written to the pod template's artifact-version annotation
// (a blank value falls back to the default via ArtifactVersion).
ArtifactVersion string
// SeccompProfile, when non-empty, is applied to the pod template as a localhost seccomp profile.
SeccompProfile string
// Name is set as the Job's name.
Name string
// ActiveDeadlineSeconds is copied to JobSpec.ActiveDeadlineSeconds; nil leaves it unset.
ActiveDeadlineSeconds *int64
// TTLSecondsAfterFinish is copied to JobSpec.TTLSecondsAfterFinished; nil leaves it unset.
TTLSecondsAfterFinish *int32
// WrapLaunchJob makes NewCheckpointJob rewrite the first container's command so it
// runs under "cuda-checkpoint --launch-job".
WrapLaunchJob bool
}
// NewCheckpointJob builds the batch/v1 Job that runs a checkpoint source pod.
//
// The supplied podTemplate is deep-copied, so the caller's object is never
// mutated. On the copy the function:
//   - applies the checkpoint source label, checkpoint-id label and
//     artifact-version annotation;
//   - forces restartPolicy to Never;
//   - injects a localhost seccomp profile when opts.SeccompProfile is non-empty;
//   - when opts.WrapLaunchJob is set, rewrites the first container so its
//     original command and args run under "cuda-checkpoint --launch-job".
//
// The returned Job always has backoffLimit 0.
//
// An error is returned when podTemplate is nil, or when opts.WrapLaunchJob is
// set and the template has no containers or the first container has no
// command to wrap.
func NewCheckpointJob(podTemplate *corev1.PodTemplateSpec, opts CheckpointJobOptions) (*batchv1.Job, error) {
	// DeepCopy on a nil receiver returns nil; guard here so the caller gets an
	// error instead of a nil-pointer panic on the field accesses below.
	if podTemplate == nil {
		return nil, fmt.Errorf("checkpoint job requires a pod template")
	}

	podTemplate = podTemplate.DeepCopy()
	if podTemplate.Labels == nil {
		podTemplate.Labels = map[string]string{}
	}
	if podTemplate.Annotations == nil {
		podTemplate.Annotations = map[string]string{}
	}
	applyCheckpointSourceMetadata(podTemplate.Labels, podTemplate.Annotations, opts.CheckpointID, opts.ArtifactVersion)

	podTemplate.Spec.RestartPolicy = corev1.RestartPolicyNever
	if opts.SeccompProfile != "" {
		EnsureLocalhostSeccompProfile(&podTemplate.Spec, opts.SeccompProfile)
	}

	if opts.WrapLaunchJob {
		if len(podTemplate.Spec.Containers) == 0 {
			return nil, fmt.Errorf("checkpoint job requires one worker container")
		}
		// Only the first container is treated as the worker and wrapped.
		worker := &podTemplate.Spec.Containers[0]
		if len(worker.Command) == 0 {
			return nil, fmt.Errorf("checkpoint job requires container.command when cuda-checkpoint launch-job wrapping is enabled")
		}
		worker.Command, worker.Args = wrapWithCudaCheckpointLaunchJob(worker.Command, worker.Args)
	}

	return &batchv1.Job{
		TypeMeta: metav1.TypeMeta{APIVersion: "batch/v1", Kind: "Job"},
		ObjectMeta: metav1.ObjectMeta{
			Name:      opts.Name,
			Namespace: opts.Namespace,
			Labels: map[string]string{
				CheckpointIDLabel: opts.CheckpointID,
			},
		},
		Spec: batchv1.JobSpec{
			ActiveDeadlineSeconds:   opts.ActiveDeadlineSeconds,
			BackoffLimit:            ptr.To[int32](0),
			TTLSecondsAfterFinished: opts.TTLSecondsAfterFinish,
			Template:                *podTemplate,
		},
	}, nil
}
// EnsureLocalhostSeccompProfile sets the pod-level seccomp profile on podSpec to
// a Localhost profile pointing at profile, replacing any profile already
// present. A pod security context is created when the spec has none; other
// fields of an existing security context are left untouched.
func EnsureLocalhostSeccompProfile(podSpec *corev1.PodSpec, profile string) {
	localhost := &corev1.SeccompProfile{
		Type:             corev1.SeccompProfileTypeLocalhost,
		LocalhostProfile: &profile,
	}
	if podSpec.SecurityContext != nil {
		podSpec.SecurityContext.SeccompProfile = localhost
		return
	}
	podSpec.SecurityContext = &corev1.PodSecurityContext{SeccompProfile: localhost}
}
// wrapWithCudaCheckpointLaunchJob returns the command/args pair that runs the
// given command and args under "cuda-checkpoint --launch-job". The returned
// args are "--launch-job", then command, then args, in a freshly allocated
// slice that does not alias either input.
func wrapWithCudaCheckpointLaunchJob(command []string, args []string) ([]string, []string) {
	launchArgs := make([]string, 1+len(command)+len(args))
	launchArgs[0] = "--launch-job"
	next := 1 + copy(launchArgs[1:], command)
	copy(launchArgs[next:], args)
	return []string{"cuda-checkpoint"}, launchArgs
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package protocol
import (
"testing"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
)
// TestNewCheckpointJob builds a checkpoint Job from a single-container pod
// template with every option set and checks the Job identity, the metadata
// applied to the template, the launch-job wrapping and the Job spec fields.
func TestNewCheckpointJob(t *testing.T) {
job, err := NewCheckpointJob(&corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{"existing": "label"},
Annotations: map[string]string{"existing": "annotation"},
},
Spec: corev1.PodSpec{
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{{
Name: "main",
Image: "test:latest",
Command: []string{"python3", "-m", "dynamo.vllm"},
Args: []string{"--model", "Qwen"},
}},
},
}, CheckpointJobOptions{
Namespace: "test-ns",
CheckpointID: "hash",
ArtifactVersion: "2",
SeccompProfile: DefaultSeccompLocalhostProfile,
Name: "test-job",
ActiveDeadlineSeconds: ptr.To(int64(60)),
TTLSecondsAfterFinish: ptr.To(int32(300)),
WrapLaunchJob: true,
})
if err != nil {
t.Fatalf("expected checkpoint job, got error: %v", err)
}
// Job identity and labels.
if job.Name != "test-job" || job.Namespace != "test-ns" {
t.Fatalf("unexpected job identity: %#v", job.ObjectMeta)
}
if job.Labels[CheckpointIDLabel] != "hash" {
t.Fatalf("expected checkpoint hash label on job: %#v", job.Labels)
}
// Metadata applied to the pod template.
if job.Spec.Template.Labels[CheckpointSourceLabel] != "true" {
t.Fatalf("expected checkpoint source label on template: %#v", job.Spec.Template.Labels)
}
if job.Spec.Template.Annotations[CheckpointArtifactVersionAnnotation] != "2" {
t.Fatalf("expected checkpoint artifact version annotation on template: %#v", job.Spec.Template.Annotations)
}
// The job builder must not add checkpoint storage volumes or mounts.
if len(job.Spec.Template.Spec.Volumes) != 0 {
t.Fatalf("expected no checkpoint volume, got %#v", job.Spec.Template.Spec.Volumes)
}
if len(job.Spec.Template.Spec.Containers[0].VolumeMounts) != 0 {
t.Fatalf("expected no checkpoint volume mount, got %#v", job.Spec.Template.Spec.Containers[0].VolumeMounts)
}
// The template's restartPolicy (Always) must be overridden.
if job.Spec.Template.Spec.RestartPolicy != corev1.RestartPolicyNever {
t.Fatalf("expected restartPolicy Never, got %#v", job.Spec.Template.Spec.RestartPolicy)
}
if job.Spec.Template.Spec.SecurityContext == nil || job.Spec.Template.Spec.SecurityContext.SeccompProfile == nil {
t.Fatalf("expected seccomp profile to be injected: %#v", job.Spec.Template.Spec.SecurityContext)
}
// Launch-job wrapping: the original command and args become args of cuda-checkpoint.
if len(job.Spec.Template.Spec.Containers[0].Command) != 1 || job.Spec.Template.Spec.Containers[0].Command[0] != "cuda-checkpoint" {
t.Fatalf("expected cuda-checkpoint wrapper command: %#v", job.Spec.Template.Spec.Containers[0].Command)
}
expectedArgs := []string{"--launch-job", "python3", "-m", "dynamo.vllm", "--model", "Qwen"}
if len(job.Spec.Template.Spec.Containers[0].Args) != len(expectedArgs) {
t.Fatalf("expected launch-job args %#v, got %#v", expectedArgs, job.Spec.Template.Spec.Containers[0].Args)
}
for i := range expectedArgs {
if job.Spec.Template.Spec.Containers[0].Args[i] != expectedArgs[i] {
t.Fatalf("expected launch-job args %#v, got %#v", expectedArgs, job.Spec.Template.Spec.Containers[0].Args)
}
}
// Job spec fields.
if job.Spec.BackoffLimit == nil || *job.Spec.BackoffLimit != 0 {
t.Fatalf("expected backoffLimit 0, got %#v", job.Spec.BackoffLimit)
}
if job.Spec.ActiveDeadlineSeconds == nil || *job.Spec.ActiveDeadlineSeconds != 60 {
t.Fatalf("unexpected activeDeadlineSeconds: %#v", job.Spec.ActiveDeadlineSeconds)
}
if job.Spec.TTLSecondsAfterFinished == nil || *job.Spec.TTLSecondsAfterFinished != 300 {
t.Fatalf("unexpected ttlSecondsAfterFinished: %#v", job.Spec.TTLSecondsAfterFinished)
}
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package protocol
import (
"fmt"
"strings"
)
// Label keys, annotation keys, default values and status strings used by the
// snapshot protocol helpers in this package.
const (
// Pod label keys. The source and restore-target labels are set to "true" by the
// metadata helpers in this package; the checkpoint-id label carries the checkpoint ID.
CheckpointSourceLabel = "nvidia.com/snapshot-is-checkpoint-source"
CheckpointIDLabel = "nvidia.com/snapshot-checkpoint-id"
RestoreTargetLabel = "nvidia.com/snapshot-is-restore-target"
// Pod annotation keys. The status and container-id annotations are only cleared by
// this package.
// NOTE(review): presumably they are written by the snapshot agent — confirm.
CheckpointArtifactVersionAnnotation = "nvidia.com/snapshot-artifact-version"
CheckpointStatusAnnotation = "nvidia.com/snapshot-checkpoint-status"
RestoreStatusAnnotation = "nvidia.com/snapshot-restore-status"
RestoreContainerIDAnnotation = "nvidia.com/snapshot-restore-container-id"
// CheckpointVolumeName is the name of the volume and volume mount injected into restore pods.
CheckpointVolumeName = "checkpoint-storage"
// DefaultCheckpointArtifactVersion is returned by ArtifactVersion for a blank version.
DefaultCheckpointArtifactVersion = "1"
DefaultCheckpointJobTTLSeconds = int32(300)
DefaultSeccompLocalhostProfile = "profiles/block-iouring.json"
// StorageTypePVC is the only storage type accepted by resolveStorageConfig.
StorageTypePVC = "pvc"
// Status values.
// NOTE(review): presumably the values of the checkpoint/restore status annotations above — confirm.
CheckpointStatusCompleted = "completed"
CheckpointStatusFailed = "failed"
RestoreStatusInProgress = "in_progress"
RestoreStatusCompleted = "completed"
RestoreStatusFailed = "failed"
)
// Storage describes where checkpoint artifacts live, as passed between the
// storage resolution helpers and the restore pod builders in this package.
type Storage struct {
// Type is the storage backend; only StorageTypePVC passes resolveStorageConfig.
Type string
// Location is filled in by ResolveCheckpointStorage / ResolveRestoreStorage;
// resolveStorageConfig does not carry over a caller-supplied value.
Location string
// PVCName is the claim name used for the checkpoint volume when non-empty.
PVCName string
// BasePath is the mount path of the checkpoint volume and the prefix of Location.
BasePath string
}
// ArtifactVersion normalizes a checkpoint artifact version: surrounding
// whitespace is trimmed, and a blank result is replaced with
// DefaultCheckpointArtifactVersion.
func ArtifactVersion(version string) string {
	if trimmed := strings.TrimSpace(version); trimmed != "" {
		return trimmed
	}
	return DefaultCheckpointArtifactVersion
}
// ResolveCheckpointStorage validates storage and returns a normalized copy
// whose Location is "<basePath>/<checkpointID>/versions/<version>", where
// version is normalized through ArtifactVersion. checkpointID is used as
// given, without validation. Any error from resolveStorageConfig is returned
// unchanged together with a zero Storage.
func ResolveCheckpointStorage(checkpointID string, version string, storage Storage) (Storage, error) {
	cfg, err := resolveStorageConfig(storage)
	if err != nil {
		return Storage{}, err
	}
	root := strings.TrimRight(cfg.BasePath, "/")
	segments := []string{root, checkpointID, "versions", ArtifactVersion(version)}
	cfg.Location = strings.Join(segments, "/")
	return cfg, nil
}
// ResolveRestoreStorage validates storage and decides where a restore reads
// from. A non-blank location (after trimming whitespace) is used verbatim as
// the resolved Location; otherwise the location is derived from checkpointID
// and version exactly as ResolveCheckpointStorage does. Storage validation
// errors are returned before location is considered.
func ResolveRestoreStorage(checkpointID string, version string, location string, storage Storage) (Storage, error) {
	cfg, err := resolveStorageConfig(storage)
	if err != nil {
		return Storage{}, err
	}
	if explicit := strings.TrimSpace(location); explicit != "" {
		cfg.Location = explicit
		return cfg, nil
	}
	return ResolveCheckpointStorage(checkpointID, version, storage)
}
// ApplyRestoreTargetMetadata rewrites the snapshot labels and annotations of a
// restore target in place.
//
// All snapshot-owned keys are removed first: the checkpoint-source,
// restore-target and checkpoint-id labels, and the artifact-version,
// checkpoint-status, restore-status and restore-container-id annotations.
// When enabled is false nothing else happens. When enabled is true the
// restore-target label is set to "true", the checkpoint-id label is set if
// checkpointID is non-empty, and the artifact-version annotation is set to
// the normalized artifactVersion.
//
// Both maps must be non-nil when enabled is true, because they are written to.
func ApplyRestoreTargetMetadata(labels map[string]string, annotations map[string]string, enabled bool, checkpointID string, artifactVersion string) {
	staleLabels := [...]string{
		CheckpointSourceLabel,
		RestoreTargetLabel,
		CheckpointIDLabel,
	}
	for _, key := range staleLabels {
		delete(labels, key)
	}

	staleAnnotations := [...]string{
		CheckpointArtifactVersionAnnotation,
		CheckpointStatusAnnotation,
		RestoreStatusAnnotation,
		RestoreContainerIDAnnotation,
	}
	for _, key := range staleAnnotations {
		delete(annotations, key)
	}

	if enabled {
		labels[RestoreTargetLabel] = "true"
		if checkpointID != "" {
			labels[CheckpointIDLabel] = checkpointID
		}
		annotations[CheckpointArtifactVersionAnnotation] = ArtifactVersion(artifactVersion)
	}
}
// applyCheckpointSourceMetadata marks labels/annotations as belonging to a
// checkpoint source, in place. The restore-target label is removed, the
// checkpoint-source label is set to "true", the checkpoint-id label is
// replaced by checkpointID (or removed when checkpointID is empty), and the
// artifact-version annotation is set to the normalized artifactVersion.
// Both maps must be non-nil.
func applyCheckpointSourceMetadata(labels map[string]string, annotations map[string]string, checkpointID string, artifactVersion string) {
	for _, key := range [...]string{RestoreTargetLabel, CheckpointIDLabel} {
		delete(labels, key)
	}
	delete(annotations, CheckpointArtifactVersionAnnotation)

	labels[CheckpointSourceLabel] = "true"
	if checkpointID != "" {
		labels[CheckpointIDLabel] = checkpointID
	}
	annotations[CheckpointArtifactVersionAnnotation] = ArtifactVersion(artifactVersion)
}
// resolveStorageConfig validates and normalizes a Storage value.
//
// A blank Type defaults to StorageTypePVC; any other type is rejected. BasePath
// is required after trimming whitespace. The returned value carries the
// normalized Type, the trimmed PVCName and the BasePath with trailing slashes
// removed; Location is intentionally left empty for the caller to fill in.
func resolveStorageConfig(storage Storage) (Storage, error) {
	kind := strings.TrimSpace(storage.Type)
	switch kind {
	case "":
		kind = StorageTypePVC
	case StorageTypePVC:
		// Supported as-is.
	default:
		return Storage{}, fmt.Errorf("checkpoint storage type %q is not supported", kind)
	}

	root := strings.TrimSpace(storage.BasePath)
	if root == "" {
		return Storage{}, fmt.Errorf("checkpoint base path is required")
	}

	normalized := Storage{Type: kind}
	normalized.PVCName = strings.TrimSpace(storage.PVCName)
	normalized.BasePath = strings.TrimRight(root, "/")
	return normalized, nil
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package protocol
import "testing"
// TestApplyRestoreTargetMetadata starts from metadata left over from a checkpoint
// source and checks that enabling restore-target metadata sets the new keys and
// clears the stale ones.
func TestApplyRestoreTargetMetadata(t *testing.T) {
labels := map[string]string{
CheckpointSourceLabel: "true",
CheckpointIDLabel: "old",
}
annotations := map[string]string{
CheckpointArtifactVersionAnnotation: "old",
CheckpointStatusAnnotation: "completed",
RestoreStatusAnnotation: "failed",
RestoreContainerIDAnnotation: "dead-container",
}
ApplyRestoreTargetMetadata(labels, annotations, true, "hash", "2")
// Keys that must be (re)written.
if labels[RestoreTargetLabel] != "true" {
t.Fatalf("expected restore target label, got %#v", labels)
}
if labels[CheckpointIDLabel] != "hash" {
t.Fatalf("expected checkpoint hash label, got %#v", labels)
}
// Keys that must be gone.
if _, ok := labels[CheckpointSourceLabel]; ok {
t.Fatalf("checkpoint source label was not cleared: %#v", labels)
}
if annotations[CheckpointArtifactVersionAnnotation] != "2" {
t.Fatalf("expected checkpoint artifact version annotation, got %#v", annotations)
}
if _, ok := annotations[CheckpointStatusAnnotation]; ok {
t.Fatalf("checkpoint status annotation was not cleared: %#v", annotations)
}
if _, ok := annotations[RestoreStatusAnnotation]; ok {
t.Fatalf("restore status annotation was not cleared: %#v", annotations)
}
if _, ok := annotations[RestoreContainerIDAnnotation]; ok {
t.Fatalf("restore container id annotation was not cleared: %#v", annotations)
}
}
// TestApplyRestoreTargetMetadataDisabledClearsState checks that calling
// ApplyRestoreTargetMetadata with enabled=false removes every snapshot label and
// annotation and writes nothing back.
func TestApplyRestoreTargetMetadataDisabledClearsState(t *testing.T) {
labels := map[string]string{
RestoreTargetLabel: "true",
CheckpointIDLabel: "hash",
}
annotations := map[string]string{
CheckpointArtifactVersionAnnotation: "2",
CheckpointStatusAnnotation: "completed",
RestoreStatusAnnotation: "failed",
RestoreContainerIDAnnotation: "dead-container",
}
ApplyRestoreTargetMetadata(labels, annotations, false, "", "")
if _, ok := labels[RestoreTargetLabel]; ok {
t.Fatalf("restore target label was not cleared: %#v", labels)
}
if _, ok := labels[CheckpointIDLabel]; ok {
t.Fatalf("checkpoint hash label was not cleared: %#v", labels)
}
if _, ok := annotations[CheckpointArtifactVersionAnnotation]; ok {
t.Fatalf("checkpoint artifact version annotation was not cleared: %#v", annotations)
}
if _, ok := annotations[CheckpointStatusAnnotation]; ok {
t.Fatalf("checkpoint status annotation was not cleared: %#v", annotations)
}
if _, ok := annotations[RestoreStatusAnnotation]; ok {
t.Fatalf("restore status annotation was not cleared: %#v", annotations)
}
if _, ok := annotations[RestoreContainerIDAnnotation]; ok {
t.Fatalf("restore container id annotation was not cleared: %#v", annotations)
}
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package protocol
import (
"context"
"fmt"
"strings"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
)
// Names used to locate the snapshot-agent DaemonSet and the checkpoint volume it mounts.
const (
// SnapshotAgentLabelKey / SnapshotAgentLabelValue form the label used to list
// snapshot-agent DaemonSets.
SnapshotAgentLabelKey = "app.kubernetes.io/component"
SnapshotAgentLabelValue = "snapshot-agent"
// SnapshotAgentContainerName is the container whose volume mounts are inspected
// during storage discovery.
SnapshotAgentContainerName = "agent"
// SnapshotAgentVolumeName is the DaemonSet volume that storage discovery expects
// to be PVC-backed.
SnapshotAgentVolumeName = "checkpoints"
// SnapshotAgentLabelSelector is the same label expressed in "key=value" selector form.
SnapshotAgentLabelSelector = SnapshotAgentLabelKey + "=" + SnapshotAgentLabelValue
)
// PodOptions carries the inputs NewRestorePod needs besides the source pod.
type PodOptions struct {
// Namespace is set as the restore pod's namespace.
Namespace string
// CheckpointID is written to the checkpoint-id label when non-empty.
CheckpointID string
// ArtifactVersion is written to the artifact-version annotation
// (a blank value falls back to the default via ArtifactVersion).
ArtifactVersion string
// Storage supplies the PVC name and base path used for the checkpoint volume and mount.
Storage Storage
// SeccompProfile is passed through to PrepareRestorePodSpec.
SeccompProfile string
}
// NewRestorePod derives a restore target pod from pod without mutating it.
//
// The copy receives restore-target metadata for opts.CheckpointID and
// opts.ArtifactVersion, has its first container prepared by
// PrepareRestorePodSpec with isCheckpointReady=true, is placed in
// opts.Namespace and gets restartPolicy Never.
//
// pod must be non-nil and must have at least one container; otherwise the
// function panics.
func NewRestorePod(pod *corev1.Pod, opts PodOptions) *corev1.Pod {
	restorePod := pod.DeepCopy()
	if restorePod.Labels == nil {
		restorePod.Labels = map[string]string{}
	}
	if restorePod.Annotations == nil {
		restorePod.Annotations = map[string]string{}
	}
	ApplyRestoreTargetMetadata(restorePod.Labels, restorePod.Annotations, true, opts.CheckpointID, opts.ArtifactVersion)

	// Only the first container is treated as the restore worker.
	worker := &restorePod.Spec.Containers[0]
	PrepareRestorePodSpec(&restorePod.Spec, worker, opts.Storage, opts.SeccompProfile, true)

	restorePod.Namespace = opts.Namespace
	restorePod.Spec.RestartPolicy = corev1.RestartPolicyNever
	return restorePod
}
// PrepareRestorePodSpec mutates podSpec and container in place so the pod can
// act as a restore target. It applies, in order:
//   - the localhost seccomp profile, when seccompProfile is non-empty;
//   - the checkpoint PVC volume, when storage.PVCName is non-empty;
//   - the checkpoint volume mount at storage.BasePath, when it is non-empty;
//   - when isCheckpointReady is true, a "sleep infinity" placeholder command
//     with args and the startup, liveness and readiness probes cleared.
//
// An empty seccompProfile leaves the pod's security context untouched. This
// matches NewCheckpointJob and ValidateRestorePodSpec, which both treat an
// empty profile as "no profile requested", and avoids emitting a Localhost
// seccomp profile whose localhostProfile is the empty string.
//
// Volume and mount injection are skipped when an entry named
// CheckpointVolumeName already exists, so calling this function repeatedly
// does not duplicate them. podSpec and container are dereferenced without
// nil checks.
func PrepareRestorePodSpec(
	podSpec *corev1.PodSpec,
	container *corev1.Container,
	storage Storage,
	seccompProfile string,
	isCheckpointReady bool,
) {
	if seccompProfile != "" {
		EnsureLocalhostSeccompProfile(podSpec, seccompProfile)
	}
	if storage.PVCName != "" {
		injectCheckpointVolume(podSpec, storage.PVCName)
	}
	if storage.BasePath != "" {
		injectCheckpointVolumeMount(container, storage.BasePath)
	}
	if isCheckpointReady {
		// The original entrypoint and probes are replaced by a placeholder.
		// NOTE(review): presumably the snapshot agent restores the real process
		// into this container afterwards — confirm.
		container.Command = []string{"sleep", "infinity"}
		container.Args = nil
		container.StartupProbe = nil
		container.LivenessProbe = nil
		container.ReadinessProbe = nil
	}
}
// ValidateRestorePodSpec reports whether podSpec already carries everything a
// restore target needs. Checks run in this order and the first failure is
// returned:
//  1. podSpec is non-nil and has exactly one container;
//  2. when storage.PVCName is set, a CheckpointVolumeName volume backed by
//     that claim exists;
//  3. when storage.BasePath is set, the container mounts CheckpointVolumeName
//     at that path;
//  4. when seccompProfile is set, the pod-level seccomp profile is a
//     Localhost profile with exactly that value.
func ValidateRestorePodSpec(
	podSpec *corev1.PodSpec,
	storage Storage,
	seccompProfile string,
) error {
	if podSpec == nil {
		return fmt.Errorf("pod spec is nil")
	}
	if count := len(podSpec.Containers); count != 1 {
		return fmt.Errorf("restore target must have exactly one container, got %d", count)
	}
	worker := &podSpec.Containers[0]

	if claim := storage.PVCName; claim != "" {
		found := false
		for i := range podSpec.Volumes {
			candidate := &podSpec.Volumes[i]
			if candidate.Name != CheckpointVolumeName || candidate.PersistentVolumeClaim == nil {
				continue
			}
			if candidate.PersistentVolumeClaim.ClaimName == claim {
				found = true
				break
			}
		}
		if !found {
			return fmt.Errorf("missing %s volume for PVC %s", CheckpointVolumeName, claim)
		}
	}

	if mountPath := storage.BasePath; mountPath != "" {
		found := false
		for i := range worker.VolumeMounts {
			candidate := &worker.VolumeMounts[i]
			if candidate.Name == CheckpointVolumeName && candidate.MountPath == mountPath {
				found = true
				break
			}
		}
		if !found {
			return fmt.Errorf("missing %s mount at %s", CheckpointVolumeName, mountPath)
		}
	}

	// An empty profile means the caller does not require seccomp validation.
	if seccompProfile == "" {
		return nil
	}
	var current *corev1.SeccompProfile
	if podSpec.SecurityContext != nil {
		current = podSpec.SecurityContext.SeccompProfile
	}
	if current == nil {
		return fmt.Errorf("missing localhost seccomp profile")
	}
	matches := current.Type == corev1.SeccompProfileTypeLocalhost &&
		current.LocalhostProfile != nil &&
		*current.LocalhostProfile == seccompProfile
	if !matches {
		return fmt.Errorf("expected localhost seccomp profile %q", seccompProfile)
	}
	return nil
}
// DiscoverStorageFromDaemonSets derives the checkpoint Storage from the
// snapshot-agent DaemonSets found in namespace.
//
// DaemonSets are inspected in order. For each one, the mount paths of the
// container named SnapshotAgentContainerName are collected (blank paths are
// ignored, trailing slashes are stripped), and the volume named
// SnapshotAgentVolumeName is accepted when it is PVC-backed, has a non-blank
// claim name and is mounted by the agent container at a non-empty path. The
// first match wins.
//
// An error is returned when daemonSets is empty or when no DaemonSet yields a
// match; the latter lists the names of the DaemonSets that were inspected.
func DiscoverStorageFromDaemonSets(namespace string, daemonSets []appsv1.DaemonSet) (Storage, error) {
	if len(daemonSets) == 0 {
		return Storage{}, fmt.Errorf("no snapshot-agent daemonset found in namespace %s", namespace)
	}

	inspected := make([]string, 0, len(daemonSets))
	for i := range daemonSets {
		ds := &daemonSets[i]
		inspected = append(inspected, ds.Name)
		podSpec := &ds.Spec.Template.Spec

		// Volume name -> normalized mount path, taken from the agent container only.
		agentMounts := map[string]string{}
		for c := range podSpec.Containers {
			agent := &podSpec.Containers[c]
			if agent.Name != SnapshotAgentContainerName {
				continue
			}
			for _, m := range agent.VolumeMounts {
				if strings.TrimSpace(m.MountPath) != "" {
					agentMounts[m.Name] = strings.TrimRight(m.MountPath, "/")
				}
			}
		}

		for v := range podSpec.Volumes {
			vol := &podSpec.Volumes[v]
			if vol.Name != SnapshotAgentVolumeName || vol.PersistentVolumeClaim == nil {
				continue
			}
			// A missing map entry reads as "", so one emptiness check covers
			// both "not mounted" and "mounted at an empty path".
			mountPath := agentMounts[vol.Name]
			claim := strings.TrimSpace(vol.PersistentVolumeClaim.ClaimName)
			if mountPath == "" || claim == "" {
				continue
			}
			return Storage{
				Type:     StorageTypePVC,
				PVCName:  claim,
				BasePath: mountPath,
			}, nil
		}
	}

	return Storage{}, fmt.Errorf(
		"snapshot-agent daemonset in %s does not mount a PVC-backed checkpoint volume (%s)",
		namespace,
		strings.Join(inspected, ", "),
	)
}
// PrepareRestorePodSpecForCheckpoint discovers checkpoint storage from the
// snapshot-agent DaemonSets in namespace and then prepares podSpec/container
// as a restore target for the given checkpoint.
//
// It lists DaemonSets labelled SnapshotAgentLabelKey=SnapshotAgentLabelValue
// through reader, derives the storage with DiscoverStorageFromDaemonSets,
// resolves it with ResolveCheckpointStorage and finally calls
// PrepareRestorePodSpec. An error is returned when reader is nil, when the
// list call fails, or when discovery or resolution fails; in those cases
// podSpec and container are left unmodified.
func PrepareRestorePodSpecForCheckpoint(
	ctx context.Context,
	reader ctrlclient.Reader,
	namespace string,
	podSpec *corev1.PodSpec,
	container *corev1.Container,
	checkpointID string,
	artifactVersion string,
	seccompProfile string,
	isCheckpointReady bool,
) error {
	if reader == nil {
		return fmt.Errorf("snapshot client is required")
	}

	var agents appsv1.DaemonSetList
	agentSelector := ctrlclient.MatchingLabels{SnapshotAgentLabelKey: SnapshotAgentLabelValue}
	if err := reader.List(ctx, &agents, ctrlclient.InNamespace(namespace), agentSelector); err != nil {
		return fmt.Errorf("list snapshot-agent daemonsets in %s: %w", namespace, err)
	}

	discovered, err := DiscoverStorageFromDaemonSets(namespace, agents.Items)
	if err != nil {
		return err
	}
	checkpointStorage, err := ResolveCheckpointStorage(checkpointID, artifactVersion, discovered)
	if err != nil {
		return err
	}

	PrepareRestorePodSpec(podSpec, container, checkpointStorage, seccompProfile, isCheckpointReady)
	return nil
}
// injectCheckpointVolume appends a PVC-backed volume named CheckpointVolumeName
// to podSpec, unless a volume with that name already exists. An existing
// volume is left as it is, whatever its source.
func injectCheckpointVolume(podSpec *corev1.PodSpec, pvcName string) {
	for i := range podSpec.Volumes {
		if podSpec.Volumes[i].Name == CheckpointVolumeName {
			return
		}
	}

	checkpointVolume := corev1.Volume{Name: CheckpointVolumeName}
	checkpointVolume.PersistentVolumeClaim = &corev1.PersistentVolumeClaimVolumeSource{
		ClaimName: pvcName,
	}
	podSpec.Volumes = append(podSpec.Volumes, checkpointVolume)
}
// injectCheckpointVolumeMount appends a mount of CheckpointVolumeName at
// basePath to container, unless the container already has a mount with that
// name. An existing mount is left as it is, whatever its path.
func injectCheckpointVolumeMount(container *corev1.Container, basePath string) {
	for i := range container.VolumeMounts {
		if container.VolumeMounts[i].Name == CheckpointVolumeName {
			return
		}
	}

	checkpointMount := corev1.VolumeMount{
		Name:      CheckpointVolumeName,
		MountPath: basePath,
	}
	container.VolumeMounts = append(container.VolumeMounts, checkpointMount)
}
package protocol
import (
"testing"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// TestNewRestorePod builds a restore pod from a single-container worker pod and
// checks identity, restore metadata, the placeholder command, cleared probes,
// the seccomp profile and the injected checkpoint volume and mount.
func TestNewRestorePod(t *testing.T) {
restorePod := NewRestorePod(&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "worker",
Labels: map[string]string{"existing": "label"},
Annotations: map[string]string{"existing": "annotation"},
},
Spec: corev1.PodSpec{
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{{
Name: "main",
Image: "test:latest",
Command: []string{"python3", "-m", "dynamo.vllm"},
Args: []string{"--model", "Qwen"},
ReadinessProbe: &corev1.Probe{},
LivenessProbe: &corev1.Probe{},
StartupProbe: &corev1.Probe{},
}},
},
}, PodOptions{
Namespace: "test-ns",
CheckpointID: "hash",
ArtifactVersion: "2",
Storage: Storage{
Type: StorageTypePVC,
PVCName: "snapshot-pvc",
BasePath: "/checkpoints",
},
SeccompProfile: DefaultSeccompLocalhostProfile,
})
// The pod keeps its name but moves to the requested namespace.
if restorePod.Name != "worker" || restorePod.Namespace != "test-ns" {
t.Fatalf("unexpected restore pod identity: %#v", restorePod.ObjectMeta)
}
// Restore-target metadata.
if restorePod.Labels[RestoreTargetLabel] != "true" {
t.Fatalf("expected restore target label: %#v", restorePod.Labels)
}
if restorePod.Labels[CheckpointIDLabel] != "hash" {
t.Fatalf("expected checkpoint id label: %#v", restorePod.Labels)
}
if restorePod.Annotations[CheckpointArtifactVersionAnnotation] != "2" {
t.Fatalf("expected checkpoint artifact version annotation: %#v", restorePod.Annotations)
}
if restorePod.Spec.RestartPolicy != corev1.RestartPolicyNever {
t.Fatalf("expected restartPolicy Never, got %#v", restorePod.Spec.RestartPolicy)
}
// The worker entrypoint is replaced by the placeholder and its probes are removed.
if len(restorePod.Spec.Containers[0].Command) != 2 || restorePod.Spec.Containers[0].Command[0] != "sleep" || restorePod.Spec.Containers[0].Command[1] != "infinity" {
t.Fatalf("expected placeholder command, got %#v", restorePod.Spec.Containers[0].Command)
}
if restorePod.Spec.Containers[0].Args != nil {
t.Fatalf("expected restore args to be cleared: %#v", restorePod.Spec.Containers[0].Args)
}
if restorePod.Spec.Containers[0].ReadinessProbe != nil {
t.Fatalf("expected readiness probe to be cleared: %#v", restorePod.Spec.Containers[0].ReadinessProbe)
}
if restorePod.Spec.Containers[0].LivenessProbe != nil {
t.Fatalf("expected liveness probe to be cleared: %#v", restorePod.Spec.Containers[0].LivenessProbe)
}
if restorePod.Spec.Containers[0].StartupProbe != nil {
t.Fatalf("expected startup probe to be cleared: %#v", restorePod.Spec.Containers[0].StartupProbe)
}
// Seccomp profile and checkpoint storage wiring.
if restorePod.Spec.SecurityContext == nil || restorePod.Spec.SecurityContext.SeccompProfile == nil {
t.Fatalf("expected seccomp profile to be injected: %#v", restorePod.Spec.SecurityContext)
}
if len(restorePod.Spec.Volumes) != 1 {
t.Fatalf("expected checkpoint volume, got %#v", restorePod.Spec.Volumes)
}
if len(restorePod.Spec.Containers[0].VolumeMounts) != 1 {
t.Fatalf("expected checkpoint mount, got %#v", restorePod.Spec.Containers[0].VolumeMounts)
}
}
// TestPrepareRestorePodSpec applies PrepareRestorePodSpec twice to the same spec
// and container and checks that the result is still a single volume and mount,
// with the placeholder command set and args and probes cleared.
func TestPrepareRestorePodSpec(t *testing.T) {
podSpec := corev1.PodSpec{}
container := corev1.Container{
Command: []string{"python3", "-m", "dynamo.vllm"},
Args: []string{"--model", "Qwen"},
ReadinessProbe: &corev1.Probe{},
LivenessProbe: &corev1.Probe{},
StartupProbe: &corev1.Probe{},
}
storage := Storage{
Type: StorageTypePVC,
PVCName: "snapshot-pvc",
BasePath: "/checkpoints",
}
// The second call exercises idempotence: nothing may be duplicated.
PrepareRestorePodSpec(&podSpec, &container, storage, DefaultSeccompLocalhostProfile, true)
PrepareRestorePodSpec(&podSpec, &container, storage, DefaultSeccompLocalhostProfile, true)
if podSpec.SecurityContext == nil || podSpec.SecurityContext.SeccompProfile == nil {
t.Fatalf("expected seccomp profile to be injected: %#v", podSpec.SecurityContext)
}
if len(podSpec.Volumes) != 1 {
t.Fatalf("expected checkpoint volume, got %#v", podSpec.Volumes)
}
if len(container.VolumeMounts) != 1 {
t.Fatalf("expected checkpoint mount, got %#v", container.VolumeMounts)
}
if len(container.Command) != 2 || container.Command[0] != "sleep" || container.Command[1] != "infinity" {
t.Fatalf("expected placeholder command, got %#v", container.Command)
}
if container.Args != nil {
t.Fatalf("expected restore args to be cleared: %#v", container.Args)
}
if container.ReadinessProbe != nil || container.LivenessProbe != nil || container.StartupProbe != nil {
t.Fatalf("expected probes to be cleared: %#v %#v %#v", container.ReadinessProbe, container.LivenessProbe, container.StartupProbe)
}
}
// TestValidateRestorePodSpec validates a fully prepared pod spec and then removes
// one requirement at a time (volume, mount, security context) to check the exact
// error reported for each.
func TestValidateRestorePodSpec(t *testing.T) {
profile := DefaultSeccompLocalhostProfile
podSpec := &corev1.PodSpec{
SecurityContext: &corev1.PodSecurityContext{
SeccompProfile: &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeLocalhost,
LocalhostProfile: &profile,
},
},
Volumes: []corev1.Volume{{
Name: CheckpointVolumeName,
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "snapshot-pvc",
},
},
}},
Containers: []corev1.Container{{
Name: "main",
VolumeMounts: []corev1.VolumeMount{{
Name: CheckpointVolumeName,
MountPath: "/checkpoints",
}},
}},
}
storage := Storage{
Type: StorageTypePVC,
PVCName: "snapshot-pvc",
BasePath: "/checkpoints",
}
// Baseline: the complete spec is valid.
if err := ValidateRestorePodSpec(podSpec, storage, DefaultSeccompLocalhostProfile); err != nil {
t.Fatalf("expected restore pod spec to be valid, got %v", err)
}
// Missing checkpoint volume.
badSpec := podSpec.DeepCopy()
badSpec.Volumes = nil
if err := ValidateRestorePodSpec(badSpec, storage, DefaultSeccompLocalhostProfile); err == nil || err.Error() != "missing checkpoint-storage volume for PVC snapshot-pvc" {
t.Fatalf("expected missing volume error, got %v", err)
}
// Missing checkpoint mount.
badSpec = podSpec.DeepCopy()
badSpec.Containers[0].VolumeMounts = nil
if err := ValidateRestorePodSpec(badSpec, storage, DefaultSeccompLocalhostProfile); err == nil || err.Error() != "missing checkpoint-storage mount at /checkpoints" {
t.Fatalf("expected missing mount error, got %v", err)
}
// Missing security context, hence no seccomp profile.
badSpec = podSpec.DeepCopy()
badSpec.SecurityContext = nil
if err := ValidateRestorePodSpec(badSpec, storage, DefaultSeccompLocalhostProfile); err == nil || err.Error() != "missing localhost seccomp profile" {
t.Fatalf("expected missing seccomp error, got %v", err)
}
}
// TestValidateRestorePodSpecRequiresExactlyOneContainer checks that a pod spec
// which is otherwise valid is rejected when it has a second container.
func TestValidateRestorePodSpecRequiresExactlyOneContainer(t *testing.T) {
profile := DefaultSeccompLocalhostProfile
podSpec := &corev1.PodSpec{
SecurityContext: &corev1.PodSecurityContext{
SeccompProfile: &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeLocalhost,
LocalhostProfile: &profile,
},
},
Volumes: []corev1.Volume{{
Name: CheckpointVolumeName,
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "snapshot-pvc",
},
},
}},
Containers: []corev1.Container{
{
Name: "worker",
VolumeMounts: []corev1.VolumeMount{{
Name: CheckpointVolumeName,
MountPath: "/checkpoints",
}},
},
// The extra container is what makes the spec invalid.
{Name: "sidecar"},
},
}
storage := Storage{
Type: StorageTypePVC,
PVCName: "snapshot-pvc",
BasePath: "/checkpoints",
}
if err := ValidateRestorePodSpec(podSpec, storage, DefaultSeccompLocalhostProfile); err == nil || err.Error() != "restore target must have exactly one container, got 2" {
t.Fatalf("expected multi-container restore target to be rejected, got %v", err)
}
}
func TestDiscoverStorageFromDaemonSetsUsesCheckpointsVolume(t *testing.T) {
daemonSet := appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{Name: "snapshot-agent", Namespace: "test-ns"},
Spec: appsv1.DaemonSetSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{{
Name: SnapshotAgentContainerName,
VolumeMounts: []corev1.VolumeMount{
{Name: "cache", MountPath: "/cache"},
{Name: SnapshotAgentVolumeName, MountPath: "/checkpoints"},
},
}},
Volumes: []corev1.Volume{
{
Name: "cache",
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: "cache-pvc"},
},
},
{
Name: SnapshotAgentVolumeName,
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: "snapshot-pvc"},
},
},
},
},
},
},
}
storage, err := DiscoverStorageFromDaemonSets("test-ns", []appsv1.DaemonSet{daemonSet})
if err != nil {
t.Fatalf("expected daemonset storage discovery to succeed, got %v", err)
}
if storage.PVCName != "snapshot-pvc" || storage.BasePath != "/checkpoints" {
t.Fatalf("expected snapshot PVC discovery, got %#v", storage)
}
}
......@@ -265,7 +265,7 @@ _Appears in:_
| `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm for the checkpoint Job pod.<br />When omitted, checkpoint Jobs use the same default 8Gi tmpfs as Dynamo components. | | Optional: \{\} <br /> |
| `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Minimum: 1 <br />Optional: \{\} <br /> |
| `backoffLimit` _integer_ | Deprecated: BackoffLimit is ignored. Checkpoint Jobs never retry. | | Minimum: 0 <br />Optional: \{\} <br /> |
| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Minimum: 0 <br />Optional: \{\} <br /> |
| `ttlSecondsAfterFinished` _integer_ | Deprecated: TTLSecondsAfterFinished is ignored. Checkpoint Jobs use a fixed<br />300 second TTL. | | Minimum: 0 <br />Optional: \{\} <br /> |
#### DynamoCheckpointPhase
......@@ -284,7 +284,7 @@ _Appears in:_
| --- | --- |
| `Pending` | DynamoCheckpointPhasePending indicates the checkpoint CR has been created but the Job has not started<br /> |
| `Creating` | DynamoCheckpointPhaseCreating indicates the checkpoint Job is running<br /> |
| `Ready` | DynamoCheckpointPhaseReady indicates the checkpoint tar file is available on the PVC<br /> |
| `Ready` | DynamoCheckpointPhaseReady indicates the checkpoint artifact is available<br /> |
| `Failed` | DynamoCheckpointPhaseFailed indicates the checkpoint creation failed<br /> |
......@@ -320,10 +320,10 @@ _Appears in:_
| --- | --- | --- | --- |
| `phase` _[DynamoCheckpointPhase](#dynamocheckpointphase)_ | Phase represents the current phase of the checkpoint lifecycle | | Enum: [Pending Creating Ready Failed] <br />Optional: \{\} <br /> |
| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity<br />This hash is used to identify equivalent checkpoints | | Optional: \{\} <br /> |
| `location` _string_ | Location is the full URI/path to the checkpoint in the storage backend<br />For PVC: same as TarPath (e.g., /checkpoints/\{hash\}.tar)<br />For S3: s3://bucket/prefix/\{hash\}.tar<br />For OCI: oci://registry/repo:\{hash\} | | Optional: \{\} <br /> |
| `storageType` _[DynamoCheckpointStorageType](#dynamocheckpointstoragetype)_ | StorageType indicates the storage backend type used for this checkpoint | | Enum: [pvc s3 oci] <br />Optional: \{\} <br /> |
| `location` _string_ | Deprecated: Location is ignored and no longer populated. It is retained<br />only so older objects continue to validate. | | Optional: \{\} <br /> |
| `storageType` _[DynamoCheckpointStorageType](#dynamocheckpointstoragetype)_ | Deprecated: StorageType is ignored and no longer populated. It is retained<br />only so older objects continue to validate. | | Enum: [pvc s3 oci] <br />Optional: \{\} <br /> |
| `jobName` _string_ | JobName is the name of the checkpoint creation Job | | Optional: \{\} <br /> |
| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint tar was created | | Optional: \{\} <br /> |
| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint became ready | | Optional: \{\} <br /> |
| `message` _string_ | Message provides additional information about the current state | | Optional: \{\} <br /> |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | DEPRECATED: Conditions are deprecated. Use status.phase instead. | | Optional: \{\} <br /> |
......@@ -332,7 +332,9 @@ _Appears in:_
_Underlying type:_ _string_
DynamoCheckpointStorageType defines the supported storage backends for checkpoints
Deprecated: StorageType is retained for compatibility with older
DynamoCheckpoint status consumers. The current checkpoint flow publishes
PVC-backed artifacts discovered from the snapshot-agent DaemonSet.
_Validation:_
- Enum: [pvc s3 oci]
......@@ -1716,14 +1718,15 @@ _Appears in:_
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled indicates if checkpoint functionality is enabled | | |
| `readyForCheckpointFilePath` _string_ | ReadyForCheckpointFilePath signals model readiness for checkpoint jobs | /tmp/ready-for-checkpoint | |
| `storage` _[CheckpointStorageConfiguration](#checkpointstorageconfiguration)_ | Storage holds storage backend configuration | | |
| `storage` _[CheckpointStorageConfiguration](#checkpointstorageconfiguration)_ | Deprecated: Storage is retained for compatibility and ignored by the<br />current snapshot flow. Snapshot storage is discovered from the<br />snapshot-agent DaemonSet instead. | | |
#### CheckpointOCIConfig
CheckpointOCIConfig holds OCI registry storage configuration.
Deprecated: CheckpointOCIConfig is retained for compatibility and ignored by
the current snapshot flow.
......@@ -1732,15 +1735,16 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `uri` _string_ | URI is the OCI URI (oci://registry/repository) | | |
| `credentialsSecretRef` _string_ | CredentialsSecretRef is the name of the docker config secret | | |
| `uri` _string_ | URI is the legacy OCI URI (oci://registry/repository). | | |
| `credentialsSecretRef` _string_ | CredentialsSecretRef is the legacy docker config secret name. | | |
#### CheckpointPVCConfig
CheckpointPVCConfig holds PVC storage configuration.
Deprecated: CheckpointPVCConfig is retained for compatibility and ignored by
the current snapshot flow.
......@@ -1749,15 +1753,16 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `pvcName` _string_ | PVCName is the name of the PVC | snapshot-pvc | |
| `basePath` _string_ | BasePath is the base directory within the PVC | /checkpoints | |
| `pvcName` _string_ | PVCName is the legacy PVC name. | | |
| `basePath` _string_ | BasePath is the legacy base directory within the PVC. | | |
#### CheckpointS3Config
CheckpointS3Config holds S3 storage configuration.
Deprecated: CheckpointS3Config is retained for compatibility and ignored by
the current snapshot flow.
......@@ -1766,15 +1771,16 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `uri` _string_ | URI is the S3 URI (s3://[endpoint/]bucket/prefix) | | |
| `credentialsSecretRef` _string_ | CredentialsSecretRef is the name of the credentials secret | | |
| `uri` _string_ | URI is the legacy S3 URI (s3://[endpoint/]bucket/prefix). | | |
| `credentialsSecretRef` _string_ | CredentialsSecretRef is the legacy credentials secret name. | | |
#### CheckpointStorageConfiguration
CheckpointStorageConfiguration holds storage backend configuration for checkpoints.
Deprecated: CheckpointStorageConfiguration is retained for compatibility and
ignored by the current snapshot flow.
......@@ -1783,10 +1789,10 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `type` _string_ | Type is the storage backend type: pvc, s3, or oci | pvc | |
| `pvc` _[CheckpointPVCConfig](#checkpointpvcconfig)_ | PVC configuration (used when Type=pvc) | | |
| `s3` _[CheckpointS3Config](#checkpoints3config)_ | S3 configuration (used when Type=s3) | | |
| `oci` _[CheckpointOCIConfig](#checkpointociconfig)_ | OCI configuration (used when Type=oci) | | |
| `type` _string_ | Type is the legacy storage backend type: pvc, s3, or oci. | | |
| `pvc` _[CheckpointPVCConfig](#checkpointpvcconfig)_ | PVC configuration for legacy pvc-based settings. | | |
| `s3` _[CheckpointS3Config](#checkpoints3config)_ | S3 configuration for legacy s3-based settings. | | |
| `oci` _[CheckpointOCIConfig](#checkpointociconfig)_ | OCI configuration for legacy oci-based settings. | | |
#### DiscoveryBackend
......
......@@ -4,32 +4,38 @@
title: Snapshot
---
> ⚠️ **Experimental Feature**: Dynamo Snapshot is currently in **preview** and may only be functional in some k8s cluster setups. The Dynamo Snapshot DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.
> ⚠️ **Experimental Feature**: Dynamo Snapshot is currently in preview and may only be functional in some cluster setups. The `snapshot-agent` DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.
**Dynamo Snapshot** is an experimental infrastructure for fast-starting GPU applications in Kubernetes using CRIU (Checkpoint/Restore in User-space) and NVIDIA's cuda-checkpoint utility. Dynamo Snapshot dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on-demand.
**Dynamo Snapshot** is infrastructure for fast-starting GPU applications in Kubernetes using CRIU (Checkpoint/Restore in Userspace) and NVIDIA's `cuda-checkpoint` utility. The usual flow is:
1. start a worker once and checkpoint its initialized state
2. store that checkpoint on a namespace-local snapshot volume
3. restore later workers from that checkpoint instead of cold-starting again
| Startup Type | Time | What Happens |
|--------------|------|--------------|
| **Cold Start** | ~1 min | Download model, load to GPU, initialize engine |
| **Warm Start** (restore from checkpoint) | ~ 10 sec | Restore from a ready checkpoint directory |
| **Warm Start** (restore from checkpoint) | ~10 sec | Restore from a ready checkpoint directory |
> ⚠️ Restore time may vary depending on cluster configuration (storage bandwidth, GPU model, etc.)
> ⚠️ Restore time depends on storage bandwidth, GPU model, and whether the restore stays on the same node.
## Prerequisites
- Dynamo Platform/Operator installed on a k8s cluster with **x86_64 (amd64)** GPU nodes
- NVIDIA driver 580.xx or newer on the target GPU nodes
- `ReadWriteMany` storage if you need cross-node restore
- vLLM or SGLang backend (TensorRT-LLM is not supported yet)
- Security clearance to run a privileged DaemonSet
- x86_64 (`amd64`) GPU nodes
- NVIDIA driver 580.xx or newer on the target GPU nodes (590.xx or newer if testing multi-GPU snapshots)
- vLLM or SGLang backend today
- `ReadWriteMany` storage for cross-node restore
## Quick Start
## Quick Start via `DynamoCheckpoint` CR
This guide assumes a normal Dynamo deployment workflow is already present on your Kubernetes cluster.
1. Build a placeholder image
2. Install the snapshot chart
3. Create a `DynamoCheckpoint` and wait for it to become ready
4. Deploy a `DynamoGraphDeployment` that restores from the corresponding `checkpointRef`
### 1. Build and push a placeholder image
Snapshot-enabled workers must use a placeholder image that wraps the normal runtime image with the restore tooling. If you do not already have one, build it with the snapshot placeholder target and push it to a registry your cluster can pull from:
Snapshot-enabled workers must use a placeholder image that wraps the normal runtime image with restore tooling. If you do not already have one, build it and push it to a registry your cluster can pull from:
```bash
export RUNTIME_IMAGE=registry.example.com/dynamo/vllm-runtime:1.0.0
......@@ -45,7 +51,7 @@ make docker-push-placeholder \
PLACEHOLDER_IMG="${PLACEHOLDER_IMAGE}"
```
This flow is defined in [deploy/snapshot/Makefile](https://github.com/ai-dynamo/dynamo/blob/main/deploy/snapshot/Makefile) and [deploy/snapshot/Dockerfile](https://github.com/ai-dynamo/dynamo/blob/main/deploy/snapshot/Dockerfile). The placeholder image preserves the base runtime entrypoint and command contract, and adds the CRIU, `cuda-checkpoint`, and `nsrestore` tooling needed for restore.
The placeholder image preserves the normal runtime entrypoint/command contract and adds the `criu`, `cuda-checkpoint`, and `nsrestore` tooling needed for checkpoint and restore.
To build either snapshot image against a custom CRIU fork or ref, pass
`CRIU_REPO` and `CRIU_REF` through `make`. If they are unset, the Dockerfile
......@@ -66,17 +72,12 @@ make docker-build-placeholder \
### 2. Enable checkpointing in the platform and verify it
Whether you are installing or upgrading `dynamo-platform`, the operator must have checkpointing enabled and must point at the same storage that the snapshot chart will use:
Whether you are installing or upgrading `dynamo-platform`, the operator only needs checkpointing enabled:
```yaml
dynamo-operator:
checkpoint:
enabled: true
storage:
type: pvc
pvc:
pvcName: snapshot-pvc
basePath: /checkpoints
```
If the platform is already installed, verify that the operator config contains the checkpoint block:
......@@ -90,11 +91,9 @@ kubectl get configmap "${OPERATOR_CONFIG}" -n "${PLATFORM_NAMESPACE}" \
-o jsonpath='{.data.config\.yaml}' | sed -n '/^checkpoint:/,/^[^[:space:]]/p'
```
Verify that the rendered config includes `enabled: true` and the same PVC name and base path you plan to use for the snapshot chart.
For the full platform/operator configuration surface, see [deploy/helm/charts/platform/README.md](https://github.com/ai-dynamo/dynamo/blob/main/deploy/helm/charts/platform/README.md) and [deploy/helm/charts/platform/components/operator/values.yaml](https://github.com/ai-dynamo/dynamo/blob/main/deploy/helm/charts/platform/components/operator/values.yaml).
Verify that the rendered config includes `enabled: true`.
### 3. Install the snapshot chart
### 3. Install the snapshot chart in the workload namespace
```bash
helm upgrade --install snapshot ./deploy/helm/charts/snapshot \
......@@ -103,28 +102,84 @@ helm upgrade --install snapshot ./deploy/helm/charts/snapshot \
--set storage.pvc.create=true
```
Cross-node restore requires `ReadWriteMany` storage. The chart defaults to that mode.
Cross-node restore requires shared `ReadWriteMany` storage. The chart defaults to that mode. If your cluster does not have a default storage class, also set `storage.pvc.storageClass`.
For better restore times, use a fast `ReadWriteMany` StorageClass for the checkpoint PVC. If you are reusing an existing checkpoint PVC, do not set `storage.pvc.create=true`; install the chart with `storage.pvc.create=false` and point `storage.pvc.name` at the existing PVC instead.
If you are reusing an existing checkpoint PVC, do not set `storage.pvc.create=true`; install the chart with `storage.pvc.create=false` and set `storage.pvc.name` instead.
Verify that the PVC and DaemonSet are ready:
```bash
kubectl get pvc snapshot-pvc -n ${NAMESPACE}
kubectl rollout status daemonset/snapshot-agent -n ${NAMESPACE}
kubectl get pods -n ${NAMESPACE} -l app.kubernetes.io/component=snapshot-agent -o wide
```
### 4. Create a `DynamoCheckpoint`
The checkpoint Job pod template should match the worker container you want to checkpoint. For the snapshot flow, the important parts are the checkpoint identity, the first container in `spec.containers`, and the placeholder image; the rest of the pod template should mirror your normal worker config.
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoCheckpoint
metadata:
name: qwen3-06b-bf16
spec:
identity:
model: Qwen/Qwen3-0.6B
backendFramework: vllm
tensorParallelSize: 1
dtype: bfloat16
maxModelLen: 2048
job:
activeDeadlineSeconds: 3600
podTemplateSpec:
spec:
...
containers:
- name: worker
image: registry.example.com/dynamo/vllm-placeholder:1.0.0
...
```
For a full working example, see [deploy/operator/config/samples/nvidia.com_v1alpha1_dynamocheckpoint.yaml](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/config/samples/nvidia.com_v1alpha1_dynamocheckpoint.yaml).
Apply it:
```bash
kubectl apply -f qwen3-checkpoint.yaml -n ${NAMESPACE}
```
### 5. Wait for the checkpoint to become ready
```bash
kubectl get dckpt -n ${NAMESPACE} \
-o custom-columns=NAME:.metadata.name,HASH:.status.identityHash,PHASE:.status.phase
kubectl wait \
--for=jsonpath='{.status.phase}'=Ready \
dynamocheckpoint/qwen3-06b-bf16 \
-n ${NAMESPACE} \
--timeout=30m
```
For the full snapshot chart configuration surface, see [deploy/helm/charts/snapshot/README.md](https://github.com/ai-dynamo/dynamo/blob/main/deploy/helm/charts/snapshot/README.md) and [deploy/helm/charts/snapshot/values.yaml](https://github.com/ai-dynamo/dynamo/blob/main/deploy/helm/charts/snapshot/values.yaml).
The useful status fields are:
### 4. Apply a snapshot-compatible `DynamoGraphDeployment`
- `status.phase`: high-level lifecycle (`Pending`, `Creating`, `Ready`, `Failed`)
- `status.identityHash`: deterministic hash of `spec.identity`
- `status.jobName`: checkpoint Job name
- `status.createdAt`: timestamp recorded when the checkpoint became ready
- `status.message`: progress or failure detail when available
This example is adapted from [examples/backends/vllm/deploy/agg.yaml](https://github.com/ai-dynamo/dynamo/blob/main/examples/backends/vllm/deploy/agg.yaml). The worker must use the placeholder image from step 1, and the checkpoint identity must describe the runtime state you want to reuse.
### 6. Deploy a `DynamoGraphDeployment` that restores from `checkpointRef`
Once the checkpoint is `Ready`, restore a worker from it explicitly:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-snapshot-demo
name: vllm-checkpointref-demo
spec:
services:
Frontend:
......@@ -137,133 +192,139 @@ spec:
VllmDecodeWorker:
componentType: worker
replicas: 1
resources:
limits:
gpu: "1"
readinessProbe:
httpGet:
path: /live
port: system
periodSeconds: 1
timeoutSeconds: 4
failureThreshold: 3
checkpoint:
enabled: true
mode: Auto
identity:
model: Qwen/Qwen3-0.6B
backendFramework: vllm
checkpointRef: qwen3-06b-bf16
extraPodSpec:
mainContainer:
image: registry.example.com/dynamo/vllm-placeholder:1.0.0
command:
- python3
- -m
- dynamo.vllm
args:
- --model
- Qwen/Qwen3-0.6B
env:
- name: NCCL_DEBUG
value: ERROR
- name: TORCH_CPP_LOG_LEVEL
value: ERROR
- name: TORCH_DISTRIBUTED_DEBUG
value: "OFF"
...
...
```
For SGLang, use `dynamo.sglang`, an SGLang placeholder image, `backendFramework: sglang`, and the matching CLI flags.
Apply the manifest:
Apply it:
```bash
kubectl apply -f vllm-snapshot-demo.yaml -n ${NAMESPACE}
kubectl apply -f vllm-checkpointref-demo.yaml -n ${NAMESPACE}
kubectl get pods -n ${NAMESPACE} -w
```
On the first rollout, the worker cold-starts, the operator resolves the checkpoint identity hash, and the checkpoint Job writes a new checkpoint directory into `snapshot-pvc`.
The `VllmDecodeWorker` pod should restore from the ready checkpoint instead of creating a new one.
### 5. Wait for the checkpoint to become ready
## DGD Auto Flow
Auto mode resolves checkpoints by identity hash. It may create `checkpoint-<hash>` or reuse an existing checkpoint with a different CR name. For the sample identity above, the hash is `73e74442beb109ed`:
`checkpointRef` is the most explicit path. `mode: Auto` is the higher-level path: the operator computes the checkpoint identity hash, looks for an equivalent `DynamoCheckpoint`, and creates one only when no matching checkpoint exists. If a `DynamoCheckpoint` already exists with the same identity, Auto mode reuses it. If no matching checkpoint exists yet, the first worker cold-starts and the operator creates the checkpoint in the background.
```bash
kubectl get dckpt -n ${NAMESPACE}
```yaml
checkpoint:
enabled: true
mode: Auto
identity:
model: Qwen/Qwen3-0.6B
backendFramework: vllm
tensorParallelSize: 1
dtype: bfloat16
maxModelLen: 2048
```
CKPT_NAME=$(kubectl get dckpt -n ${NAMESPACE} \
-l nvidia.com/snapshot-checkpoint-hash=73e74442beb109ed \
-o jsonpath='{.items[0].metadata.name}')
kubectl wait \
--for=jsonpath='{.status.phase}'=Ready \
"dynamocheckpoint/${CKPT_NAME}" \
-n ${NAMESPACE} \
--timeout=5m
Inside a `DynamoGraphDeployment`, it looks like this:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-auto-demo
spec:
services:
Frontend:
componentType: frontend
replicas: 1
extraPodSpec:
mainContainer:
image: registry.example.com/dynamo/vllm-runtime:1.0.0
VllmDecodeWorker:
componentType: worker
replicas: 1
checkpoint:
enabled: true
mode: Auto
identity:
model: Qwen/Qwen3-0.6B
backendFramework: vllm
tensorParallelSize: 1
dtype: bfloat16
maxModelLen: 2048
extraPodSpec:
mainContainer:
image: registry.example.com/dynamo/vllm-placeholder:1.0.0
...
...
```
If you change the checkpoint identity, the hash changes and so does the checkpoint selected by Auto mode.
Useful inspection commands:
```bash
kubectl get dgd vllm-auto-demo -n ${NAMESPACE} \
-o jsonpath='{.status.checkpoints.VllmDecodeWorker.checkpointName}{"\n"}{.status.checkpoints.VllmDecodeWorker.identityHash}{"\n"}{.status.checkpoints.VllmDecodeWorker.ready}{"\n"}'
### 6. Trigger restore
kubectl get dckpt -n ${NAMESPACE}
```
Once the checkpoint is ready, scale the worker replicas from `1` to `2`:
If you want to force a new restore after the checkpoint becomes ready, scale the worker:
```bash
kubectl patch dgd vllm-snapshot-demo -n ${NAMESPACE} --type=merge \
kubectl patch dgd vllm-auto-demo -n ${NAMESPACE} --type=merge \
-p '{"spec":{"services":{"VllmDecodeWorker":{"replicas":2}}}}'
```
New worker pods for `VllmDecodeWorker` will restore from the ready checkpoint automatically.
## Lower-Level Testing With `snapshotctl`
## Checkpoint Configuration
You can checkpoint and restore pods without the Dynamo operator by using the lower-level `snapshotctl` utility. The snapshot Helm chart must still be installed, and a `snapshot-agent` DaemonSet with the checkpoint PVC mounted must be running in the same namespace.
### Auto Mode (Recommended)
`snapshotctl` is intended for lower-level debugging and validation workflows, not as the primary user-facing checkpoint interface. For command details and manifest requirements, see [deploy/snapshot/cmd/snapshotctl/README.md](../../deploy/snapshot/cmd/snapshotctl/README.md).
The operator computes the checkpoint identity hash, looks up an existing `DynamoCheckpoint` by that hash, and creates a new `DynamoCheckpoint` only when no matching checkpoint already exists:
### Checkpoint from a worker pod manifest
```yaml
checkpoint:
enabled: true
mode: Auto
identity:
model: "meta-llama/Llama-3-8B"
backendFramework: "vllm" # or "sglang"
tensorParallelSize: 1
dtype: "bfloat16"
maxModelLen: 4096
```bash
snapshotctl checkpoint \
--manifest ./worker-pod.yaml \
--namespace ${NAMESPACE}
```
The `DynamoGraphDeployment` mirrors checkpoint resolution state under `.status.checkpoints`, including the resolved checkpoint CR name, identity hash, and whether the checkpoint was visible to the worker when it started:
The checkpoint manifest must be for a pod, contain exactly one worker container, and use a placeholder image.
If you do not pass `--checkpoint-id`, `snapshotctl` generates one and prints it:
```bash
kubectl get dgd vllm-snapshot-demo -n ${NAMESPACE} \
-o jsonpath='{.status.checkpoints.VllmDecodeWorker.checkpointName}{"\n"}{.status.checkpoints.VllmDecodeWorker.identityHash}{"\n"}'
```text
status=completed
namespace=...
name=...
checkpoint_job=...
checkpoint_id=manual-snapshot-...
checkpoint_location=/checkpoints/...
```
### Manual Management and `checkpointRef`
### Restore from a worker pod manifest
Use `checkpointRef` when you want a service to restore from a specific `DynamoCheckpoint` CR:
```yaml
checkpoint:
enabled: true
checkpointRef: "qwen3-06b-bf16"
```bash
snapshotctl restore \
--manifest ./worker-pod.yaml \
--namespace ${NAMESPACE} \
--checkpoint-id manual-snapshot-...
```
This is useful when:
- You want to **pre-warm checkpoints** before creating DGDs
- You want **explicit control** over which checkpoint to use
`checkpointRef` resolves by `DynamoCheckpoint.metadata.name`. Use a readable CR name when you want an explicit checkpoint that operators can reference directly.
This creates a new restore pod from the manifest and waits for the restore annotation to reach `completed`.
If you are managing checkpoint CRs yourself, set `mode: Manual` on the service to prevent the operator from creating a new `DynamoCheckpoint` when identity-based lookup does not find one.
### Restore an existing pod in place
```bash
# Check checkpoint status by CR name
kubectl get dynamocheckpoint qwen3-06b-bf16 -n ${NAMESPACE}
# Now create DGD referencing it
kubectl apply -f my-dgd.yaml -n ${NAMESPACE}
snapshotctl restore \
--pod existing-restore-target \
--namespace ${NAMESPACE} \
--checkpoint-id manual-snapshot-...
```
`mode: Auto` still resolves checkpoints by identity hash. The operator backfills `status.identityHash` and the `nvidia.com/snapshot-checkpoint-hash` label on each `DynamoCheckpoint` so auto lookup and uniqueness checks do not depend on the CR name.
This patches restore metadata onto an existing pod that is already snapshot-compatible.
## Checkpoint Identity
......@@ -274,215 +335,110 @@ Checkpoints are uniquely identified by a **16-character SHA256 hash** (64 bits)
| `model` | ✓ | ✓ | `meta-llama/Llama-3-8B` |
| `backendFramework` | ✓ | ✓ | `sglang`, `vllm` |
| `dynamoVersion` | | ✓ | `0.9.0`, `1.0.0` |
| `tensorParallelSize` | | ✓ | `1`, `2`, `4`, `8` (default: 1) |
| `pipelineParallelSize` | | ✓ | `1`, `2` (default: 1) |
| `tensorParallelSize` | | ✓ | `1`, `2`, `4`, `8` |
| `pipelineParallelSize` | | ✓ | `1`, `2` |
| `dtype` | | ✓ | `float16`, `bfloat16`, `fp8` |
| `maxModelLen` | | ✓ | `4096`, `8192` |
| `extraParameters` | | ✓ | Custom key-value pairs |
**Not included in hash** (don't invalidate checkpoint):
- `replicas`
- `nodeSelector`, `affinity`, `tolerations`
- `resources` (requests/limits)
- Logging/observability config
| `extraParameters` | | ✓ | custom key-value pairs |
**Example with all fields:**
```yaml
checkpoint:
enabled: true
mode: Auto
identity:
model: "meta-llama/Llama-3-8B"
backendFramework: "vllm"
dynamoVersion: "1.0.0"
tensorParallelSize: 1
pipelineParallelSize: 1
dtype: "bfloat16"
maxModelLen: 8192
extraParameters:
enableChunkedPrefill: "true"
quantization: "awq"
```
Fields that do **not** change the checkpoint hash include:
## DynamoCheckpoint CRD
- replica count
- node placement (`nodeSelector`, `affinity`, `tolerations`)
- resource requests/limits
- logging or observability configuration
The `DynamoCheckpoint` (shortname: `dckpt`) is a Kubernetes Custom Resource that manages checkpoint lifecycle.
## `DynamoCheckpoint` CRD
**When to create a DynamoCheckpoint directly:**
- **Pre-warming:** Create checkpoints before deploying DGDs for instant startup
- **Explicit control:** Manage checkpoint lifecycle independently from DGDs
The `DynamoCheckpoint` (shortname: `dckpt`) is the operator-managed resource for checkpoint lifecycle.
The operator requires `spec.identity` and `spec.job.podTemplateSpec`. The pod template should match the worker container you want checkpointed, including image, command, args, secrets, volumes, and resource limits. You do not need to set checkpoint-control plumbing manually; the operator injects the checkpoint-ready signal path for checkpoint Jobs and adds the restore metadata consumed by restored pods and the node-local controller inside the `snapshot-agent` DaemonSet.
`spec.job.backoffLimit` is deprecated and ignored. Checkpoint Jobs are always single-attempt.
Use it when you want:
**Create a checkpoint:**
- pre-warmed checkpoints before any `DynamoGraphDeployment` exists
- explicit lifecycle control independent from a DGD
- a stable human-readable name that services can reference with `checkpointRef`
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoCheckpoint
metadata:
name: qwen3-06b-bf16
spec:
identity:
model: Qwen/Qwen3-0.6B
backendFramework: vllm
tensorParallelSize: 1
dtype: bfloat16
maxModelLen: 4096
The operator requires:
job:
activeDeadlineSeconds: 3600
ttlSecondsAfterFinished: 300
podTemplateSpec:
spec:
restartPolicy: Never
containers:
- name: main
image: registry.example.com/dynamo/vllm-placeholder:1.0.0
command:
- python3
- -m
- dynamo.vllm
args:
- --model
- Qwen/Qwen3-0.6B
env:
- name: NCCL_DEBUG
value: ERROR
- name: TORCH_CPP_LOG_LEVEL
value: ERROR
- name: TORCH_DISTRIBUTED_DEBUG
value: "OFF"
resources:
limits:
nvidia.com/gpu: "1"
```
- `spec.identity`
- `spec.job.podTemplateSpec`
For this example identity, the operator computes a deterministic identity hash and stores it in `status.identityHash`. Auto mode uses that hash, not the CR name, when it decides whether to reuse or create a checkpoint.
`spec.job.backoffLimit` is deprecated and ignored. Checkpoint Jobs are always single-attempt.
**Check status:**
Check status with:
```bash
# List all checkpoints
kubectl get dynamocheckpoint -n ${NAMESPACE}
# Or use shortname
kubectl get dckpt -n ${NAMESPACE}
NAME MODEL BACKEND PHASE HASH AGE
qwen3-06b-bf16 Qwen/Qwen3-0.6B vllm Ready 3bff874d069f0ed5 5m
llama3-8b-bf16 meta-llama/Meta-Llama-3-8B-Instruct vllm Creating 9be4f5574b5a285d 2m
kubectl describe dckpt qwen3-06b-bf16 -n ${NAMESPACE}
kubectl get dckpt qwen3-06b-bf16 -n ${NAMESPACE} -o yaml
```
**Phases:**
The `status` block looks like:
```yaml
status:
phase: Ready
identityHash: 3bff874d069f0ed5
jobName: checkpoint-job-3bff874d069f0ed5-1
createdAt: "2026-01-29T10:05:00Z"
message: ""
```
| Phase | Description |
|-------|-------------|
| `Pending` | CR created, waiting for job to start |
| `Creating` | Checkpoint job is running |
| `Ready` | Checkpoint available for use |
| `Failed` | Checkpoint creation failed |
## Limitations
Other useful status fields are:
- **LLM workers only**: checkpoint/restore supports LLM decode and prefill workers. Specialized workers such as multimodal, embedding, and diffusion are not supported.
- **Multi-GPU remains preview**: tensor-parallel configurations are exercised in internal testing, but they are not yet a broadly supported production path across clusters.
- **Network state is sensitive**: restore is sensitive to live TCP socket state. Loopback bootstrap/control sockets are the most reliable path today.
- **Privileged DaemonSet required**: `snapshot-agent` must run privileged to execute CRIU and `cuda-checkpoint`. Workload pods do not need to be privileged.
| Field | Meaning |
|-------|---------|
| `status.identityHash` | Deterministic hash of `spec.identity` used for auto lookup and reuse |
| `status.jobName` | Name of the checkpoint Job |
| `status.location` | Checkpoint location in the configured storage backend |
| `status.storageType` | Storage backend type (`pvc`, `s3`, or `oci`) |
| `status.createdAt` | Timestamp recorded when the checkpoint becomes ready |
| `status.message` | Failure or progress message when available |
## Troubleshooting
`status.conditions` is deprecated for `DynamoCheckpoint`. The legacy condition types `JobCreated` and `JobCompleted` are kept for compatibility only. Prefer `status.phase`, `status.jobName`, and `status.message` when checking checkpoint progress.
### Checkpoint Job finishes but the checkpoint never becomes `Ready`
**Detailed status:**
A checkpoint only becomes `Ready` after `snapshot-agent` confirms the checkpoint contents. A completed Job is not enough by itself.
```bash
kubectl describe dckpt qwen3-06b-bf16 -n ${NAMESPACE}
```
kubectl get dckpt <checkpoint-name> -n ${NAMESPACE} \
-o custom-columns=NAME:.metadata.name,PHASE:.status.phase,MESSAGE:.status.message,JOB:.status.jobName
```yaml
Status:
Phase: Ready
IdentityHash: 3bff874d069f0ed5
JobName: checkpoint-job-3bff874d069f0ed5
Location: /checkpoints/3bff874d069f0ed5
StorageType: pvc
CreatedAt: 2026-01-29T10:05:00Z
JOB_NAME=$(kubectl get dckpt <checkpoint-name> -n ${NAMESPACE} -o jsonpath='{.status.jobName}')
if [ -n "${JOB_NAME}" ]; then
kubectl logs job/"${JOB_NAME}" -n ${NAMESPACE}
fi
kubectl logs daemonset/snapshot-agent -n ${NAMESPACE} --all-containers
```
**Reference from DGD:**
If the worker template is wrong, the most common causes are using the raw runtime image instead of the placeholder image, or leaving out normal mounts and secrets that the worker needs to start.
Once the checkpoint is `Ready`, you can reference it by CR name:
### Restore cannot find or mount checkpoint storage
```yaml
spec:
services:
VllmDecodeWorker:
checkpoint:
enabled: true
checkpointRef: "qwen3-06b-bf16"
```
Restore discovers checkpoint storage from the `snapshot-agent` DaemonSet in the same namespace. That DaemonSet must be ready and must mount the checkpoint PVC.
Or use `mode: Auto` with the same identity, and the operator will reuse the same deterministic checkpoint object automatically.
```bash
kubectl rollout status daemonset/snapshot-agent -n ${NAMESPACE}
kubectl get daemonset -n ${NAMESPACE} -l app.kubernetes.io/component=snapshot-agent -o wide
kubectl get pvc -n ${NAMESPACE}
```
## Limitations
This is also the path that `snapshotctl` uses when it resolves checkpoint storage.
- **LLM workers only**: Checkpoint/restore supports LLM decode and prefill workers. Specialized workers (multimodal, embedding, diffusion) are not supported.
- **Single-GPU only**: Multi-GPU setups may work on very simple hardware configurations, but they are not officially supported yet.
- **Network state**: Restore is sensitive to live TCP socket state. Loopback bootstrap/control sockets can work with the supported CRIU TCP policies, but non-loopback or pod-IP-bound connections can still break restore.
- **Security**: Dynamo Snapshot runs as a **privileged DaemonSet**, which is required to run CRIU and `cuda-checkpoint`. Workload pods, however, do not need to be privileged.
### `snapshotctl` manifest is rejected or the restore target is wrong
## Troubleshooting
`snapshotctl` only accepts a single-container `Pod` manifest.
### Checkpoint Not Ready
1. Check the checkpoint job:
```bash
kubectl get dckpt -n ${NAMESPACE}
kubectl describe dckpt <checkpoint-name> -n ${NAMESPACE}
JOB_NAME=$(kubectl get dckpt <checkpoint-name> -n ${NAMESPACE} -o jsonpath='{.status.jobName}')
if [ -n "${JOB_NAME}" ]; then
kubectl logs job/"${JOB_NAME}" -n ${NAMESPACE}
fi
```
2. Check the DaemonSet:
```bash
kubectl logs daemonset/snapshot-agent -n ${NAMESPACE} --all-containers
```
3. Verify that platform and chart storage settings match:
```bash
kubectl get dckpt <checkpoint-name> -n ${NAMESPACE} -o yaml
```
### Restore Failing
1. Check pod logs:
```bash
kubectl logs <worker-pod> -n ${NAMESPACE}
```
2. Describe the restore target pod:
```bash
kubectl describe pod <worker-pod> -n ${NAMESPACE}
```
3. Confirm the referenced checkpoint is still `Ready`:
```bash
kubectl get dckpt <checkpoint-name> -n ${NAMESPACE}
```
```bash
snapshotctl checkpoint --manifest ./worker-pod.yaml --namespace ${NAMESPACE}
snapshotctl restore --manifest ./worker-pod.yaml --namespace ${NAMESPACE} --checkpoint-id <checkpoint-id>
```
## Planned Features
- TensorRT-LLM backend support
- S3/MinIO storage backend
- OCI registry storage backend
- Multi-GPU checkpoints
- Stabilize multi-GPU support
- TensorRT-LLM support
- Alternative storage backends
## Related Documentation
- [Dynamo Snapshot Helm Chart README](https://github.com/ai-dynamo/dynamo/blob/main/deploy/helm/charts/snapshot/README.md) - Chart configuration
- [Installation Guide](installation-guide.md) - Platform installation
- [API Reference](api-reference.md) - Complete CRD specifications
- [Installation Guide](installation-guide.md)
- [API Reference](api-reference.md)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment