feat: introducing ChReK (Checkpoint Restore in K8s) (#4978)

Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>

feat: introducing ChReK (Checkpoint Restore in K8s) (#4978)
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
f3aa1e01 · Julien Mancuso · GitHub · 44986bf5 · f3aa1e01 · f3aa1e01
Unverified Commit f3aa1e01 authored Feb 03, 2026 by Julien Mancuso Committed by GitHub Feb 03, 2026
20 changed files
--- a/deploy/operator/config/samples/nvidia.com_v1alpha1_dynamocheckpoint.yaml
+++ b/deploy/operator/config/samples/nvidia.com_v1alpha1_dynamocheckpoint.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoCheckpoint
+metadata:
+  name: vllm-llama3-8b-tp1
+spec:
+  # Identity - these fields are hashed to derive the checkpoint's identity hash
+  identity:
+    model: "meta-llama/Meta-Llama-3-8B-Instruct"
+    backendFramework: "vllm"
+    dynamoVersion: "0.6.0"
+    tensorParallelSize: 1
+    pipelineParallelSize: 1
+    dtype: "bfloat16"
+    maxModelLen: 8192
+    extraParameters:
+      enableChunkedPrefill: "true"
+
+  # Job configuration for checkpoint creation
+  job:
+    activeDeadlineSeconds: 3600
+    backoffLimit: 3
+    ttlSecondsAfterFinished: 300
+    podTemplateSpec:
+      spec:
+        containers:
+          - name: checkpoint-worker
+            image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
+            command: ["python", "-m", "vllm.entrypoints.openai.api_server"]
+            args:
+              - "--model"
+              - "meta-llama/Meta-Llama-3-8B-Instruct"
+              - "--tensor-parallel-size"
+              - "1"
+              - "--dtype"
+              - "bfloat16"
+              - "--max-model-len"
+              - "8192"
+            env:
+              - name: HF_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-secret
+                    key: token
+            resources:
+              limits:
+                nvidia.com/gpu: 1
+        restartPolicy: Never
+
--- a/deploy/operator/internal/checkpoint/dgd_integration.go
+++ b/deploy/operator/internal/checkpoint/dgd_integration.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package checkpoint
+
+import (
+	"context"
+	"fmt"
+	"path/filepath"
+
+	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
+	controller_common "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/utils/ptr"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// getCheckpointInfoFromCheckpoint builds a CheckpointInfo from a DynamoCheckpoint CR.
+// Hash, Location and StorageType are read from the CR status, Ready is true only when
+// the status phase is DynamoCheckpointPhaseReady, and Identity points at the CR's own
+// Spec.Identity (it is not copied). The returned info is always marked Enabled.
+func getCheckpointInfoFromCheckpoint(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) *CheckpointInfo {
+	isReady := ckpt.Status.Phase == nvidiacomv1alpha1.DynamoCheckpointPhaseReady
+
+	return &CheckpointInfo{
+		Enabled:        true,
+		Identity:       &ckpt.Spec.Identity,
+		Hash:           ckpt.Status.IdentityHash,
+		Location:       ckpt.Status.Location,
+		StorageType:    ckpt.Status.StorageType,
+		CheckpointName: ckpt.Name,
+		Ready:          isReady,
+	}
+}
+
+// DefaultCheckpointPVCName is the PVC claim name used for checkpoint storage when the storage config does not set PVC.PVCName.
+const DefaultCheckpointPVCName = "checkpoint-storage"
+
+// getPVCBasePath resolves the base path used for PVC-backed checkpoints.
+// A non-empty PVC.BasePath in storageConfig wins; a nil config or an empty
+// BasePath falls back to consts.CheckpointBasePath.
+func getPVCBasePath(storageConfig *controller_common.CheckpointStorageConfig) string {
+	if storageConfig == nil {
+		return consts.CheckpointBasePath
+	}
+	if configured := storageConfig.PVC.BasePath; configured != "" {
+		return configured
+	}
+	return consts.CheckpointBasePath
+}
+
+// GetPVCBasePath returns the PVC base path to use for checkpoints given the
+// controller-level checkpoint config. The configured path is honoured only when
+// config is non-nil and config.Enabled is true; in every other case the default
+// consts.CheckpointBasePath is returned. Only meaningful for PVC storage.
+func GetPVCBasePath(config *controller_common.CheckpointConfig) string {
+	if config == nil || !config.Enabled {
+		return consts.CheckpointBasePath
+	}
+	return getPVCBasePath(&config.Storage)
+}
+
+// storageTypeToAPI converts a controller_common storage type string into the API enum type.
+// This is a plain type conversion with no validation; it relies on the controller
+// constants and the API enum sharing the same string values.
+func storageTypeToAPI(storageType string) nvidiacomv1alpha1.DynamoCheckpointStorageType {
+	apiType := nvidiacomv1alpha1.DynamoCheckpointStorageType(storageType)
+	return apiType
+}
+
+// CheckpointInfo contains resolved checkpoint information for a DGD service; InjectCheckpointIntoPodSpec may overwrite Hash, StorageType and Location in place
+type CheckpointInfo struct {
+	// Enabled indicates if checkpointing is enabled; when false, InjectCheckpointIntoPodSpec and InjectCheckpointEnvVars do nothing
+	Enabled bool
+	// Identity is the checkpoint identity (model, framework, etc.), taken from the service config or from the referenced CR's spec
+	Identity *nvidiacomv1alpha1.DynamoCheckpointIdentity
+	// Hash is the identity hash, either computed with ComputeIdentityHash or read from the CR status
+	Hash string
+	// Location is the full URI/path in the storage backend; empty when no checkpoint CR was resolved
+	Location string
+	// StorageType is the storage backend type (pvc, s3, oci)
+	StorageType nvidiacomv1alpha1.DynamoCheckpointStorageType
+	// CheckpointName is the name of the DynamoCheckpoint CR; empty when no CR was resolved
+	CheckpointName string
+	// Ready is true when the resolved CR's status phase is DynamoCheckpointPhaseReady
+	Ready bool
+}
+
+// ResolveCheckpointForService resolves checkpoint information for a DGD service.
+// It handles both checkpointRef (direct reference) and identity-based lookup.
+// Returns CheckpointInfo with the resolved identity populated.
+func ResolveCheckpointForService(
+	ctx context.Context,
+	c client.Client,
+	namespace string,
+	config *nvidiacomv1alpha1.ServiceCheckpointConfig,
+) (*CheckpointInfo, error) {
+	if config == nil || !config.Enabled {
+		return &CheckpointInfo{Enabled: false}, nil
+	}
+
+	// If a direct checkpoint reference is provided, use it
+	if config.CheckpointRef != nil && *config.CheckpointRef != "" {
+		ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{}
+		err := c.Get(ctx, types.NamespacedName{
+			Namespace: namespace,
+			Name:      *config.CheckpointRef,
+		}, ckpt)
+		if err != nil {
+			return nil, fmt.Errorf("failed to get referenced checkpoint %s: %w", *config.CheckpointRef, err)
+		}
+
+		// Extract all checkpoint info including identity from the CR
+		return getCheckpointInfoFromCheckpoint(ckpt), nil
+	}
+
+	// Otherwise, compute hash from identity and look up checkpoint
+	if config.Identity == nil {
+		return nil, fmt.Errorf("checkpoint enabled but no checkpointRef or identity provided")
+	}
+
+	hash, err := ComputeIdentityHash(*config.Identity)
+	if err != nil {
+		return nil, fmt.Errorf("failed to compute identity hash: %w", err)
+	}
+
+	info := &CheckpointInfo{
+		Enabled:  true,
+		Identity: config.Identity,
+		Hash:     hash,
+	}
+
+	// Look for existing checkpoint with matching hash using label selector
+	checkpointList := &nvidiacomv1alpha1.DynamoCheckpointList{}
+	if err = c.List(ctx, checkpointList,
+		client.InNamespace(namespace),
+		client.MatchingLabels{consts.KubeLabelCheckpointHash: info.Hash},
+	); err != nil {
+		return nil, fmt.Errorf("failed to list checkpoints: %w", err)
+	}
+
+	// Return the first matching checkpoint (there should be at most one per hash)
+	if len(checkpointList.Items) > 0 {
+		ckpt := &checkpointList.Items[0]
+		// Merge checkpoint info from the CR (overrides the computed values)
+		foundInfo := getCheckpointInfoFromCheckpoint(ckpt)
+		// Keep the hash and identity we computed from the config
+		foundInfo.Hash = info.Hash
+		foundInfo.Identity = info.Identity
+		return foundInfo, nil
+	}
+
+	// No existing checkpoint found
+	// In Auto mode, the controller should create one
+	return info, nil
+}
+
+// InjectCheckpointEnvVars prepends checkpoint-related environment variables to a container.
+//
+// Variables injected (names come from the consts package):
+//   - EnvCheckpointStorageType: info.StorageType, or the PVC type when it is empty.
+//   - EnvCheckpointLocation: info.Location, only when non-empty.
+//   - EnvCheckpointPath: parent directory of info.Location, only for PVC storage
+//     with a non-empty location.
+//   - EnvCheckpointHash: info.Hash, only when non-empty.
+//   - EnvRestoreMarkerFile, EnvCRIUWorkDir, EnvCRIULogDir, EnvCUDAPluginDir: fixed
+//     paths from consts.
+//   - EnvCRIUTimeout: config.CRIUTimeout when set, otherwise consts.DefaultCRIUTimeout.
+//
+// The call is a no-op when container or info is nil, or when info.Enabled is false.
+// config may be nil. No de-duplication is performed: calling this twice on the same
+// container adds the variables twice.
+func InjectCheckpointEnvVars(container *corev1.Container, info *CheckpointInfo, config *controller_common.CheckpointConfig) {
+	// This function is exported and has no error return, so nil inputs are treated
+	// as "nothing to inject" instead of panicking on dereference.
+	if container == nil || info == nil || !info.Enabled {
+		return
+	}
+
+	// Determine storage type (default to PVC if not set)
+	storageType := info.StorageType
+	if storageType == "" {
+		storageType = nvidiacomv1alpha1.DynamoCheckpointStorageType(controller_common.CheckpointStorageTypePVC)
+	}
+
+	// At most 9 variables are added below; reserve room for the existing ones too
+	// so the final prepend does not reallocate.
+	envVars := make([]corev1.EnvVar, 0, 9+len(container.Env))
+	envVars = append(envVars, corev1.EnvVar{
+		Name:  consts.EnvCheckpointStorageType,
+		Value: string(storageType),
+	})
+
+	// Location is the source (where to fetch from)
+	if info.Location != "" {
+		envVars = append(envVars, corev1.EnvVar{
+			Name:  consts.EnvCheckpointLocation,
+			Value: info.Location,
+		})
+	}
+
+	// For PVC storage, also inject the base directory that contains the checkpoint.
+	// This is used by k8s-runc-bypass restore entrypoint
+	if string(storageType) == controller_common.CheckpointStorageTypePVC && info.Location != "" {
+		envVars = append(envVars, corev1.EnvVar{
+			Name:  consts.EnvCheckpointPath,
+			Value: filepath.Dir(info.Location),
+		})
+	}
+
+	// Include hash for debugging/observability and for k8s-runc-bypass
+	if info.Hash != "" {
+		envVars = append(envVars, corev1.EnvVar{
+			Name:  consts.EnvCheckpointHash,
+			Value: info.Hash,
+		})
+	}
+
+	// Add CRIU-related env vars for restore operations
+	criuTimeout := consts.DefaultCRIUTimeout
+	if config != nil && config.CRIUTimeout != "" {
+		criuTimeout = config.CRIUTimeout
+	}
+
+	envVars = append(envVars,
+		corev1.EnvVar{
+			Name:  consts.EnvRestoreMarkerFile,
+			Value: consts.RestoreMarkerFilePath,
+		},
+		corev1.EnvVar{
+			Name:  consts.EnvCRIUWorkDir,
+			Value: consts.CRIUWorkDirPath,
+		},
+		corev1.EnvVar{
+			Name:  consts.EnvCRIULogDir,
+			Value: consts.CRIULogDirPath,
+		},
+		corev1.EnvVar{
+			Name:  consts.EnvCUDAPluginDir,
+			Value: consts.CUDAPluginDirPath,
+		},
+		corev1.EnvVar{
+			Name:  consts.EnvCRIUTimeout,
+			Value: criuTimeout,
+		},
+	)
+
+	// Checkpoint variables go first, followed by whatever the container already had.
+	container.Env = append(envVars, container.Env...)
+}
+
+// InjectCheckpointVolume appends a PVC-backed volume named consts.CheckpointVolumeName
+// to the pod spec, bound to the claim pvcName. If a volume with that name is already
+// present the pod spec is left untouched.
+func InjectCheckpointVolume(podSpec *corev1.PodSpec, pvcName string) {
+	for i := range podSpec.Volumes {
+		if podSpec.Volumes[i].Name == consts.CheckpointVolumeName {
+			return
+		}
+	}
+
+	claim := &corev1.PersistentVolumeClaimVolumeSource{
+		ClaimName: pvcName,
+		ReadOnly:  false, // CRIU needs write access during restore
+	}
+	checkpointVolume := corev1.Volume{
+		Name:         consts.CheckpointVolumeName,
+		VolumeSource: corev1.VolumeSource{PersistentVolumeClaim: claim},
+	}
+	podSpec.Volumes = append(podSpec.Volumes, checkpointVolume)
+}
+
+// InjectCheckpointVolumeMount mounts the checkpoint volume into the container at
+// basePath (consts.CheckpointBasePath when basePath is empty). A container that
+// already has a mount named consts.CheckpointVolumeName is left unchanged.
+func InjectCheckpointVolumeMount(container *corev1.Container, basePath string) {
+	for i := range container.VolumeMounts {
+		if container.VolumeMounts[i].Name == consts.CheckpointVolumeName {
+			return
+		}
+	}
+
+	mountPath := basePath
+	if mountPath == "" {
+		mountPath = consts.CheckpointBasePath
+	}
+
+	checkpointMount := corev1.VolumeMount{
+		Name:      consts.CheckpointVolumeName,
+		MountPath: mountPath,
+		ReadOnly:  false, // CRIU needs write access for restore.log and restore-criu.conf
+	}
+	container.VolumeMounts = append(container.VolumeMounts, checkpointMount)
+}
+
+// InjectCheckpointSignalVolume appends the hostPath volume used for checkpoint
+// signalling to the pod spec. The host path comes from
+// checkpointConfig.Storage.SignalHostPath when set, otherwise from
+// consts.CheckpointSignalHostPath, and is declared as DirectoryOrCreate.
+// Restore pods carry this volume so their mounts match those of the checkpoint job.
+// Nothing is added when a volume named consts.CheckpointSignalVolumeName exists.
+func InjectCheckpointSignalVolume(podSpec *corev1.PodSpec, checkpointConfig *controller_common.CheckpointConfig) {
+	for i := range podSpec.Volumes {
+		if podSpec.Volumes[i].Name == consts.CheckpointSignalVolumeName {
+			return
+		}
+	}
+
+	// Resolve the host directory: explicit configuration wins over the default.
+	hostDir := consts.CheckpointSignalHostPath
+	if checkpointConfig != nil {
+		if configured := checkpointConfig.Storage.SignalHostPath; configured != "" {
+			hostDir = configured
+		}
+	}
+
+	dirOrCreate := corev1.HostPathDirectoryOrCreate
+	signalVolume := corev1.Volume{
+		Name: consts.CheckpointSignalVolumeName,
+		VolumeSource: corev1.VolumeSource{
+			HostPath: &corev1.HostPathVolumeSource{
+				Path: hostDir,
+				Type: &dirOrCreate,
+			},
+		},
+	}
+	podSpec.Volumes = append(podSpec.Volumes, signalVolume)
+}
+
+// InjectCheckpointSignalVolumeMount mounts the checkpoint signal volume into the
+// container at consts.CheckpointSignalMountPath (read-write). The mount keeps the
+// restore pod's mounts aligned with the checkpoint pod's, which CRIU requires.
+// A container that already mounts consts.CheckpointSignalVolumeName is left unchanged.
+func InjectCheckpointSignalVolumeMount(container *corev1.Container) {
+	for i := range container.VolumeMounts {
+		if container.VolumeMounts[i].Name == consts.CheckpointSignalVolumeName {
+			return
+		}
+	}
+
+	signalMount := corev1.VolumeMount{
+		Name:      consts.CheckpointSignalVolumeName,
+		MountPath: consts.CheckpointSignalMountPath,
+		ReadOnly:  false,
+	}
+	container.VolumeMounts = append(container.VolumeMounts, signalMount)
+}
+
+// InjectPodInfoVolume appends a Downward API volume named consts.PodInfoVolumeName
+// that exposes the pod's identity and its DGD annotations as files.
+//
+// After a CRIU restore the process environment still holds the values captured
+// from the checkpoint source pod, so environment variables cannot be trusted for
+// pod identity. Files served by the Downward API reflect the pod they are mounted in.
+//
+// Files exposed: pod_name, pod_uid, pod_namespace (pod fields) plus one file per
+// DGD annotation (namespace, component, parent DGD name/namespace, discovery backend).
+// Nothing is added when a volume with the same name already exists.
+func InjectPodInfoVolume(podSpec *corev1.PodSpec) {
+	for i := range podSpec.Volumes {
+		if podSpec.Volumes[i].Name == consts.PodInfoVolumeName {
+			return
+		}
+	}
+
+	// fieldFile maps a pod field path to a file inside the volume.
+	fieldFile := func(path, fieldPath string) corev1.DownwardAPIVolumeFile {
+		return corev1.DownwardAPIVolumeFile{
+			Path: path,
+			FieldRef: &corev1.ObjectFieldSelector{
+				FieldPath: fieldPath,
+			},
+		}
+	}
+	// annotationFile maps a pod annotation to a file inside the volume.
+	annotationFile := func(path, annotation string) corev1.DownwardAPIVolumeFile {
+		return fieldFile(path, "metadata.annotations['"+annotation+"']")
+	}
+
+	files := []corev1.DownwardAPIVolumeFile{
+		// Pod identity fields
+		fieldFile("pod_name", consts.PodInfoFieldPodName),
+		fieldFile("pod_uid", consts.PodInfoFieldPodUID),
+		fieldFile("pod_namespace", consts.PodInfoFieldPodNamespace),
+		// DGD info from annotations (for CRIU restore)
+		annotationFile(consts.PodInfoFileDynNamespace, consts.AnnotationDynNamespace),
+		annotationFile(consts.PodInfoFileDynComponent, consts.AnnotationDynComponent),
+		annotationFile(consts.PodInfoFileDynParentDGDName, consts.AnnotationDynParentDGDName),
+		annotationFile(consts.PodInfoFileDynParentDGDNS, consts.AnnotationDynParentDGDNS),
+		annotationFile(consts.PodInfoFileDynDiscoveryBackend, consts.AnnotationDynDiscoveryBackend),
+	}
+
+	podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{
+		Name: consts.PodInfoVolumeName,
+		VolumeSource: corev1.VolumeSource{
+			DownwardAPI: &corev1.DownwardAPIVolumeSource{Items: files},
+		},
+	})
+}
+
+// InjectPodInfoVolumeMount mounts the pod-info Downward API volume read-only into
+// the container at consts.PodInfoMountPath. A container that already mounts
+// consts.PodInfoVolumeName is left unchanged.
+func InjectPodInfoVolumeMount(container *corev1.Container) {
+	for i := range container.VolumeMounts {
+		if container.VolumeMounts[i].Name == consts.PodInfoVolumeName {
+			return
+		}
+	}
+
+	podInfoMount := corev1.VolumeMount{
+		Name:      consts.PodInfoVolumeName,
+		MountPath: consts.PodInfoMountPath,
+		ReadOnly:  true,
+	}
+	container.VolumeMounts = append(container.VolumeMounts, podInfoMount)
+}
+
+// InjectCheckpointIntoPodSpec injects checkpoint configuration into a pod spec.
+// This is the single entry point for ALL checkpoint-related pod modifications:
+// 1. Command/Args transformation - moves Command to Args to respect image ENTRYPOINT
+// 2. Security context - applies hostIPC, a localhost seccomp profile and privileged mode
+// 3. Storage configuration - computes the checkpoint location and, for PVC storage,
+//    adds the checkpoint volume and mount
+// 4. Signal and pod-info volumes - added for every storage type
+// 5. Environment variables - injects storage type, location, path, hash and CRIU settings
+//
+// The target container is the one named consts.MainContainerName, or the first
+// container when no container has that name.
+//
+// checkpointInfo is modified in place: Hash is filled in when empty, and StorageType
+// and Location are always overwritten from checkpointConfig (or its defaults), even
+// if they were previously populated from a DynamoCheckpoint status.
+//
+// Returns nil without touching podSpec when checkpointInfo is nil or disabled.
+// Returns an error when podSpec is nil, when the hash is missing and cannot be
+// computed (nil identity or hashing failure), or when the pod has no containers.
+// checkpointConfig may be nil, in which case PVC storage with default names and
+// paths is used.
+func InjectCheckpointIntoPodSpec(
+	podSpec *corev1.PodSpec,
+	checkpointInfo *CheckpointInfo,
+	checkpointConfig *controller_common.CheckpointConfig,
+) error {
+	if checkpointInfo == nil || !checkpointInfo.Enabled {
+		return nil
+	}
+	// A nil pod spec would otherwise panic on the first dereference below.
+	if podSpec == nil {
+		return fmt.Errorf("no pod spec provided to inject checkpoint config")
+	}
+
+	// Use the checkpoint info as-is (already computed by ResolveCheckpointForService)
+	// We only need to compute hash if it's not already set
+	info := checkpointInfo
+	if info.Hash == "" {
+		// Identity is required to compute the hash
+		if info.Identity == nil {
+			return fmt.Errorf("checkpoint enabled but identity is nil and hash is not set")
+		}
+		hash, err := ComputeIdentityHash(*info.Identity)
+		if err != nil {
+			return fmt.Errorf("failed to compute identity hash: %w", err)
+		}
+		info.Hash = hash
+	}
+
+	// Find the main container first (needed for all modifications)
+	var mainContainer *corev1.Container
+	for i := range podSpec.Containers {
+		if podSpec.Containers[i].Name == consts.MainContainerName {
+			mainContainer = &podSpec.Containers[i]
+			break
+		}
+	}
+	// If no main container found by name, use the first container
+	if mainContainer == nil && len(podSpec.Containers) > 0 {
+		mainContainer = &podSpec.Containers[0]
+	}
+	if mainContainer == nil {
+		return fmt.Errorf("no container found to inject checkpoint config")
+	}
+
+	// 1. Handle command/args for checkpoint-enabled images
+	// When checkpoint is enabled, the image has a smart ENTRYPOINT (e.g., /smart-entrypoint.sh)
+	// that detects checkpoints and decides between restore and cold start.
+	// We need to pass the user's command as arguments to this ENTRYPOINT rather than
+	// overriding it with Command.
+	if len(mainContainer.Command) > 0 {
+		// Build Command + Args in a fresh slice. Appending directly onto Command could
+		// write into spare capacity of its backing array, which may be shared with
+		// another slice.
+		combinedArgs := make([]string, 0, len(mainContainer.Command)+len(mainContainer.Args))
+		combinedArgs = append(combinedArgs, mainContainer.Command...)
+		combinedArgs = append(combinedArgs, mainContainer.Args...)
+		mainContainer.Args = combinedArgs
+		mainContainer.Command = nil // Clear Command to use image's ENTRYPOINT
+	}
+	// If Command is empty but Args exists, keep Args as-is (they'll be passed to ENTRYPOINT)
+
+	// 2. Apply pod-level security context for CRIU restore
+	// hostIPC: Required for CRIU to access shared memory segments and IPC resources
+	podSpec.HostIPC = true
+
+	// Apply seccomp profile to match checkpoint environment
+	// This blocks io_uring syscalls required for CRIU compatibility
+	if podSpec.SecurityContext == nil {
+		podSpec.SecurityContext = &corev1.PodSecurityContext{}
+	}
+	podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{
+		Type:             corev1.SeccompProfileTypeLocalhost,
+		LocalhostProfile: ptr.To("profiles/block-iouring.json"),
+	}
+
+	// Apply container-level security context for CRIU restore
+	// Privileged mode is required for CRIU restore operations
+	if mainContainer.SecurityContext == nil {
+		mainContainer.SecurityContext = &corev1.SecurityContext{}
+	}
+	mainContainer.SecurityContext.Privileged = ptr.To(true)
+
+	// Determine storage type and compute location/path
+	storageType := controller_common.CheckpointStorageTypePVC // default
+	var storageConfig *controller_common.CheckpointStorageConfig
+	if checkpointConfig != nil {
+		storageConfig = &checkpointConfig.Storage
+		if storageConfig.Type != "" {
+			storageType = storageConfig.Type
+		}
+	}
+
+	switch storageType {
+	case controller_common.CheckpointStorageTypeS3:
+		// S3 storage: location is s3:// URI
+		// URI format: s3://[endpoint/]bucket/prefix
+		info.StorageType = storageTypeToAPI(storageType)
+		s3URI := "s3://checkpoint-storage/checkpoints" // default
+		if storageConfig != nil && storageConfig.S3.URI != "" {
+			s3URI = storageConfig.S3.URI
+		}
+		// Append hash to the URI
+		info.Location = fmt.Sprintf("%s/%s.tar", s3URI, info.Hash)
+
+	case controller_common.CheckpointStorageTypeOCI:
+		// OCI storage: location is oci:// URI
+		// URI format: oci://registry/repository
+		info.StorageType = storageTypeToAPI(storageType)
+		ociURI := "oci://localhost/checkpoints" // default
+		if storageConfig != nil && storageConfig.OCI.URI != "" {
+			ociURI = storageConfig.OCI.URI
+		}
+		// Append hash as tag
+		info.Location = fmt.Sprintf("%s:%s", ociURI, info.Hash)
+
+	default: // controller_common.CheckpointStorageTypePVC
+		// PVC storage: location is the checkpoint directory
+		// k8s-runc-bypass expects: /checkpoints/{hash}/ (directory with checkpoint data)
+		// NOTE(review): an unrecognized storage type also lands here and gets PVC
+		// volumes, while info.StorageType keeps the unrecognized value - confirm
+		// whether that should be rejected instead.
+		info.StorageType = storageTypeToAPI(storageType)
+		basePath := getPVCBasePath(storageConfig)
+		pvcName := DefaultCheckpointPVCName
+		if storageConfig != nil && storageConfig.PVC.PVCName != "" {
+			pvcName = storageConfig.PVC.PVCName
+		}
+		info.Location = fmt.Sprintf("%s/%s", basePath, info.Hash)
+
+		// Inject PVC volume and mount (only for PVC storage)
+		InjectCheckpointVolume(podSpec, pvcName)
+		InjectCheckpointVolumeMount(mainContainer, basePath)
+	}
+
+	// Inject signal volume for CRIU mount namespace consistency
+	// Even though restore pods don't use the signal file, they need it mounted
+	// to match the checkpoint job's mount namespace for CRIU compatibility
+	InjectCheckpointSignalVolume(podSpec, checkpointConfig)
+	InjectCheckpointSignalVolumeMount(mainContainer)
+
+	// Inject Downward API volume for pod identity after CRIU restore
+	// CRIU preserves environment variables from checkpoint time, so pod identity
+	// env vars (POD_NAME, POD_UID, POD_NAMESPACE) contain stale values.
+	// The Dynamo runtime reads from /etc/podinfo/ files first to get correct identity.
+	InjectPodInfoVolume(podSpec)
+	InjectPodInfoVolumeMount(mainContainer)
+
+	// Inject checkpoint environment variables (for all storage types)
+	InjectCheckpointEnvVars(mainContainer, info, checkpointConfig)
+
+	return nil
+}
+
+// InjectCheckpointLabelsFromConfig adds the checkpoint hash label to labels when
+// checkpointing is enabled and an identity is configured.
+//
+// With a nil or disabled config the input map is returned untouched (possibly nil).
+// With an enabled config a nil input map is replaced by an empty one, and when
+// config.Identity is set its hash is stored under consts.KubeLabelCheckpointHash.
+// The input map is modified in place. On hashing failure nil and an error are returned.
+func InjectCheckpointLabelsFromConfig(labels map[string]string, config *nvidiacomv1alpha1.ServiceCheckpointConfig) (map[string]string, error) {
+	enabled := config != nil && config.Enabled
+	if !enabled {
+		return labels, nil
+	}
+
+	result := labels
+	if result == nil {
+		result = map[string]string{}
+	}
+
+	// Without an identity there is nothing to hash; a checkpointRef alone adds no label here.
+	if config.Identity == nil {
+		return result, nil
+	}
+
+	identityHash, err := ComputeIdentityHash(*config.Identity)
+	if err != nil {
+		return nil, fmt.Errorf("failed to compute identity hash for labels: %w", err)
+	}
+	result[consts.KubeLabelCheckpointHash] = identityHash
+
+	return result, nil
+}
--- a/deploy/operator/internal/checkpoint/hash.go
+++ b/deploy/operator/internal/checkpoint/hash.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package checkpoint
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+	"encoding/json"
+	"fmt"
+
+	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+)
+
+// normalizedIdentity is the canonical form used for hash computation; its JSON encoding (field order, tags, omitempty) is the hash input, so changing any of them changes every hash
+// Only fields that affect checkpoint equivalence are included
+type normalizedIdentity struct {
+	Model                string            `json:"model"`
+	BackendFramework     string            `json:"backendFramework"`
+	DynamoVersion        string            `json:"dynamoVersion,omitempty"`
+	TensorParallelSize   int32             `json:"tensorParallelSize"`
+	PipelineParallelSize int32             `json:"pipelineParallelSize"`
+	Dtype                string            `json:"dtype,omitempty"`
+	MaxModelLen          int32             `json:"maxModelLen,omitempty"`
+	ExtraParameters      map[string]string `json:"extraParameters,omitempty"`
+}
+
+// ComputeIdentityHash derives a deterministic 16-character hex hash from a
+// DynamoCheckpointIdentity.
+//
+// Steps:
+//  1. The identity is normalized (see normalizeIdentity).
+//  2. The normalized form is serialized to JSON; encoding/json writes map keys in
+//     sorted order, so ExtraParameters ordering cannot influence the result.
+//  3. The JSON bytes are hashed with SHA-256.
+//  4. The first 8 bytes of the digest are hex-encoded, giving 16 characters (64 bits).
+//
+// By the birthday bound, 64 bits give roughly a 1% collision probability at about
+// 600 million distinct identities and 50% at about 5 billion, which is ample for
+// this use while keeping the hash short enough to read.
+//
+// An error is returned only if JSON serialization fails.
+func ComputeIdentityHash(identity nvidiacomv1alpha1.DynamoCheckpointIdentity) (string, error) {
+	payload, err := json.Marshal(normalizeIdentity(identity))
+	if err != nil {
+		// Not expected for this fixed set of field types, but reported rather than ignored.
+		return "", fmt.Errorf("failed to marshal identity for hashing: %w", err)
+	}
+
+	digest := sha256.Sum256(payload)
+
+	// 8 digest bytes encode to exactly 16 hex characters.
+	return hex.EncodeToString(digest[:8]), nil
+}
+
+// normalizeIdentity converts an API identity into the canonical struct that is hashed.
+// TensorParallelSize and PipelineParallelSize of 0 are treated as 1, so leaving them
+// unset hashes the same as setting them to 1. A nil ExtraParameters map is replaced
+// by an empty map. All other fields are copied as-is.
+func normalizeIdentity(identity nvidiacomv1alpha1.DynamoCheckpointIdentity) normalizedIdentity {
+	out := normalizedIdentity{
+		Model:                identity.Model,
+		BackendFramework:     identity.BackendFramework,
+		DynamoVersion:        identity.DynamoVersion,
+		TensorParallelSize:   identity.TensorParallelSize,
+		PipelineParallelSize: identity.PipelineParallelSize,
+		Dtype:                identity.Dtype,
+		MaxModelLen:          identity.MaxModelLen,
+		ExtraParameters:      identity.ExtraParameters,
+	}
+
+	// Unset parallelism means "no parallelism", i.e. a size of 1.
+	if out.TensorParallelSize == 0 {
+		out.TensorParallelSize = 1
+	}
+	if out.PipelineParallelSize == 0 {
+		out.PipelineParallelSize = 1
+	}
+
+	// Keep the map non-nil so nil and empty inputs normalize to the same value.
+	if out.ExtraParameters == nil {
+		out.ExtraParameters = make(map[string]string)
+	}
+
+	return out
+}
--- a/deploy/operator/internal/checkpoint/hash_test.go
+++ b/deploy/operator/internal/checkpoint/hash_test.go
+package checkpoint
+
+import (
+	"testing"
+
+	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestComputeIdentityHash checks that ComputeIdentityHash is deterministic, produces
+// 16 lowercase hex characters, distinguishes identities that differ, and treats
+// identities that normalize to the same value as equal.
+//
+// Each case hashes identity; cases with expectedHash pin the exact value, and cases
+// with otherIdentity compare the two hashes according to shouldMatch.
+func TestComputeIdentityHash(t *testing.T) {
+	tests := []struct {
+		name          string
+		identity      nvidiacomv1alpha1.DynamoCheckpointIdentity
+		expectError   bool
+		expectedHash  string // Only set for deterministic checks
+		otherIdentity *nvidiacomv1alpha1.DynamoCheckpointIdentity
+		shouldMatch   bool
+	}{
+		{
+			name: "basic identity produces deterministic hash",
+			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+			},
+			expectError:  false,
+			expectedHash: "96429b2725761a09", // Known hash for this specific identity
+		},
+		{
+			name: "identity with all fields produces deterministic hash",
+			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:                "meta-llama/Llama-2-13b-hf",
+				BackendFramework:     "sglang",
+				DynamoVersion:        "0.4.2",
+				TensorParallelSize:   2,
+				PipelineParallelSize: 1,
+				Dtype:                "float16",
+				MaxModelLen:          4096,
+				ExtraParameters: map[string]string{
+					"gpu_memory_utilization": "0.9",
+				},
+			},
+			expectError:  false,
+			expectedHash: "f4ba65bccbb8e4fb", // Known hash for this specific identity
+		},
+		{
+			name: "same identity produces same hash",
+			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+			},
+			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+			},
+			expectError: false,
+			shouldMatch: true,
+		},
+		{
+			name: "different models produce different hashes",
+			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+			},
+			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-13b-hf",
+				BackendFramework: "vllm",
+			},
+			expectError: false,
+			shouldMatch: false,
+		},
+		{
+			name: "different frameworks produce different hashes",
+			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+			},
+			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "sglang",
+			},
+			expectError: false,
+			shouldMatch: false,
+		},
+		{
+			// Comparing an explicit zero with an omitted field would be vacuous in Go
+			// (both are the zero value), so compare unset sizes against explicit 1
+			// to exercise the default applied during normalization.
+			name: "normalization: unset parallel sizes default to 1",
+			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+				// TensorParallelSize and PipelineParallelSize omitted (zero value)
+			},
+			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:                "meta-llama/Llama-2-7b-hf",
+				BackendFramework:     "vllm",
+				TensorParallelSize:   1,
+				PipelineParallelSize: 1,
+			},
+			expectError: false,
+			shouldMatch: true,
+		},
+		{
+			name: "normalization: empty vs nil map",
+			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+				ExtraParameters:  map[string]string{},
+			},
+			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+				ExtraParameters:  nil,
+			},
+			expectError: false,
+			shouldMatch: true,
+		},
+		{
+			name: "extra parameters order should not matter",
+			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+				ExtraParameters: map[string]string{
+					"param_a": "value1",
+					"param_b": "value2",
+				},
+			},
+			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+				ExtraParameters: map[string]string{
+					"param_b": "value2",
+					"param_a": "value1",
+				},
+			},
+			expectError: false,
+			shouldMatch: true,
+		},
+		{
+			name: "different extra parameters produce different hashes",
+			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+				ExtraParameters: map[string]string{
+					"gpu_memory_utilization": "0.9",
+				},
+			},
+			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
+				Model:            "meta-llama/Llama-2-7b-hf",
+				BackendFramework: "vllm",
+				ExtraParameters: map[string]string{
+					"gpu_memory_utilization": "0.8",
+				},
+			},
+			expectError: false,
+			shouldMatch: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			hash1, err1 := ComputeIdentityHash(tt.identity)
+
+			if tt.expectError {
+				require.Error(t, err1)
+				return
+			}
+
+			require.NoError(t, err1)
+			assert.NotEmpty(t, hash1, "hash should not be empty")
+			assert.Len(t, hash1, 16, "hash should be 16 characters (64 bits)")
+			// Verify it's hex
+			assert.Regexp(t, "^[0-9a-f]{16}$", hash1, "hash should be 16 hex characters")
+
+			// If we have an expected hash, check it
+			if tt.expectedHash != "" {
+				assert.Equal(t, tt.expectedHash, hash1)
+			}
+
+			// If we have another identity to compare, compute its hash
+			if tt.otherIdentity != nil {
+				hash2, err2 := ComputeIdentityHash(*tt.otherIdentity)
+				require.NoError(t, err2)
+
+				if tt.shouldMatch {
+					assert.Equal(t, hash1, hash2, "hashes should match")
+				} else {
+					assert.NotEqual(t, hash1, hash2, "hashes should differ")
+				}
+			}
+		})
+	}
+}
--- a/deploy/operator/internal/consts/consts.go
+++ b/deploy/operator/internal/consts/consts.go
@@ -117,6 +117,89 @@ const (
 	ResourceStateReady    = "ready"
 	ResourceStateNotReady = "not_ready"
 	ResourceStateUnknown  = "unknown"
+	// Checkpoint related constants
+	KubeLabelCheckpointSource = "nvidia.com/checkpoint-source"
+	KubeLabelCheckpointHash   = "nvidia.com/checkpoint-hash"
+	KubeLabelCheckpointName   = "nvidia.com/checkpoint-name"
+
+	// EnvCheckpointStorageType indicates the storage backend type (pvc, s3, oci)
+	EnvCheckpointStorageType = "DYN_CHECKPOINT_STORAGE_TYPE"
+	// EnvCheckpointLocation is the source location of the checkpoint
+	// For PVC: same as path (e.g., /checkpoints/{hash}.tar)
+	// For S3: s3://bucket/prefix/{hash}.tar
+	// For OCI: oci://registry/repo:{hash}
+	EnvCheckpointLocation = "DYN_CHECKPOINT_LOCATION"
+	// EnvCheckpointPath is the local path to the checkpoint tar file
+	// For PVC: same as location
+	// For S3/OCI: download destination (e.g., /tmp/{hash}.tar)
+	EnvCheckpointPath = "DYN_CHECKPOINT_PATH"
+	// EnvCheckpointHash is the identity hash (for debugging/observability)
+	EnvCheckpointHash = "DYN_CHECKPOINT_HASH"
+	// EnvCheckpointSignalFile is the full path to the signal file
+	// The DaemonSet writes this file after checkpoint is complete
+	// The checkpoint job pod waits for this file, then exits successfully
+	EnvCheckpointSignalFile = "DYN_CHECKPOINT_SIGNAL_FILE"
+
+	// EnvCheckpointReadyFile is the full path to a file the worker creates
+	// when the model is loaded and ready for checkpointing.
+	// The readiness probe watches this file to trigger DaemonSet checkpoint.
+	EnvCheckpointReadyFile = "DYN_CHECKPOINT_READY_FILE"
+
+	// CRIU-related environment variables for restore operations
+	// EnvRestoreMarkerFile is the file created by CRIU after successful restore
+	EnvRestoreMarkerFile = "DYN_RESTORE_MARKER_FILE"
+	// EnvCRIUWorkDir is the working directory for CRIU operations
+	EnvCRIUWorkDir = "CRIU_WORK_DIR"
+	// EnvCRIULogDir is the directory where CRIU writes logs
+	EnvCRIULogDir = "CRIU_LOG_DIR"
+	// EnvCUDAPluginDir is the directory containing CRIU CUDA plugins
+	EnvCUDAPluginDir = "CUDA_PLUGIN_DIR"
+	// EnvCRIUTimeout is the timeout for CRIU operations
+	EnvCRIUTimeout = "CRIU_TIMEOUT"
+
+	// CheckpointReadyFilePath is the default path for the ready file
+	CheckpointReadyFilePath = "/tmp/checkpoint-ready"
+	// RestoreMarkerFilePath is the default path for the restore marker
+	RestoreMarkerFilePath = "/tmp/dynamo-restored"
+	// CRIUWorkDirPath is the default CRIU work directory
+	CRIUWorkDirPath = "/var/criu-work"
+	// CRIULogDirPath is the default CRIU log directory
+	CRIULogDirPath = "/checkpoints/restore-logs"
+	// CUDAPluginDirPath is the default CUDA plugin directory
+	CUDAPluginDirPath = "/usr/local/lib/criu"
+	// DefaultCRIUTimeout is the default CRIU timeout in seconds (6 hours)
+	DefaultCRIUTimeout = "21600"
+
+	CheckpointVolumeName       = "checkpoint-storage"
+	CheckpointSignalVolumeName = "checkpoint-signal"
+	CheckpointBasePath         = "/checkpoints"
+	CheckpointSignalHostPath   = "/var/lib/dynamo-checkpoint/signals"
+	CheckpointSignalMountPath  = "/checkpoint-signal"
+
+	// PodInfo volume for Downward API (critical for CRIU restore)
+	// After CRIU restore, environment variables contain stale values from checkpoint pod.
+	// The Downward API files at /etc/podinfo always have current pod identity.
+	PodInfoVolumeName = "podinfo"
+	PodInfoMountPath  = "/etc/podinfo"
+
+	// Downward API field paths
+	PodInfoFieldPodName      = "metadata.name"
+	PodInfoFieldPodUID       = "metadata.uid"
+	PodInfoFieldPodNamespace = "metadata.namespace"
+
+	// Downward API file names for DGD annotations
+	PodInfoFileDynNamespace        = "dyn_namespace"
+	PodInfoFileDynComponent        = "dyn_component"
+	PodInfoFileDynParentDGDName    = "dyn_parent_dgd_name"
+	PodInfoFileDynParentDGDNS      = "dyn_parent_dgd_namespace"
+	PodInfoFileDynDiscoveryBackend = "dyn_discovery_backend"
+
+	// Annotation keys for DGD info (exposed via Downward API)
+	AnnotationDynNamespace        = "nvidia.com/dyn-namespace"
+	AnnotationDynComponent        = "nvidia.com/dyn-component"
+	AnnotationDynParentDGDName    = "nvidia.com/dyn-parent-dgd-name"
+	AnnotationDynParentDGDNS      = "nvidia.com/dyn-parent-dgd-namespace"
+	AnnotationDynDiscoveryBackend = "nvidia.com/dyn-discovery-backend"
 )

 type MultinodeDeploymentType string

--- a/deploy/operator/internal/controller/dynamocheckpoint_controller.go
+++ b/deploy/operator/internal/controller/dynamocheckpoint_controller.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package controller
+
+import (
+	"context"
+	"fmt"
+
+	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/tools/record"
+	"k8s.io/utils/ptr"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/builder"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/event"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/predicate"
+
+	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
+	"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
+	commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
+)
+
+// CheckpointReconciler reconciles a DynamoCheckpoint object by creating a checkpoint Job and tracking it through the Pending, Creating, Ready and Failed phases.
+type CheckpointReconciler struct {
+	client.Client
+	Config   commonController.Config // operator configuration; its Checkpoint section supplies the PVC settings, signal host path and init container image
+	Recorder record.EventRecorder    // used to emit CheckpointReady / CheckpointFailed events on the DynamoCheckpoint
+}
+
+// getCheckpointLocation builds the storage location of the checkpoint identified
+// by identityHash: the PVC base path resolved from the operator's checkpoint
+// configuration, followed by "/" and the hash.
+func (r *CheckpointReconciler) getCheckpointLocation(identityHash string) string {
+	location := fmt.Sprintf("%s/%s", checkpoint.GetPVCBasePath(&r.Config.Checkpoint), identityHash)
+	return location
+}
+
+// getCheckpointStorageType reports the storage backend recorded on checkpoints.
+// It always returns the PVC storage type, converted to the API enum type.
+func (r *CheckpointReconciler) getCheckpointStorageType() nvidiacomv1alpha1.DynamoCheckpointStorageType {
+	storageType := commonController.CheckpointStorageTypePVC
+	return nvidiacomv1alpha1.DynamoCheckpointStorageType(storageType)
+}
+
+// GetRecorder returns the event recorder held in r.Recorder (implements the controller_common.Reconciler interface).
+func (r *CheckpointReconciler) GetRecorder() record.EventRecorder {
+	return r.Recorder
+}
+
+// getSignalHostPath picks the host directory used for checkpoint signal files.
+// The operator-configured path is used only when checkpointing is enabled and
+// a non-empty path is set; otherwise consts.CheckpointSignalHostPath is returned.
+func (r *CheckpointReconciler) getSignalHostPath() string {
+	cfg := &r.Config.Checkpoint
+	if !cfg.Enabled || cfg.Storage.SignalHostPath == "" {
+		return consts.CheckpointSignalHostPath
+	}
+	return cfg.Storage.SignalHostPath
+}
+
+// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/finalizers,verbs=update
+// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete
+
+// Reconcile processes one DynamoCheckpoint. On the first pass it computes and
+// stores the identity hash and moves the object to Pending; on later passes it
+// dispatches to the handler for the current phase. Ready and Failed are
+// terminal here, and any unrecognised phase is reset to Pending.
+func (r *CheckpointReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{}
+	if err := r.Get(ctx, req.NamespacedName, ckpt); err != nil {
+		// A missing object is not an error: there is nothing left to reconcile.
+		return ctrl.Result{}, client.IgnoreNotFound(err)
+	}
+
+	logger.Info("Reconciling DynamoCheckpoint", "name", ckpt.Name, "phase", ckpt.Status.Phase)
+
+	// First pass: derive the identity hash and persist it together with the initial phase.
+	if ckpt.Status.IdentityHash == "" {
+		identityHash, hashErr := checkpoint.ComputeIdentityHash(ckpt.Spec.Identity)
+		if hashErr != nil {
+			logger.Error(hashErr, "Failed to compute identity hash")
+			return ctrl.Result{}, fmt.Errorf("failed to compute identity hash: %w", hashErr)
+		}
+
+		ckpt.Status.IdentityHash = identityHash
+		ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhasePending
+
+		updateErr := r.Status().Update(ctx, ckpt)
+		if updateErr != nil {
+			logger.Error(updateErr, "Failed to update DynamoCheckpoint status with hash")
+		}
+		// On success the status update triggers the next reconcile via the watch.
+		return ctrl.Result{}, updateErr
+	}
+
+	switch ckpt.Status.Phase {
+	case nvidiacomv1alpha1.DynamoCheckpointPhasePending:
+		return r.handlePending(ctx, ckpt)
+	case nvidiacomv1alpha1.DynamoCheckpointPhaseCreating:
+		return r.handleCreating(ctx, ckpt)
+	case nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
+		nvidiacomv1alpha1.DynamoCheckpointPhaseFailed:
+		// Ready: nothing to do. Failed: retry logic could be implemented here.
+		return ctrl.Result{}, nil
+	}
+
+	// Unknown phase: reset to Pending so the state machine restarts.
+	ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhasePending
+	return ctrl.Result{}, r.Status().Update(ctx, ckpt)
+}
+
+// handlePending creates (or updates) the checkpoint Job for ckpt through
+// SyncResource, then records the Job name, sets the JobCreated condition and
+// moves the checkpoint to the Creating phase.
+func (r *CheckpointReconciler) handlePending(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+	jobName := "checkpoint-" + ckpt.Name
+
+	// The generator always returns a Job to keep (second result false).
+	generateJob := func(ctx context.Context) (*batchv1.Job, bool, error) {
+		return r.buildCheckpointJob(ckpt, jobName), false, nil
+	}
+	changed, _, syncErr := commonController.SyncResource(ctx, r, ckpt, generateJob)
+	if syncErr != nil {
+		logger.Error(syncErr, "Failed to sync checkpoint Job")
+		return ctrl.Result{}, syncErr
+	}
+	if changed {
+		logger.Info("Created/updated checkpoint Job", "job", jobName)
+	}
+
+	// Advance the state machine and remember which Job to watch.
+	jobCreated := metav1.Condition{
+		Type:               string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCreated),
+		Status:             metav1.ConditionTrue,
+		Reason:             "JobCreated",
+		Message:            fmt.Sprintf("Checkpoint job %s created", jobName),
+		LastTransitionTime: metav1.Now(),
+	}
+	ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseCreating
+	ckpt.Status.JobName = jobName
+	meta.SetStatusCondition(&ckpt.Status.Conditions, jobCreated)
+
+	// On success the status update triggers the next reconcile via the watch.
+	return ctrl.Result{}, r.Status().Update(ctx, ckpt)
+}
+
+// handleCreating inspects the checkpoint Job referenced by ckpt.Status.JobName
+// and advances the checkpoint accordingly:
+//   - Job not found: the checkpoint goes back to Pending so the Job is recreated.
+//   - Job has a succeeded pod: the checkpoint becomes Ready and its location and
+//     storage type are recorded.
+//   - Job carries a Failed condition with status True: the checkpoint becomes Failed.
+//   - otherwise the Job is still running (possibly retrying) and nothing changes.
+//
+// Failure is deliberately detected through the Job's Failed condition rather
+// than job.Status.Failed. That counter is the number of failed pods, so with a
+// non-zero backoffLimit it becomes positive while the Job is still retrying;
+// treating it as terminal would mark the checkpoint Failed even though a later
+// attempt may succeed.
+func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	// Check Job status
+	job := &batchv1.Job{}
+	if err := r.Get(ctx, client.ObjectKey{Namespace: ckpt.Namespace, Name: ckpt.Status.JobName}, job); err != nil {
+		if !apierrors.IsNotFound(err) {
+			return ctrl.Result{}, err
+		}
+
+		// Job was deleted, go back to Pending
+		ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhasePending
+		ckpt.Status.JobName = ""
+		meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
+			Type:               string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCreated),
+			Status:             metav1.ConditionFalse,
+			Reason:             "JobDeleted",
+			Message:            "Checkpoint job was deleted",
+			LastTransitionTime: metav1.Now(),
+		})
+		return ctrl.Result{}, r.Status().Update(ctx, ckpt)
+	}
+
+	// Check if job succeeded
+	if job.Status.Succeeded > 0 {
+		logger.Info("Checkpoint Job succeeded", "job", job.Name)
+		r.Recorder.Event(ckpt, corev1.EventTypeNormal, "CheckpointReady", "Checkpoint creation completed successfully")
+
+		now := metav1.Now()
+		ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseReady
+		ckpt.Status.CreatedAt = &now
+
+		// Set checkpoint location and storage type using helper functions
+		ckpt.Status.Location = r.getCheckpointLocation(ckpt.Status.IdentityHash)
+		ckpt.Status.StorageType = r.getCheckpointStorageType()
+
+		meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
+			Type:               string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted),
+			Status:             metav1.ConditionTrue,
+			Reason:             "JobSucceeded",
+			Message:            "Checkpoint job completed successfully",
+			LastTransitionTime: metav1.Now(),
+		})
+		meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
+			Type:               string(nvidiacomv1alpha1.DynamoCheckpointConditionTarAvailable),
+			Status:             metav1.ConditionTrue,
+			Reason:             "TarCreated",
+			Message:            fmt.Sprintf("Checkpoint available at %s", ckpt.Status.Location),
+			LastTransitionTime: metav1.Now(),
+		})
+
+		return ctrl.Result{}, r.Status().Update(ctx, ckpt)
+	}
+
+	// Check if the Job as a whole failed (backoff limit exhausted, deadline exceeded, ...)
+	if failed := checkpointJobFailedCondition(job); failed != nil {
+		logger.Info("Checkpoint Job failed", "job", job.Name, "reason", failed.Reason)
+		r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", "Checkpoint creation failed")
+
+		// Surface the Job controller's reason/message so users can see why it failed.
+		detail := "Checkpoint job failed"
+		if failed.Reason != "" {
+			detail = fmt.Sprintf("%s: %s", detail, failed.Reason)
+		}
+		if failed.Message != "" {
+			detail = fmt.Sprintf("%s (%s)", detail, failed.Message)
+		}
+
+		ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseFailed
+		ckpt.Status.Message = detail
+		meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
+			Type:               string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted),
+			Status:             metav1.ConditionFalse,
+			Reason:             "JobFailed",
+			Message:            detail,
+			LastTransitionTime: metav1.Now(),
+		})
+
+		return ctrl.Result{}, r.Status().Update(ctx, ckpt)
+	}
+
+	// Job is still running or retrying - we'll be notified via Update event when status changes
+	return ctrl.Result{}, nil
+}
+
+// checkpointJobFailedCondition returns the Job's Failed condition when it is
+// present with status True, or nil when the Job has not terminally failed.
+func checkpointJobFailedCondition(job *batchv1.Job) *batchv1.JobCondition {
+	for i := range job.Status.Conditions {
+		cond := &job.Status.Conditions[i]
+		if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue {
+			return cond
+		}
+	}
+	return nil
+}
+
+// buildCheckpointJob renders the batch Job that produces the checkpoint for ckpt.
+//
+// It starts from a deep copy of ckpt.Spec.Job.PodTemplateSpec and then:
+//   - adds the checkpoint name/hash/source labels to the pod template,
+//   - adds the hostPath signal volume and an init container that removes a
+//     stale signal file for this identity hash,
+//   - on the first container: injects the checkpoint env vars, the signal,
+//     checkpoint-PVC (when a PVC name is configured) and podinfo mounts, and
+//     replaces the probes with a ready-file readiness probe,
+//   - forces RestartPolicy=Never and the io_uring-blocking seccomp profile,
+//   - applies defaults for activeDeadlineSeconds (3600), backoffLimit (3) and
+//     ttlSecondsAfterFinished (300) when the spec leaves them unset.
+//
+// The returned Job is named jobName and lives in ckpt's namespace. ckpt itself
+// is not modified.
+func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.DynamoCheckpoint, jobName string) *batchv1.Job {
+	// Use the pod template from the spec
+	podTemplate := ckpt.Spec.Job.PodTemplateSpec.DeepCopy()
+
+	// Add checkpoint-related labels
+	if podTemplate.Labels == nil {
+		podTemplate.Labels = make(map[string]string)
+	}
+	podTemplate.Labels[consts.KubeLabelCheckpointName] = ckpt.Name
+	podTemplate.Labels[consts.KubeLabelCheckpointHash] = ckpt.Status.IdentityHash
+	podTemplate.Labels[consts.KubeLabelCheckpointSource] = "true"
+
+	// Add signal volume (hostPath for communication with DaemonSet)
+	// The DaemonSet writes a signal file after checkpoint is complete
+	hostPathType := corev1.HostPathDirectoryOrCreate
+	podTemplate.Spec.Volumes = append(podTemplate.Spec.Volumes, corev1.Volume{
+		Name: consts.CheckpointSignalVolumeName,
+		VolumeSource: corev1.VolumeSource{
+			HostPath: &corev1.HostPathVolumeSource{
+				Path: r.getSignalHostPath(),
+				Type: &hostPathType,
+			},
+		},
+	})
+
+	// Compute the signal file path - unique per checkpoint hash
+	signalFilePath := consts.CheckpointSignalMountPath + "/" + ckpt.Status.IdentityHash + ".done"
+
+	// Add initContainer to clean up any leftover signal file from previous runs
+	// This ensures a fresh start for each checkpoint job without affecting the checkpoint itself
+	// InitContainers complete before the main container starts, so they don't appear in the checkpoint
+	initContainerImage := r.Config.Checkpoint.InitContainerImage
+
+	podTemplate.Spec.InitContainers = append(podTemplate.Spec.InitContainers, corev1.Container{
+		Name:  "cleanup-signal-file",
+		Image: initContainerImage,
+		Command: []string{
+			"sh",
+			"-c",
+			fmt.Sprintf("rm -f %s || true; echo 'Signal file cleanup complete'", signalFilePath),
+		},
+		VolumeMounts: []corev1.VolumeMount{
+			{
+				Name:      consts.CheckpointSignalVolumeName,
+				MountPath: consts.CheckpointSignalMountPath,
+			},
+		},
+	})
+
+	// Add checkpoint env vars and volume mounts to main container
+	if len(podTemplate.Spec.Containers) > 0 {
+		mainContainer := &podTemplate.Spec.Containers[0]
+
+		// Compute checkpoint location and storage type using helper functions
+		checkpointLocation := r.getCheckpointLocation(ckpt.Status.IdentityHash)
+		storageType := string(r.getCheckpointStorageType())
+
+		// Add checkpoint-related env vars
+		mainContainer.Env = append(mainContainer.Env,
+			// Signal file: DaemonSet writes this after checkpoint completes
+			corev1.EnvVar{
+				Name:  consts.EnvCheckpointSignalFile,
+				Value: signalFilePath,
+			},
+			// Ready file: Worker creates this when model is loaded
+			corev1.EnvVar{
+				Name:  consts.EnvCheckpointReadyFile,
+				Value: consts.CheckpointReadyFilePath,
+			},
+			// Checkpoint hash: For idempotency check
+			corev1.EnvVar{
+				Name:  consts.EnvCheckpointHash,
+				Value: ckpt.Status.IdentityHash,
+			},
+			// Checkpoint location: For idempotency check
+			corev1.EnvVar{
+				Name:  consts.EnvCheckpointLocation,
+				Value: checkpointLocation,
+			},
+			// Storage type: For idempotency check (pvc, s3, oci)
+			corev1.EnvVar{
+				Name:  consts.EnvCheckpointStorageType,
+				Value: storageType,
+			},
+		)
+
+		// Add signal volume mount (required for DaemonSet communication)
+		mainContainer.VolumeMounts = append(mainContainer.VolumeMounts,
+			corev1.VolumeMount{
+				Name:      consts.CheckpointSignalVolumeName,
+				MountPath: consts.CheckpointSignalMountPath,
+			},
+		)
+
+		// Add checkpoint PVC volume and mount for mount namespace consistency with restore pods
+		// CRIU requires the exact same mount layout between checkpoint and restore
+		if r.Config.Checkpoint.Storage.PVC.PVCName != "" {
+			pvcName := r.Config.Checkpoint.Storage.PVC.PVCName
+			basePath := r.Config.Checkpoint.Storage.PVC.BasePath
+			if basePath == "" {
+				basePath = consts.CheckpointBasePath
+			}
+			checkpoint.InjectCheckpointVolume(&podTemplate.Spec, pvcName)
+			checkpoint.InjectCheckpointVolumeMount(mainContainer, basePath)
+		}
+
+		// Add Downward API volume for pod identity (mount namespace consistency with restore pods)
+		checkpoint.InjectPodInfoVolume(&podTemplate.Spec)
+		checkpoint.InjectPodInfoVolumeMount(mainContainer)
+
+		// Override probes for checkpoint mode
+		// Checkpoint jobs need different probe behavior than regular worker pods:
+		// - Readiness: Wait for model to load before checkpoint
+		// - Liveness/Startup: Remove to prevent restarts during slow model loading
+		mainContainer.ReadinessProbe = &corev1.Probe{
+			ProbeHandler: corev1.ProbeHandler{
+				Exec: &corev1.ExecAction{
+					Command: []string{"cat", consts.CheckpointReadyFilePath},
+				},
+			},
+			InitialDelaySeconds: 15,
+			PeriodSeconds:       2,
+		}
+		// Remove liveness probe - we don't want restarts during model loading
+		mainContainer.LivenessProbe = nil
+		// Remove startup probe - not needed for checkpoint jobs
+		mainContainer.StartupProbe = nil
+	}
+
+	// Set restart policy to Never for Jobs
+	podTemplate.Spec.RestartPolicy = corev1.RestartPolicyNever
+
+	// Apply seccomp profile to block io_uring syscalls
+	// CRIU doesn't support io_uring memory mappings, so we must block these syscalls.
+	// Only the seccomp profile is overridden: any other pod-level security settings
+	// supplied in the user's template (runAsUser, fsGroup, ...) are preserved.
+	if podTemplate.Spec.SecurityContext == nil {
+		podTemplate.Spec.SecurityContext = &corev1.PodSecurityContext{}
+	}
+	podTemplate.Spec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{
+		Type:             corev1.SeccompProfileTypeLocalhost,
+		LocalhostProfile: ptr.To("profiles/block-iouring.json"),
+	}
+
+	// Build the Job, falling back to defaults for any unset tuning knob
+	activeDeadlineSeconds := ckpt.Spec.Job.ActiveDeadlineSeconds
+	if activeDeadlineSeconds == nil {
+		activeDeadlineSeconds = ptr.To(int64(3600))
+	}
+
+	backoffLimit := ckpt.Spec.Job.BackoffLimit
+	if backoffLimit == nil {
+		backoffLimit = ptr.To(int32(3))
+	}
+
+	ttlSeconds := ckpt.Spec.Job.TTLSecondsAfterFinished
+	if ttlSeconds == nil {
+		ttlSeconds = ptr.To(int32(300))
+	}
+
+	job := &batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      jobName,
+			Namespace: ckpt.Namespace,
+			Labels: map[string]string{
+				consts.KubeLabelCheckpointName: ckpt.Name,
+				consts.KubeLabelCheckpointHash: ckpt.Status.IdentityHash,
+			},
+		},
+		Spec: batchv1.JobSpec{
+			ActiveDeadlineSeconds:   activeDeadlineSeconds,
+			BackoffLimit:            backoffLimit,
+			TTLSecondsAfterFinished: ttlSeconds,
+			Template:                *podTemplate,
+		},
+	}
+
+	return job
+}
+
+// SetupWithManager registers the reconciler with mgr. It reconciles
+// DynamoCheckpoint objects and the Jobs they own; Job create events are
+// filtered out, while update, delete and generic Job events trigger a reconcile.
+func (r *CheckpointReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	// Job creation is ignored - we don't need to reconcile when we just created the Job.
+	ownedJobEvents := predicate.Funcs{
+		CreateFunc:  func(event.CreateEvent) bool { return false },
+		UpdateFunc:  func(event.UpdateEvent) bool { return true },
+		DeleteFunc:  func(event.DeleteEvent) bool { return true },
+		GenericFunc: func(event.GenericEvent) bool { return true },
+	}
+
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&nvidiacomv1alpha1.DynamoCheckpoint{}).
+		Owns(&batchv1.Job{}, builder.WithPredicates(ownedJobEvents)).
+		WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)).
+		Complete(r)
+}
--- a/deploy/operator/internal/controller/dynamocomponentdeployment_controller.go
+++ b/deploy/operator/internal/controller/dynamocomponentdeployment_controller.go
@@ -34,6 +34,7 @@ import (

 	"emperror.dev/errors"
 	"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/common"
 	commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
 	commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
@@ -1200,7 +1201,17 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex

 	isDebugModeEnabled := checkIfIsDebugModeEnabled(resourceAnnotations)

-	podSpec, err := dynamo.GenerateBasePodSpecForController(opt.dynamoComponentDeployment, r.DockerSecretRetriever, r.Config, role, commonconsts.MultinodeDeploymentTypeLWS)
+	// Resolve checkpoint for this component
+	var checkpointInfo *checkpoint.CheckpointInfo
+	if opt.dynamoComponentDeployment.Spec.Checkpoint != nil && opt.dynamoComponentDeployment.Spec.Checkpoint.Enabled {
+		info, err := checkpoint.ResolveCheckpointForService(ctx, r.Client, opt.dynamoComponentDeployment.Namespace, opt.dynamoComponentDeployment.Spec.Checkpoint)
+		if err != nil {
+			return nil, errors.Wrap(err, "failed to resolve checkpoint")
+		}
+		checkpointInfo = info
+	}
+
+	podSpec, err := dynamo.GenerateBasePodSpecForController(opt.dynamoComponentDeployment, r.DockerSecretRetriever, r.Config, role, commonconsts.MultinodeDeploymentTypeLWS, checkpointInfo)
 	if err != nil {
 		err = errors.Wrap(err, "failed to generate base pod spec")
 		return nil, err

--- a/deploy/operator/internal/controller/dynamographdeployment_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeployment_controller.go
@@ -26,6 +26,7 @@ import (
 	grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
 	"k8s.io/apimachinery/pkg/api/errors"

+	"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/discovery"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/secret"

@@ -46,7 +47,6 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/controller-runtime/pkg/predicate"

-	"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
 	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
 	commoncontroller "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
@@ -211,7 +211,7 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
 type Resource interface {
 	IsReady() (ready bool, reason string)
 	GetName() string
-	GetServiceStatuses() map[string]v1alpha1.ServiceReplicaStatus
+	GetServiceStatuses() map[string]nvidiacomv1alpha1.ServiceReplicaStatus
 }

 type ReconcileResult struct {
@@ -267,6 +267,14 @@ func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context
 		return ReconcileResult{}, fmt.Errorf("failed to reconcile top-level PVCs: %w", err)
 	}

+	// Reconcile checkpoints for services with checkpointing enabled
+	checkpointStatuses, checkpointInfos, err := r.reconcileCheckpoints(ctx, dynamoDeployment)
+	if err != nil {
+		logger.Error(err, "Failed to reconcile checkpoints")
+		return ReconcileResult{}, fmt.Errorf("failed to reconcile checkpoints: %w", err)
+	}
+	dynamoDeployment.Status.Checkpoints = checkpointStatuses
+
 	// Reconcile DynamoGraphDeploymentScalingAdapters for each service
 	err = r.reconcileScalingAdapters(ctx, dynamoDeployment)
 	if err != nil {
@@ -313,7 +321,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context
 	var result ReconcileResult
 	if r.isGrovePathway(dynamoDeployment) {
 		logger.Info("Reconciling Grove resources", "hasMultinode", hasMultinode, "lwsEnabled", r.Config.LWS.Enabled)
-		result, err = r.reconcileGroveResources(ctx, dynamoDeployment, restartState)
+		result, err = r.reconcileGroveResources(ctx, dynamoDeployment, restartState, checkpointInfos)
 	} else {
 		logger.Info("Reconciling Dynamo components deployments", "hasMultinode", hasMultinode, "lwsEnabled", r.Config.LWS.Enabled)
 		result, err = r.reconcileDynamoComponentsDeployments(ctx, dynamoDeployment, restartState)
@@ -432,7 +440,7 @@ func (r *DynamoGraphDeploymentReconciler) scaleGroveResource(ctx context.Context
 	return err
 }

-func (r *DynamoGraphDeploymentReconciler) reconcileGrovePodCliqueSet(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState) (*commoncontroller.Resource, error) {
+func (r *DynamoGraphDeploymentReconciler) reconcileGrovePodCliqueSet(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState, checkpointInfos map[string]*checkpoint.CheckpointInfo) (*commoncontroller.Resource, error) {
 	logger := log.FromContext(ctx)

 	existingRestartAnnotations, err := r.getExistingRestartAnnotationsPCS(ctx, dynamoDeployment)
@@ -442,7 +450,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGrovePodCliqueSet(ctx context
 	}

 	// generate the dynamoComponentsDeployments from the config
-	grovePodCliqueSet, err := dynamo.GenerateGrovePodCliqueSet(ctx, dynamoDeployment, r.Config, r.DockerSecretRetriever, restartState, existingRestartAnnotations)
+	grovePodCliqueSet, err := dynamo.GenerateGrovePodCliqueSet(ctx, dynamoDeployment, r.Config, r.DockerSecretRetriever, restartState, existingRestartAnnotations, checkpointInfos)
 	if err != nil {
 		logger.Error(err, "failed to generate the Grove GangSet")
 		return nil, fmt.Errorf("failed to generate the Grove GangSet: %w", err)
@@ -456,7 +464,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGrovePodCliqueSet(ctx context
 	}
 	syncedGrovePodCliqueSetAsResource, err := commoncontroller.NewResourceWithServiceStatuses(
 		syncedGrovePodCliqueSet,
-		func() (bool, string, map[string]v1alpha1.ServiceReplicaStatus) {
+		func() (bool, string, map[string]nvidiacomv1alpha1.ServiceReplicaStatus) {
 			// Grove readiness: all underlying PodCliques and PodCliqueScalingGroups have replicas == availableReplicas
 			allComponentsReady, reason, serviceStatuses := dynamo.GetComponentReadinessAndServiceReplicaStatuses(ctx, r.Client, dynamoDeployment)
 			if !allComponentsReady {
@@ -542,10 +550,10 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
 	return nil
 }

-func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState) (ReconcileResult, error) {
+func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState, checkpointInfos map[string]*checkpoint.CheckpointInfo) (ReconcileResult, error) {
 	logger := log.FromContext(ctx)

-	grovePodCliqueSetAsResource, err := r.reconcileGrovePodCliqueSet(ctx, dynamoDeployment, restartState)
+	grovePodCliqueSetAsResource, err := r.reconcileGrovePodCliqueSet(ctx, dynamoDeployment, restartState, checkpointInfos)
 	if err != nil {
 		logger.Error(err, "failed to reconcile the Grove PodClique Set")
 		return ReconcileResult{}, fmt.Errorf("failed to reconcile the Grove PodClique Set: %w", err)
@@ -912,7 +920,7 @@ func (r *DynamoGraphDeploymentReconciler) checkResourcesReadiness(resources []Re

 	var notReadyReasons []string
 	notReadyResources := []string{}
-	serviceStatuses := make(map[string]v1alpha1.ServiceReplicaStatus)
+	serviceStatuses := make(map[string]nvidiacomv1alpha1.ServiceReplicaStatus)
 	for _, resource := range resources {
 		ready, reason := resource.IsReady()

@@ -1106,6 +1114,205 @@ func (r *DynamoGraphDeploymentReconciler) reconcilePVCs(ctx context.Context, dyn
 	return nil
 }

+// reconcileCheckpoints resolves the DynamoCheckpoint for every service that has checkpointing enabled;
+// in Auto mode it creates the DynamoCheckpoint CR when none was resolved. Services without an enabled checkpoint are skipped.
+// Returns per-service checkpoint statuses and per-service checkpoint infos (both keyed by service name), or the first error.
+func (r *DynamoGraphDeploymentReconciler) reconcileCheckpoints(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (map[string]nvidiacomv1alpha1.ServiceCheckpointStatus, map[string]*checkpoint.CheckpointInfo, error) {
+	logger := log.FromContext(ctx)
+	statuses := make(map[string]nvidiacomv1alpha1.ServiceCheckpointStatus)
+	checkpointInfos := make(map[string]*checkpoint.CheckpointInfo)
+
+	for serviceName, component := range dynamoDeployment.Spec.Services {
+		if component.Checkpoint == nil || !component.Checkpoint.Enabled {
+			continue
+		}
+
+		logger.Info("Reconciling checkpoint for service", "service", serviceName)
+
+		// Resolve the checkpoint described by this service's checkpoint config
+		info, err := checkpoint.ResolveCheckpointForService(ctx, r.Client, dynamoDeployment.Namespace, component.Checkpoint)
+		if err != nil {
+			logger.Error(err, "Failed to resolve checkpoint for service", "service", serviceName)
+			return nil, nil, fmt.Errorf("failed to resolve checkpoint for service %s: %w", serviceName, err)
+		}
+
+		// Keep the info for pod spec generation; the map stores the pointer, so the Auto-mode updates below are visible through it
+		checkpointInfos[serviceName] = info
+
+		// No checkpoint was resolved (empty CheckpointName) and the mode is Auto: create one
+		if info.CheckpointName == "" && component.Checkpoint.Mode == nvidiacomv1alpha1.CheckpointModeAuto {
+			logger.Info("Creating DynamoCheckpoint CR in Auto mode", "service", serviceName)
+
+			ckpt, err := r.createCheckpointCR(ctx, dynamoDeployment, serviceName, component)
+			if err != nil {
+				logger.Error(err, "Failed to create DynamoCheckpoint CR", "service", serviceName)
+				return nil, nil, fmt.Errorf("failed to create checkpoint for service %s: %w", serviceName, err)
+			}
+
+			info.CheckpointName = ckpt.Name
+			// Compute the hash locally: the checkpoint status may not be populated yet (the checkpoint controller
+			// reconciles asynchronously). Identity is non-nil here because createCheckpointCR rejects a nil Identity.
+			hash, err := checkpoint.ComputeIdentityHash(*component.Checkpoint.Identity)
+			if err != nil {
+				logger.Error(err, "Failed to compute checkpoint identity hash", "service", serviceName)
+				return nil, nil, fmt.Errorf("failed to compute checkpoint hash for service %s: %w", serviceName, err)
+			}
+			info.Hash = hash
+			info.Ready = false // Newly created checkpoint is not ready yet
+		}
+
+		// Record the per-service status that is returned to the caller
+		statuses[serviceName] = nvidiacomv1alpha1.ServiceCheckpointStatus{
+			CheckpointName: info.CheckpointName,
+			IdentityHash:   info.Hash,
+			Ready:          info.Ready,
+		}
+	}
+
+	return statuses, checkpointInfos, nil
+}
+
+// createCheckpointCR syncs a DynamoCheckpoint CR, named after the identity hash, for a service in Auto mode; it errors when the checkpoint identity is nil
+func (r *DynamoGraphDeploymentReconciler) createCheckpointCR(
+	ctx context.Context,
+	dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment,
+	serviceName string,
+	component *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec,
+) (*nvidiacomv1alpha1.DynamoCheckpoint, error) {
+	if component.Checkpoint == nil || component.Checkpoint.Identity == nil {
+		return nil, fmt.Errorf("checkpoint identity is required for Auto mode")
+	}
+
+	identity := component.Checkpoint.Identity
+
+	// Compute the identity hash; it is used both as the CR name and as the checkpoint-hash label
+	hash, err := checkpoint.ComputeIdentityHash(*identity)
+	if err != nil {
+		return nil, fmt.Errorf("failed to compute identity hash: %w", err)
+	}
+
+	// Generate checkpoint name: use hash directly (16 chars, 64 bits)
+	// This allows natural deduplication - same identity = same checkpoint name
+	// 16 characters provide excellent collision resistance (1% at 500M configs)
+	ckptName := hash
+
+	// Use SyncResource to create/update the DynamoCheckpoint CR
+	// Pass nil as parentResource to create an independent checkpoint (no owner reference)
+	// This ensures the checkpoint persists even if the DGD is deleted
+	_, ckpt, err := commoncontroller.SyncResource(ctx, r, nil, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoCheckpoint, bool, error) {
+		// Copy the service's checkpoint identity field by field into the CR identity
+		checkpointIdentity := nvidiacomv1alpha1.DynamoCheckpointIdentity{
+			Model:                identity.Model,
+			BackendFramework:     identity.BackendFramework,
+			DynamoVersion:        identity.DynamoVersion,
+			TensorParallelSize:   identity.TensorParallelSize,
+			PipelineParallelSize: identity.PipelineParallelSize,
+			Dtype:                identity.Dtype,
+			MaxModelLen:          identity.MaxModelLen,
+			ExtraParameters:      identity.ExtraParameters,
+		}
+
+		// Build pod template from service spec for checkpoint job
+		// This uses GenerateBasePodSpec to ensure same config as worker pods (image pull secrets, etc.)
+		// Pass framework from checkpoint identity for accurate backend detection
+		podTemplate, err := r.buildCheckpointJobPodTemplate(
+			dynamoDeployment,
+			component,
+			serviceName,
+			identity.BackendFramework, // Use framework from checkpoint identity
+		)
+		if err != nil {
+			return nil, false, fmt.Errorf("failed to build checkpoint job pod template: %w", err)
+		}
+
+		ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      ckptName,
+				Namespace: dynamoDeployment.Namespace,
+				Labels: map[string]string{
+					consts.KubeLabelDynamoGraphDeploymentName: dynamoDeployment.Name,
+					consts.KubeLabelDynamoComponent:           serviceName,
+					consts.KubeLabelCheckpointHash:            hash,
+				},
+			},
+			Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{
+				Identity: checkpointIdentity,
+				Job: nvidiacomv1alpha1.DynamoCheckpointJobConfig{
+					PodTemplateSpec: podTemplate,
+				},
+			},
+		}
+		return ckpt, false, nil
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to sync checkpoint CR: %w", err)
+	}
+
+	return ckpt, nil
+}
+
+// buildCheckpointJobPodTemplate builds the pod template for the checkpoint job from the service spec.
+// It reuses GenerateBasePodSpec (on a copy with the checkpoint config removed, using RoleCheckpoint) so the job matches regular pods:
+// auto-discovered image pull secrets, envFromSecret, resources, security context, etc. RestartPolicy is forced to Never.
+func (r *DynamoGraphDeploymentReconciler) buildCheckpointJobPodTemplate(
+	dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment,
+	component *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec,
+	serviceName string,
+	framework string, // From checkpoint identity (e.g., "vllm", "sglang", "trtllm")
+) (corev1.PodTemplateSpec, error) {
+	// Validate the framework string and convert it to a BackendFramework; the parse error is returned unchanged
+	backendFramework, err := dynamo.ParseBackendFramework(framework)
+	if err != nil {
+		return corev1.PodTemplateSpec{}, err
+	}
+
+	// Create a copy of the component spec without checkpoint config
+	// The checkpoint job is CREATING the checkpoint, not restoring from one
+	componentForJob := component.DeepCopy()
+	componentForJob.Checkpoint = nil
+
+	// Ensure DYN_NAMESPACE is set for checkpoint job using the same logic as regular pods
+	// This is required for service discovery and distributed coordination
+	dynamoNamespace := dynamo.GetDynamoNamespace(dynamoDeployment, component)
+	componentForJob.DynamoNamespace = &dynamoNamespace
+
+	// Generate base PodSpec using the same logic as regular worker pods
+	// This includes: image pull secrets (auto-discovered + explicit), envFromSecret,
+	// resources, security context, tolerations, node selectors, etc.
+	//
+	// Note: For checkpoint jobs, we use Grove deployment type even though it's single-node.
+	// This is because GenerateBasePodSpec requires a valid MultinodeDeployer, and for
+	// single-node cases, the backends simply return early without modifications.
+	podSpec, err := dynamo.GenerateBasePodSpec(
+		componentForJob,
+		backendFramework,
+		r.DockerSecretRetriever,
+		dynamoDeployment.Name,
+		dynamoDeployment.Namespace,
+		dynamo.RoleCheckpoint, // Use checkpoint role
+		1,                     // Single node for checkpoint job
+		r.Config,
+		consts.MultinodeDeploymentTypeGrove, // Use Grove (single-node backends return early)
+		serviceName,
+		nil, // No checkpoint info for checkpoint creation jobs
+	)
+	if err != nil {
+		return corev1.PodTemplateSpec{}, fmt.Errorf("failed to generate base pod spec: %w", err)
+	}
+
+	// Override RestartPolicy for job (must be Never or OnFailure)
+	podSpec.RestartPolicy = corev1.RestartPolicyNever
+
+	return corev1.PodTemplateSpec{
+		ObjectMeta: metav1.ObjectMeta{
+			Labels: map[string]string{
+				consts.KubeLabelDynamoComponent: serviceName,
+			},
+		},
+		Spec: *podSpec,
+	}, nil
+}
+
 // reconcileScalingAdapters ensures a DynamoGraphDeploymentScalingAdapter exists for each service in the DGD
 // that has scaling adapter explicitly enabled. Services without scalingAdapter.enabled=true will not have a DGDSA.
 // This enables pluggable autoscaling via HPA, KEDA, or Planner.

--- a/deploy/operator/internal/controller/dynamographdeployment_controller_test.go
+++ b/deploy/operator/internal/controller/dynamographdeployment_controller_test.go
@@ -678,7 +678,7 @@ func Test_reconcileGroveResources(t *testing.T) {
 				},
 			}

-			result, err := reconciler.reconcileGroveResources(ctx, dgd, nil)
+			result, err := reconciler.reconcileGroveResources(ctx, dgd, nil, nil)
 			g.Expect(err).NotTo(gomega.HaveOccurred())

 			g.Expect(result).To(gomega.Equal(tt.wantReconcileResult))

--- a/deploy/operator/internal/controller_common/predicate.go
+++ b/deploy/operator/internal/controller_common/predicate.go
@@ -84,6 +84,9 @@ type Config struct {
 	// When true, controllers skip validation (webhooks handle it)
 	// When false, controllers perform validation (defense in depth)
 	WebhooksEnabled bool
+
+	// Checkpoint configuration for checkpoint/restore functionality
+	Checkpoint CheckpointConfig
 }

 // RBACConfig holds configuration for RBAC management
@@ -96,6 +99,65 @@ type RBACConfig struct {
 	EPPClusterRoleName string
 }

+// CheckpointConfig holds the operator-level configuration for checkpoint/restore functionality
+type CheckpointConfig struct {
+	// Enabled indicates if checkpoint functionality is enabled
+	Enabled bool
+	// Storage holds storage backend configuration
+	Storage CheckpointStorageConfig
+	// CRIUTimeout is the CRIU timeout in seconds, carried as a string (required for CUDA checkpoints/restores)
+	CRIUTimeout string
+	// InitContainerImage is the image used for init containers (e.g., signal file cleanup)
+	// Defaults to "busybox:latest" if not specified
+	InitContainerImage string
+}
+
+// Checkpoint storage type constants: the values documented for CheckpointStorageConfig.Type
+const (
+	CheckpointStorageTypePVC = "pvc"
+	CheckpointStorageTypeS3  = "s3"
+	CheckpointStorageTypeOCI = "oci"
+)
+
+// CheckpointStorageConfig holds storage backend configuration for checkpoints
+type CheckpointStorageConfig struct {
+	// Type is the storage backend type: "pvc", "s3", or "oci" (see the CheckpointStorageType* constants)
+	Type string
+	// SignalHostPath is the host path for signal files (used for checkpoint job coordination)
+	SignalHostPath string
+	// PVC configuration (used when Type=pvc)
+	PVC CheckpointPVCConfig
+	// S3 configuration (used when Type=s3)
+	S3 CheckpointS3Config
+	// OCI configuration (used when Type=oci)
+	OCI CheckpointOCIConfig
+}
+
+// CheckpointPVCConfig holds PVC (PersistentVolumeClaim) storage configuration for checkpoints
+type CheckpointPVCConfig struct {
+	// PVCName is the name of the PVC
+	PVCName string
+	// BasePath is the base directory within the PVC
+	BasePath string
+}
+
+// CheckpointS3Config holds S3 storage configuration for checkpoints
+type CheckpointS3Config struct {
+	// URI is the S3 URI (s3://[endpoint/]bucket/prefix); the endpoint segment is optional
+	URI string
+	// CredentialsSecretRef is the name of the credentials secret
+	// (should contain AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and optionally AWS_REGION)
+	CredentialsSecretRef string
+}
+
+// CheckpointOCIConfig holds OCI registry storage configuration for checkpoints
+type CheckpointOCIConfig struct {
+	// URI is the OCI URI (oci://registry/repository)
+	URI string
+	// CredentialsSecretRef is the name of the docker config secret
+	CredentialsSecretRef string
+}
+
 type IngressConfig struct {
 	VirtualServiceGateway      string
 	IngressControllerClassName string

--- a/deploy/operator/internal/controller_common/resource.go
+++ b/deploy/operator/internal/controller_common/resource.go
@@ -96,7 +96,7 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
 	err = r.Get(ctx, types.NamespacedName{Name: resourceName, Namespace: resourceNamespace}, oldResource)
 	oldResourceIsNotFound := errors.IsNotFound(err)
 	if err != nil && !oldResourceIsNotFound {
-		r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("Get%s", resourceType), "Failed to get %s %s: %s", resourceType, resourceNamespace, err)
+		r.GetRecorder().Eventf(resource, corev1.EventTypeWarning, fmt.Sprintf("Get%s", resourceType), "Failed to get %s %s: %s", resourceType, resourceNamespace, err)
 		logs.Error(err, "Failed to get resource.")
 		return
 	}
@@ -109,33 +109,39 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
 		}
 		logs.Info("Resource not found. Creating a new one.")

+		// Only set controller reference if parentResource is provided
+		// Passing nil as parentResource creates an independent resource (no owner reference)
+		if parentResource != nil {
 			err = ctrl.SetControllerReference(parentResource, resource, r.Scheme())
 			if err != nil {
 				logs.Error(err, "Failed to set controller reference.")
-			r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, "SetControllerReference", "Failed to set controller reference for %s %s: %s", resourceType, resourceNamespace, err)
+				r.GetRecorder().Eventf(resource, corev1.EventTypeWarning, "SetControllerReference", "Failed to set controller reference for %s %s: %s", resourceType, resourceNamespace, err)
 				return
 			}
+		} else {
+			logs.Info("No parent resource provided, creating resource without owner reference (independent lifecycle)")
+		}

 		var hash string
 		hash, err = GetSpecHash(resource)
 		if err != nil {
 			logs.Error(err, "Failed to get spec hash.")
-			r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, "GetSpecHash", "Failed to get spec hash for %s %s: %s", resourceType, resourceNamespace, err)
+			r.GetRecorder().Eventf(resource, corev1.EventTypeWarning, "GetSpecHash", "Failed to get spec hash for %s %s: %s", resourceType, resourceNamespace, err)
 			return
 		}

 		// On create, set generation to 1 (new resources start at generation 1)
 		updateAnnotations(resource, hash, 1)

-		r.GetRecorder().Eventf(parentResource, corev1.EventTypeNormal, fmt.Sprintf("Create%s", resourceType), "Creating a new %s %s", resourceType, resourceNamespace)
+		r.GetRecorder().Eventf(resource, corev1.EventTypeNormal, fmt.Sprintf("Create%s", resourceType), "Creating a new %s %s", resourceType, resourceNamespace)
 		err = r.Create(ctx, resource)
 		if err != nil {
 			logs.Error(err, "Failed to create Resource.")
-			r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("Create%s", resourceType), "Failed to create %s %s: %s", resourceType, resourceNamespace, err)
+			r.GetRecorder().Eventf(resource, corev1.EventTypeWarning, fmt.Sprintf("Create%s", resourceType), "Failed to create %s %s: %s", resourceType, resourceNamespace, err)
 			return
 		}
 		logs.Info(fmt.Sprintf("%s created.", resourceType))
-		r.GetRecorder().Eventf(parentResource, corev1.EventTypeNormal, fmt.Sprintf("Create%s", resourceType), "Created %s %s", resourceType, resourceNamespace)
+		r.GetRecorder().Eventf(resource, corev1.EventTypeNormal, fmt.Sprintf("Create%s", resourceType), "Created %s %s", resourceType, resourceNamespace)
 		modified = true
 		res = resource
 	} else {
@@ -145,11 +151,11 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
 			err = r.Delete(ctx, oldResource)
 			if err != nil {
 				logs.Error(err, fmt.Sprintf("Failed to delete %s.", resourceType))
-				r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("Delete%s", resourceType), "Failed to delete %s %s: %s", resourceType, resourceNamespace, err)
+				r.GetRecorder().Eventf(oldResource, corev1.EventTypeWarning, fmt.Sprintf("Delete%s", resourceType), "Failed to delete %s %s: %s", resourceType, resourceNamespace, err)
 				return
 			}
 			logs.Info(fmt.Sprintf("%s deleted.", resourceType))
-			r.GetRecorder().Eventf(parentResource, corev1.EventTypeNormal, fmt.Sprintf("Delete%s", resourceType), "Deleted %s %s", resourceType, resourceNamespace)
+			r.GetRecorder().Eventf(oldResource, corev1.EventTypeNormal, fmt.Sprintf("Delete%s", resourceType), "Deleted %s %s", resourceType, resourceNamespace)
 			modified = true
 			return
 		}
@@ -158,13 +164,13 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
 		var changeResult SpecChangeResult
 		changeResult, err = GetSpecChangeResult(oldResource, resource)
 		if err != nil {
-			r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("CalculatePatch%s", resourceType), "Failed to calculate patch for %s %s: %s", resourceType, resourceNamespace, err)
+			r.GetRecorder().Eventf(resource, corev1.EventTypeWarning, fmt.Sprintf("CalculatePatch%s", resourceType), "Failed to calculate patch for %s %s: %s", resourceType, resourceNamespace, err)
 			return false, resource, fmt.Errorf("failed to check if spec has changed: %w", err)
 		}

 		if !changeResult.NeedsUpdate {
 			logs.Info(fmt.Sprintf("%s spec is the same. Skipping update.", resourceType))
-			r.GetRecorder().Eventf(parentResource, corev1.EventTypeNormal, fmt.Sprintf("Update%s", resourceType), "Skipping update %s %s", resourceType, resourceNamespace)
+			r.GetRecorder().Eventf(oldResource, corev1.EventTypeNormal, fmt.Sprintf("Update%s", resourceType), "Skipping update %s %s", resourceType, resourceNamespace)
 			res = oldResource
 			return
 		}
@@ -188,7 +194,7 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
 		err = CopySpec(resource, oldResource)
 		if err != nil {
 			logs.Error(err, fmt.Sprintf("Failed to copy spec for %s.", resourceType))
-			r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("CopySpec%s", resourceType), "Failed to copy spec for %s %s: %s", resourceType, resourceNamespace, err)
+			r.GetRecorder().Eventf(oldResource, corev1.EventTypeWarning, fmt.Sprintf("CopySpec%s", resourceType), "Failed to copy spec for %s %s: %s", resourceType, resourceNamespace, err)
 			return
 		}

@@ -197,11 +203,11 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
 		err = r.Update(ctx, oldResource)
 		if err != nil {
 			logs.Error(err, fmt.Sprintf("Failed to update %s.", resourceType))
-			r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("Update%s", resourceType), "Failed to update %s %s: %s", resourceType, resourceNamespace, err)
+			r.GetRecorder().Eventf(oldResource, corev1.EventTypeWarning, fmt.Sprintf("Update%s", resourceType), "Failed to update %s %s: %s", resourceType, resourceNamespace, err)
 			return
 		}
 		logs.Info(fmt.Sprintf("%s updated.", resourceType))
-		r.GetRecorder().Eventf(parentResource, corev1.EventTypeNormal, fmt.Sprintf("Update%s", resourceType), "Updated %s %s", resourceType, resourceNamespace)
+		r.GetRecorder().Eventf(oldResource, corev1.EventTypeNormal, fmt.Sprintf("Update%s", resourceType), "Updated %s %s", resourceType, resourceNamespace)
 		modified = true
 		res = oldResource
 	}

--- a/deploy/operator/internal/dynamo/graph.go
+++ b/deploy/operator/internal/dynamo/graph.go
@@ -35,6 +35,7 @@ import (

 	grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
 	"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
 	commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/discovery"
@@ -252,7 +253,7 @@ func ParseDynDeploymentConfig(ctx context.Context, jsonContent []byte) (DynDeplo
 func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphDeployment *v1alpha1.DynamoGraphDeployment, defaultIngressSpec *v1alpha1.IngressSpec, restartState *RestartState, existingRestartAnnotations map[string]string) (map[string]*v1alpha1.DynamoComponentDeployment, error) {
 	deployments := make(map[string]*v1alpha1.DynamoComponentDeployment)
 	for componentName, component := range parentDynamoGraphDeployment.Spec.Services {
-		dynamoNamespace := getDynamoNamespace(parentDynamoGraphDeployment, component)
+		dynamoNamespace := GetDynamoNamespace(parentDynamoGraphDeployment, component)
 		deployment := &v1alpha1.DynamoComponentDeployment{}
 		deployment.Spec.DynamoComponentDeploymentSharedSpec = *component
 		deployment.Name = GetDynamoComponentName(parentDynamoGraphDeployment, componentName)
@@ -336,7 +337,7 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD
 	return deployments, nil
 }

-func getDynamoNamespace(object metav1.Object, service *v1alpha1.DynamoComponentDeploymentSharedSpec) string {
+func GetDynamoNamespace(object metav1.Object, service *v1alpha1.DynamoComponentDeploymentSharedSpec) string {
 	return v1alpha1.ComputeDynamoNamespace(service.GlobalDynamoNamespace, object.GetNamespace(), object.GetName())
 }

@@ -736,6 +737,7 @@ const (
 	RoleLeader     Role = "leader"
 	RoleWorker     Role = "worker"
 	RoleMain       Role = "main"
+	RoleCheckpoint Role = "checkpoint"
 )

 // Update ServiceRole struct for expandRolesForService
@@ -766,8 +768,21 @@ const (
 	BackendFrameworkSGLang BackendFramework = "sglang"
 	BackendFrameworkVLLM   BackendFramework = "vllm"
 	BackendFrameworkTRTLLM BackendFramework = "trtllm"
+	BackendFrameworkNoop   BackendFramework = "noop"
 )

+// ParseBackendFramework converts a string to BackendFramework type (exact, case-sensitive match).
+// Returns an error if the framework string is not one of vllm, sglang, trtllm, or noop.
+func ParseBackendFramework(framework string) (BackendFramework, error) {
+	bf := BackendFramework(framework)
+	switch bf {
+	case BackendFrameworkVLLM, BackendFrameworkSGLang, BackendFrameworkTRTLLM, BackendFrameworkNoop:
+		return bf, nil
+	default:
+		return "", fmt.Errorf("unsupported backend framework: %q (valid values: vllm, sglang, trtllm, noop)", framework)
+	}
+}
+
 // Backend interface for modular backend logic
 // Each backend (SGLang, VLLM, etc.) implements this interface
 type Backend interface {
@@ -897,6 +912,7 @@ func GenerateBasePodSpec(
 	controllerConfig controller_common.Config,
 	multinodeDeploymentType commonconsts.MultinodeDeploymentType,
 	serviceName string,
+	checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by ResolveCheckpointForService)
 ) (*corev1.PodSpec, error) {
 	// Start with base container generated per component type
 	componentContext := generateComponentContext(component, parentGraphDeploymentName, namespace, numberOfNodes, controllerConfig.GetDiscoveryBackend(component.Annotations))
@@ -1071,6 +1087,23 @@ func GenerateBasePodSpec(
 	podSpec.ImagePullSecrets = controller_common.AppendUniqueImagePullSecrets(podSpec.ImagePullSecrets, imagePullSecrets)

 	backend.UpdatePodSpec(&podSpec, numberOfNodes, role, component, serviceName)
+
+	// Inject checkpoint configuration if enabled
+	// This handles ALL checkpoint-related modifications:
+	// - Command/Args transformation (moves Command to Args to respect image ENTRYPOINT)
+	// - Security context (hostIPC, privileged mode)
+	// - Environment variables (checkpoint path, hash, CRIU settings)
+	// - Storage configuration (volumes, mounts)
+	// CheckpointInfo should have been resolved by ResolveCheckpointForService before calling this function
+	// Checkpoint config comes from the operator's controller config (Helm values)
+	var checkpointConfig *controller_common.CheckpointConfig
+	if controllerConfig.Checkpoint.Enabled {
+		checkpointConfig = &controllerConfig.Checkpoint
+	}
+	if err := checkpoint.InjectCheckpointIntoPodSpec(&podSpec, checkpointInfo, checkpointConfig); err != nil {
+		return nil, fmt.Errorf("failed to inject checkpoint config: %w", err)
+	}
+
 	return &podSpec, nil
 }

@@ -1111,11 +1144,12 @@ func GeneratePodSpecForComponent(
 	controllerConfig controller_common.Config,
 	multinodeDeploymentType commonconsts.MultinodeDeploymentType,
 	serviceName string,
+	checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info
 ) (*corev1.PodSpec, error) {
 	if len(dynamoDeployment.Spec.Envs) > 0 {
 		component.Envs = MergeEnvs(dynamoDeployment.Spec.Envs, component.Envs)
 	}
-	podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, controllerConfig, multinodeDeploymentType, serviceName)
+	podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, controllerConfig, multinodeDeploymentType, serviceName, checkpointInfo)
 	if err != nil {
 		return nil, err
 	}
@@ -1130,6 +1164,7 @@ func GenerateGrovePodCliqueSet(
 	secretsRetriever SecretsRetriever,
 	restartState *RestartState,
 	existingRestartAnnotations map[string]string,
+	checkpointInfoByService map[string]*checkpoint.CheckpointInfo, // Optional checkpoint info per service
 ) (*grovev1alpha1.PodCliqueSet, error) {
 	gangSet := &grovev1alpha1.PodCliqueSet{}
 	gangSet.Name = dynamoDeployment.Name
@@ -1157,7 +1192,7 @@ func GenerateGrovePodCliqueSet(

 	var scalingGroups []grovev1alpha1.PodCliqueScalingGroupConfig
 	for serviceName, component := range dynamoDeployment.Spec.Services {
-		dynamoNamespace := getDynamoNamespace(dynamoDeployment, component)
+		dynamoNamespace := GetDynamoNamespace(dynamoDeployment, component)
 		component.DynamoNamespace = &dynamoNamespace
 		// Determine backend framework using hybrid approach
 		backendFramework, err := getBackendFrameworkFromComponent(component, dynamoDeployment)
@@ -1172,6 +1207,12 @@ func GenerateGrovePodCliqueSet(
 			component.Annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend] = discoveryBackend
 		}

+		// Get checkpoint info for this service if available
+		var checkpointInfo *checkpoint.CheckpointInfo
+		if checkpointInfoByService != nil {
+			checkpointInfo = checkpointInfoByService[serviceName]
+		}
+
 		numberOfNodes := component.GetNumberOfNodes()
 		isMultinode := numberOfNodes > 1
 		roles := expandRolesForService(serviceName, component.Replicas, numberOfNodes)
@@ -1188,6 +1229,7 @@ func GenerateGrovePodCliqueSet(
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				serviceName,
+				checkpointInfo,
 			)
 			if err != nil {
 				return nil, fmt.Errorf("failed to generate podSpec for role %s: %w", r.Name, err)
@@ -1272,15 +1314,21 @@ func generateLabels(component *v1alpha1.DynamoComponentDeploymentSharedSpec, dyn
 	}
 	// Add base model label if modelRef is specified
 	AddBaseModelLabel(labels, component.ModelRef)
+	// Add checkpoint labels if checkpointing is enabled
+	var err error
+	labels, err = checkpoint.InjectCheckpointLabelsFromConfig(labels, component.Checkpoint)
+	if err != nil {
+		return nil, fmt.Errorf("failed to inject checkpoint labels: %w", err)
+	}
 	setMetricsLabels(labels, dynamoDeployment)
 	if component.Labels != nil {
-		err := mergo.Merge(&labels, component.Labels, mergo.WithOverride)
+		err = mergo.Merge(&labels, component.Labels, mergo.WithOverride)
 		if err != nil {
 			return nil, fmt.Errorf("failed to merge labels: %w", err)
 		}
 	}
 	if component.ExtraPodMetadata != nil {
-		err := mergo.Merge(&labels, component.ExtraPodMetadata.Labels, mergo.WithOverride)
+		err = mergo.Merge(&labels, component.ExtraPodMetadata.Labels, mergo.WithOverride)
 		if err != nil {
 			return nil, fmt.Errorf("failed to merge extraPodMetadata labels: %w", err)
 		}
@@ -1336,9 +1384,6 @@ func detectBackendFrameworkFromArgs(command []string, args []string) (BackendFra
 	return detected[0], nil
 }

-// BackendFrameworkNoop represents no backend processing needed
-const BackendFrameworkNoop BackendFramework = "noop"
-
 // determineBackendFramework is the core logic for hybrid backend framework detection
 // Takes extracted parameters and applies the detection logic
 func determineBackendFramework(
@@ -1457,6 +1502,7 @@ func GenerateBasePodSpecForController(
 	controllerConfig controller_common.Config,
 	role Role,
 	multinodeDeploymentType commonconsts.MultinodeDeploymentType,
+	checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by caller)
 ) (*corev1.PodSpec, error) {
 	// Convert to our interface
 	componentSpec := ConvertDynamoComponentDeploymentToSpec(dynComponent)
@@ -1483,6 +1529,7 @@ func GenerateBasePodSpecForController(
 		controllerConfig,
 		multinodeDeploymentType,
 		serviceName,
+		checkpointInfo,
 	)
 	if err != nil {
 		return nil, err

--- a/deploy/operator/internal/dynamo/graph_test.go
+++ b/deploy/operator/internal/dynamo/graph_test.go
@@ -3665,7 +3665,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			got, err := GenerateGrovePodCliqueSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, nil, nil, nil)
+			got, err := GenerateGrovePodCliqueSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, nil, nil, nil, nil)
 			if (err != nil) != tt.wantErr {
 				t.Errorf("GenerateGrovePodCliqueSet() error = %v, wantErr %v", err, tt.wantErr)
 				return
@@ -3717,7 +3717,7 @@ func Test_GeneratePodCliqueSetGlobalDynamoNamespace(t *testing.T) {
 		},
 	}

-	got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controller_common.Config{}, nil, nil, nil)
+	got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controller_common.Config{}, nil, nil, nil, nil)
 	if !assert.NoError(t, err) {
 		return
 	}
@@ -3880,6 +3880,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"worker",
+				nil, // No checkpoint info in tests
 			)

 			if tt.expectError {
@@ -4037,6 +4038,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"worker",
+				nil, // No checkpoint info in tests
 			)

 			if tt.expectError {
@@ -4123,6 +4125,7 @@ func TestGeneratePodSpecForComponent_UnsupportedBackend(t *testing.T) {
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"worker",
+				nil, // No checkpoint info in tests
 			)

 			if tt.expectError {
@@ -4800,7 +4803,7 @@ func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
 				NatsAddress: "nats-address",
 			}

-			got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controllerConfig, secretsRetriever, nil, nil)
+			got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controllerConfig, secretsRetriever, nil, nil, nil)
 			if err != nil {
 				t.Errorf("GenerateGrovePodCliqueSet() error = %v", err)
 				return
@@ -4909,6 +4912,7 @@ func TestGenerateBasePodSpec_Frontend(t *testing.T) {
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
+				nil, // No checkpoint info in tests
 			)

 			if (err != nil) != tt.wantErr {
@@ -4984,6 +4988,7 @@ func TestGenerateBasePodSpec_PlannerServiceAccount(t *testing.T) {
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
+				nil, // No checkpoint info in tests
 			)

 			if err != nil {
@@ -5106,6 +5111,7 @@ func TestGenerateBasePodSpec_DisableImagePullSecretDiscovery(t *testing.T) {
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
+				nil, // No checkpoint info in tests
 			)

 			if err != nil {
@@ -5201,6 +5207,7 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
 				tt.controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
+				nil, // No checkpoint info in tests
 			)
 			if !assert.NoError(t, err) {
 				return
@@ -5360,6 +5367,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
+				nil, // No checkpoint info in tests
 			)

 			if err != nil {
@@ -5456,6 +5464,7 @@ func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) {
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
+				nil, // No checkpoint info in tests
 			)

 			if tt.expectError {
@@ -5691,6 +5700,7 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
+				nil, // No checkpoint info in tests
 			)

 			if tt.expectError {
@@ -5902,6 +5912,7 @@ func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T)
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
+				nil, // No checkpoint info in tests
 			)

 			if tt.expectError {
@@ -6087,6 +6098,7 @@ func TestGenerateBasePodSpec_SecurityContext(t *testing.T) {
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
+				nil, // No checkpoint info in tests
 			)

 			if err != nil {
@@ -6581,7 +6593,7 @@ func TestGenerateGrovePodCliqueSet_RestartAnnotations(t *testing.T) {
 				NatsAddress: "nats-address",
 			}

-			got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, controllerConfig, nil, tt.restartState, nil)
+			got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, controllerConfig, nil, tt.restartState, nil, nil)
 			if err != nil {
 				t.Fatalf("GenerateGrovePodCliqueSet() error = %v", err)
 			}

--- a/docs/_sections/k8s_deployment.rst
+++ b/docs/_sections/k8s_deployment.rst
@@ -12,3 +12,4 @@ Deployment Guide
   Minikube Setup <../kubernetes/deployment/minikube>
   Managing Models with DynamoModel <../kubernetes/deployment/dynamomodel-guide>
   Autoscaling <../kubernetes/autoscaling>
+   Checkpointing <../kubernetes/chrek/README>
--- a/docs/hidden_toctree.rst
+++ b/docs/hidden_toctree.rst
@@ -27,6 +27,9 @@
   kubernetes/api_reference.md
   kubernetes/deployment/create_deployment.md
   kubernetes/deployment/dynamomodel-guide.md
+   kubernetes/chrek/README.md
+   kubernetes/chrek/dynamo.md
+   kubernetes/chrek/standalone.md

   kubernetes/fluxcd.md
   kubernetes/grove.md

--- a/docs/kubernetes/README.md
+++ b/docs/kubernetes/README.md
@@ -244,6 +244,7 @@ Key customization points include:
 - **[Operator Documentation](/docs/kubernetes/dynamo_operator.md)** - How the platform works
 - **[Service Discovery](/docs/kubernetes/service_discovery.md)** - Discovery backends and configuration
 - **[Helm Charts](/deploy/helm/README.md)** - For advanced users
+- **[Checkpointing](/docs/kubernetes/chrek/README.md)** - Fast pod startup with checkpoint/restore
 - **[GitOps Deployment with FluxCD](/docs/kubernetes/fluxcd.md)** - For advanced users
 - **[Logging](/docs/kubernetes/observability/logging.md)** - For logging setup
 - **[Multinode Deployment](/docs/kubernetes/deployment/multinode-deployment.md)** - For multinode deployment

--- a/docs/kubernetes/api_reference.md
+++ b/docs/kubernetes/api_reference.md
@@ -34,6 +34,7 @@ a high-level, SLA-driven interface for deploying machine learning models on Dyna
 Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group.

 ### Resource Types
+- [DynamoCheckpoint](#dynamocheckpoint)
 - [DynamoComponentDeployment](#dynamocomponentdeployment)
 - [DynamoGraphDeployment](#dynamographdeployment)
 - [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
@@ -67,6 +68,24 @@ _Appears in:_



+#### CheckpointMode
+
+_Underlying type:_ _string_
+
+CheckpointMode defines how checkpoint creation is handled
+
+_Validation:_
+- Enum: [Auto Manual]
+
+_Appears in:_
+- [ServiceCheckpointConfig](#servicecheckpointconfig)
+
+| Field | Description |
+| --- | --- |
+| `Auto` | CheckpointModeAuto means the DGD controller will automatically create a Checkpoint CR<br /> |
+| `Manual` | CheckpointModeManual means the user must create the Checkpoint CR themselves<br /> |
+
+
 #### ComponentKind

 _Underlying type:_ _string_
@@ -148,6 +167,146 @@ _Appears in:_



+#### DynamoCheckpoint
+
+
+
+DynamoCheckpoint is the Schema for the dynamocheckpoints API
+It represents a container checkpoint that can be used to restore pods to a warm state
+
+
+
+
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
+| `kind` _string_ | `DynamoCheckpoint` | | |
+| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
+| `spec` _[DynamoCheckpointSpec](#dynamocheckpointspec)_ |  |  |  |
+| `status` _[DynamoCheckpointStatus](#dynamocheckpointstatus)_ |  |  |  |
+
+
+
+
+#### DynamoCheckpointIdentity
+
+
+
+DynamoCheckpointIdentity defines the inputs that determine checkpoint equivalence
+Two checkpoints with the same identity hash are considered equivalent
+
+
+
+_Appears in:_
+- [DynamoCheckpointSpec](#dynamocheckpointspec)
+- [ServiceCheckpointConfig](#servicecheckpointconfig)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `model` _string_ | Model is the model identifier (e.g., "meta-llama/Llama-3-70B") |  | Required: \{\} <br /> |
+| `backendFramework` _string_ | BackendFramework is the runtime framework (vllm, sglang, trtllm) |  | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
+| `dynamoVersion` _string_ | DynamoVersion is the Dynamo platform version (optional)<br />If not specified, version is not included in identity hash<br />This ensures checkpoint compatibility across Dynamo releases |  | Optional: \{\} <br /> |
+| `tensorParallelSize` _integer_ | TensorParallelSize is the tensor parallel configuration | 1 | Minimum: 1 <br />Optional: \{\} <br /> |
+| `pipelineParallelSize` _integer_ | PipelineParallelSize is the pipeline parallel configuration | 1 | Minimum: 1 <br />Optional: \{\} <br /> |
+| `dtype` _string_ | Dtype is the data type (fp16, bf16, fp8, etc.) |  | Optional: \{\} <br /> |
+| `maxModelLen` _integer_ | MaxModelLen is the maximum sequence length |  | Minimum: 1 <br />Optional: \{\} <br /> |
+| `extraParameters` _object (keys:string, values:string)_ | ExtraParameters are additional parameters that affect the checkpoint hash<br />Use for any framework-specific or custom parameters not covered above |  | Optional: \{\} <br /> |
+
+
+#### DynamoCheckpointJobConfig
+
+
+
+DynamoCheckpointJobConfig defines the configuration for the checkpoint creation Job
+
+
+
+_Appears in:_
+- [DynamoCheckpointSpec](#dynamocheckpointspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `podTemplateSpec` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | PodTemplateSpec allows customizing the checkpoint Job pod<br />This should include the container that runs the workload to be checkpointed |  | Required: \{\} <br /> |
+| `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Optional: \{\} <br /> |
+| `backoffLimit` _integer_ | BackoffLimit specifies the number of retries before marking the Job failed | 3 | Optional: \{\} <br /> |
+| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Optional: \{\} <br /> |
+
+
+#### DynamoCheckpointPhase
+
+_Underlying type:_ _string_
+
+DynamoCheckpointPhase represents the current phase of the checkpoint lifecycle
+
+_Validation:_
+- Enum: [Pending Creating Ready Failed]
+
+_Appears in:_
+- [DynamoCheckpointStatus](#dynamocheckpointstatus)
+
+| Field | Description |
+| --- | --- |
+| `Pending` | DynamoCheckpointPhasePending indicates the checkpoint CR has been created but the Job has not started<br /> |
+| `Creating` | DynamoCheckpointPhaseCreating indicates the checkpoint Job is running<br /> |
+| `Ready` | DynamoCheckpointPhaseReady indicates the checkpoint tar file is available on the PVC<br /> |
+| `Failed` | DynamoCheckpointPhaseFailed indicates the checkpoint creation failed<br /> |
+
+
+#### DynamoCheckpointSpec
+
+
+
+DynamoCheckpointSpec defines the desired state of DynamoCheckpoint
+
+
+
+_Appears in:_
+- [DynamoCheckpoint](#dynamocheckpoint)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the inputs that determine checkpoint equivalence |  | Required: \{\} <br /> |
+| `job` _[DynamoCheckpointJobConfig](#dynamocheckpointjobconfig)_ | Job defines the configuration for the checkpoint creation Job |  | Required: \{\} <br /> |
+
+
+#### DynamoCheckpointStatus
+
+
+
+DynamoCheckpointStatus defines the observed state of DynamoCheckpoint
+
+
+
+_Appears in:_
+- [DynamoCheckpoint](#dynamocheckpoint)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `phase` _[DynamoCheckpointPhase](#dynamocheckpointphase)_ | Phase represents the current phase of the checkpoint lifecycle |  | Enum: [Pending Creating Ready Failed] <br />Optional: \{\} <br /> |
+| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity<br />This hash is used to identify equivalent checkpoints |  | Optional: \{\} <br /> |
+| `location` _string_ | Location is the full URI/path to the checkpoint in the storage backend<br />For PVC: same as TarPath (e.g., /checkpoints/\{hash\}.tar)<br />For S3: s3://bucket/prefix/\{hash\}.tar<br />For OCI: oci://registry/repo:\{hash\} |  | Optional: \{\} <br /> |
+| `storageType` _[DynamoCheckpointStorageType](#dynamocheckpointstoragetype)_ | StorageType indicates the storage backend type used for this checkpoint |  | Enum: [pvc s3 oci] <br />Optional: \{\} <br /> |
+| `jobName` _string_ | JobName is the name of the checkpoint creation Job |  | Optional: \{\} <br /> |
+| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint tar was created |  | Optional: \{\} <br /> |
+| `message` _string_ | Message provides additional information about the current state |  | Optional: \{\} <br /> |
+| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represent the latest available observations of the checkpoint's state |  | Optional: \{\} <br /> |
+
+
+#### DynamoCheckpointStorageType
+
+_Underlying type:_ _string_
+
+DynamoCheckpointStorageType defines the supported storage backends for checkpoints
+
+_Validation:_
+- Enum: [pvc s3 oci]
+
+_Appears in:_
+- [DynamoCheckpointStatus](#dynamocheckpointstatus)
+
+
+
 #### DynamoComponentDeployment


@@ -203,6 +362,7 @@ _Appears in:_
 | `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. |  |  |
 | `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled, replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. |  | Optional: \{\} <br /> |
 | `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br />Only applicable when ComponentType is "epp". |  | Optional: \{\} <br /> |
+| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from checkpoint files for faster cold start. |  | Optional: \{\} <br /> |


 #### DynamoComponentDeploymentSpec
@@ -242,6 +402,7 @@ _Appears in:_
 | `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. |  |  |
 | `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled, replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. |  | Optional: \{\} <br /> |
 | `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br />Only applicable when ComponentType is "epp". |  | Optional: \{\} <br /> |
+| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from checkpoint files for faster cold start. |  | Optional: \{\} <br /> |


 #### DynamoGraphDeployment
@@ -456,6 +617,7 @@ _Appears in:_
 | `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the graph deployment.<br />The slice is merged by type on patch updates. |  |  |
 | `services` _object (keys:string, values:[ServiceReplicaStatus](#servicereplicastatus))_ | Services contains per-service replica status information.<br />The map key is the service name from spec.services. |  | Optional: \{\} <br /> |
 | `restart` _[RestartStatus](#restartstatus)_ | Restart contains the status of the restart of the graph deployment. |  | Optional: \{\} <br /> |
+| `checkpoints` _object (keys:string, values:[ServiceCheckpointStatus](#servicecheckpointstatus))_ | Checkpoints contains per-service checkpoint status information.<br />The map key is the service name from spec.services. |  | Optional: \{\} <br /> |


 #### DynamoModel
@@ -872,6 +1034,44 @@ _Appears in:_
 | `enabled` _boolean_ | Enabled indicates whether the ScalingAdapter should be enabled for this service.<br />When true, a DGDSA is created and owns the replicas field.<br />When false (default), no DGDSA is created and replicas can be modified directly in the DGD. | false | Optional: \{\} <br /> |


+#### ServiceCheckpointConfig
+
+
+
+ServiceCheckpointConfig configures checkpointing for a DGD service
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `enabled` _boolean_ | Enabled indicates whether checkpointing is enabled for this service | false | Optional: \{\} <br /> |
+| `mode` _[CheckpointMode](#checkpointmode)_ | Mode defines how checkpoint creation is handled<br />- Auto: DGD controller creates Checkpoint CR automatically<br />- Manual: User must create Checkpoint CR | Auto | Enum: [Auto Manual] <br />Optional: \{\} <br /> |
+| `checkpointRef` _string_ | CheckpointRef references an existing Checkpoint CR to use<br />If specified, Identity is ignored and this checkpoint is used directly |  | Optional: \{\} <br /> |
+| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the checkpoint identity for hash computation<br />Used when Mode is Auto or when looking up existing checkpoints<br />Required when checkpointRef is not specified |  | Optional: \{\} <br /> |
+
+
+#### ServiceCheckpointStatus
+
+
+
+ServiceCheckpointStatus contains checkpoint information for a single service.
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `checkpointName` _string_ | CheckpointName is the name of the associated Checkpoint CR |  | Optional: \{\} <br /> |
+| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity |  | Optional: \{\} <br /> |
+| `ready` _boolean_ | Ready indicates if the checkpoint is ready for use |  | Optional: \{\} <br /> |
+
+
 #### ServiceReplicaStatus



--- a/docs/kubernetes/chrek/README.md
+++ b/docs/kubernetes/chrek/README.md
+# ChReK: Checkpoint/Restore in Kubernetes
+
+> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. See [Limitations](#limitations) for details.
+
+**ChReK** (Checkpoint/Restore in Kubernetes) is an experimental infrastructure for fast-starting GPU applications using CRIU (Checkpoint/Restore in User-space). ChReK dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on-demand.
+
+## What is ChReK?
+
+ChReK provides:
+- **Fast cold starts**: Restore GPU-accelerated applications in seconds instead of minutes
+- **CUDA state preservation**: Checkpoint and restore GPU memory and CUDA contexts
+- **Kubernetes-native**: Integrates seamlessly with Kubernetes primitives
+- **Storage flexibility**: PVC-based storage (S3/OCI planned for future releases)
+- **Namespace isolation**: Each namespace gets its own checkpoint infrastructure
+
+## Use Cases
+
+### 1. With NVIDIA Dynamo Platform (Recommended)
+
+Use ChReK as part of the Dynamo platform for automatic checkpoint management:
+- Automatic checkpoint creation and lifecycle management
+- Seamless integration with DynamoGraphDeployment CRDs
+- Built-in autoscaling with fast restore
+
+📖 **[Read the Dynamo Integration Guide →](dynamo.md)**
+
+### 2. Standalone (Without Dynamo)
+
+Use ChReK independently in your own Kubernetes applications:
+- Manual checkpoint job creation
+- Build your own restore-enabled container images
+- Full control over checkpoint lifecycle
+
+📖 **[Read the Standalone Usage Guide →](standalone.md)**
+
+## Architecture
+
+ChReK consists of two main components:
+
+### 1. ChReK Helm Chart
+Deploys the checkpoint/restore infrastructure:
+- **DaemonSet**: Runs on GPU nodes to perform CRIU checkpoint operations
+- **PVC**: Stores checkpoint data (rootfs diffs, CUDA memory state)
+- **RBAC**: Namespace-scoped or cluster-wide permissions
+- **Seccomp Profile**: Security policies for CRIU syscalls
+
+### 2. Smart Entrypoint
+A wrapper script that intelligently decides between:
+- **Cold start**: Normal application startup (when no checkpoint exists)
+- **Restore**: CRIU restore from checkpoint (when checkpoint available)
+
+## Quick Start
+
+### Install ChReK Infrastructure
+
+```bash
+helm install chrek nvidia/chrek \
+  --namespace my-team \
+  --create-namespace \
+  --set storage.pvc.size=100Gi
+```
+
+### Choose Your Integration Path
+
+- **Using Dynamo Platform?** → Follow the [Dynamo Integration Guide](dynamo.md)
+- **Using standalone?** → Follow the [Standalone Usage Guide](standalone.md)
+
+## Key Features
+
+### ✅ Currently Supported
+- ✅ **vLLM backend only** (SGLang and TensorRT-LLM planned)
+- ✅ Single-node, single-GPU checkpoints
+- ✅ PVC storage backend (RWX for multi-node)
+- ✅ CUDA checkpoint/restore
+- ✅ PyTorch distributed state (with `GLOO_SOCKET_IFNAME=lo`)
+- ✅ Namespace-scoped and cluster-wide RBAC
+- ✅ Idempotent checkpoint creation
+- ✅ Automatic signal-based checkpoint coordination
+
+### 🚧 Planned Features
+- 🚧 SGLang backend support
+- 🚧 TensorRT-LLM backend support
+- 🚧 S3/MinIO storage backend
+- 🚧 OCI registry storage backend
+- 🚧 Multi-GPU checkpoints
+- 🚧 Multi-node distributed checkpoints
+
+## Limitations
+
+⚠️ **Important**: ChReK has significant limitations that may impact production readiness:
+
+### Security Considerations
+- **🔴 Privileged mode required**: Restore pods **must run in privileged mode** for CRIU to function. This grants containers elevated host access and may violate security policies in many production environments.
+- **Security Impact**: Privileged containers can:
+  - Access all host devices
+  - Bypass most security restrictions
+  - Potentially compromise node security if the container is exploited
+
+### Technical Limitations
+- **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
+- **Single-node only**: Checkpoints must be created and restored on the same node
+- **Single-GPU only**: Multi-GPU configurations not yet supported
+- **Network state limitations**: Active TCP connections are closed during restore (use `tcp-close` CRIU option)
+- **Storage**: Only PVC storage is currently implemented (S3/OCI planned)
+
+### Recommendation
+ChReK is best suited for:
+- ✅ Development and testing environments
+- ✅ Research and experimentation
+- ✅ Controlled production environments with appropriate security controls
+- ❌ Security-sensitive production workloads without proper risk assessment
+
+## Documentation
+
+### Getting Started
+- [Dynamo Integration Guide](dynamo.md) - Using ChReK with Dynamo Platform
+- [Standalone Usage Guide](standalone.md) - Using ChReK independently
+- [ChReK Helm Chart README](../../../deploy/helm/charts/chrek/README.md) - Helm chart configuration
+
+### Related Documentation
+- [CRIU Documentation](https://criu.org/Main_Page) - Upstream CRIU docs
+
+## Prerequisites
+
+- Kubernetes 1.21+
+- GPU nodes with NVIDIA runtime (`nvidia` runtime class)
+- CRIU support in container runtime (containerd with CRIU plugin)
+- RWX storage class (for multi-node deployments)
+- **Security clearance for privileged pods** (required for restore operations)
+
+## Troubleshooting
+
+### Common Issues
+
+**DaemonSet not starting?**
+- Check GPU node labels: `kubectl get nodes -l nvidia.com/gpu.present=true`
+- Verify NVIDIA runtime is available
+
+**Checkpoint fails?**
+- Check DaemonSet logs: `kubectl logs -l app.kubernetes.io/name=chrek -n <namespace>`
+- Ensure application properly signals readiness
+- Verify CRIU is installed in the runtime
+
+**Restore fails?**
+- Ensure restore pod uses the same volumes as checkpoint job
+- Verify `hostIPC: true` is set (required for CUDA)
+- Check for `PSM3_DISABLED=1` and `GLOO_SOCKET_IFNAME=lo` environment variables
+
+For detailed troubleshooting, see:
+- [Dynamo Integration Guide - Troubleshooting](dynamo.md#troubleshooting)
+- [Standalone Guide - Troubleshooting](standalone.md#troubleshooting)
+
+## Contributing
+
+ChReK is part of the NVIDIA Dynamo project. Contributions are welcome!
+
+## License
+
+Apache License 2.0
--- a/docs/kubernetes/chrek/dynamo.md
+++ b/docs/kubernetes/chrek/dynamo.md
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Checkpoint/Restore for Fast Pod Startup
+
+> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations. See [Limitations](#limitations) for details.
+
+Reduce cold start times for LLM inference workers from ~3 minutes to ~30 seconds using container checkpointing.
+
+## Overview
+
+Checkpointing captures the complete state of a running worker pod (including GPU memory) and saves it to storage. New pods can restore from this checkpoint instead of performing a full cold start.
+
+| Startup Type | Time | What Happens |
+|--------------|------|--------------|
+| **Cold Start** | ~3 min | Download model, load to GPU, initialize engine |
+| **Warm Start** (checkpoint) | ~30 sec | Restore from checkpoint tar |
+
+## Prerequisites
+
+- Dynamo Platform installed (v0.4.0+)
+- ChReK Helm chart installed (separate from platform)
+- GPU nodes with CRIU support
+- RWX PVC storage (PVC is currently the only supported backend)
+
+## Quick Start
+
+### 1. Install ChReK Infrastructure
+
+First, install the ChReK Helm chart in each namespace where you need checkpointing:
+
+```bash
+# Install ChReK infrastructure
+helm install chrek nvidia/chrek \
+  --namespace my-team \
+  --create-namespace \
+  --set storage.pvc.size=100Gi
+```
+
+This creates:
+- A PVC for checkpoint storage (`chrek-pvc`)
+- A DaemonSet for CRIU operations (`chrek-agent`)
+
+### 2. Configure Operator Values
+
+Update your Helm values to point to the ChReK infrastructure:
+
+```yaml
+# values.yaml
+dynamo-operator:
+  checkpoint:
+    enabled: true
+    storage:
+      type: pvc  # Only PVC is currently supported (S3/OCI planned)
+      pvc:
+        pvcName: "chrek-pvc"  # Must match ChReK chart
+        basePath: "/checkpoints"
+      signalHostPath: "/var/lib/chrek/signals"  # Must match ChReK chart
+```
+
+### 3. Configure Your DGD
+
+Add checkpoint configuration to your service:
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: my-llm
+spec:
+  services:
+    VllmWorker:
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
+          args:
+            - python3 -m dynamo.vllm --model meta-llama/Llama-3-8B
+      resources:
+        limits:
+          nvidia.com/gpu: "1"
+
+      # Checkpoint configuration
+      checkpoint:
+        enabled: true
+        mode: Auto  # Automatically create checkpoint if not found
+        identity:
+          model: "meta-llama/Llama-3-8B"
+          backendFramework: "vllm"
+          tensorParallelSize: 1
+          dtype: "bfloat16"
+```
+
+### 4. Deploy
+
+```bash
+kubectl apply -f my-llm.yaml -n dynamo-system
+```
+
+On first deployment:
+1. A checkpoint job runs to create the checkpoint
+2. Worker pods start with cold start (checkpoint not ready yet)
+3. Once checkpoint is ready, new pods (scale-up, restarts) restore from checkpoint
+
+## Storage Backends
+
+### PVC (Currently Supported)
+
+Use when you have RWX storage available (e.g., NFS, EFS, Filestore).
+
+```yaml
+checkpoint:
+  storage:
+    type: pvc
+    pvc:
+      pvcName: "chrek-pvc"
+      basePath: "/checkpoints"
+```
+
+**Requirements:**
+- RWX (ReadWriteMany) PVC for multi-node access
+- Sufficient storage (checkpoints are ~10-50GB per model)
+
+### S3 / MinIO (Planned - Not Yet Implemented)
+
+> ⚠️ **Note:** S3 storage backend is defined in the API but not yet fully implemented.
+
+Object storage support is planned for a future release. The configuration will look like:
+
+```yaml
+checkpoint:
+  storage:
+    type: s3  # Not yet supported
+    s3:
+      # AWS S3
+      uri: "s3://my-bucket/checkpoints"
+
+      # Or MinIO / custom S3 (use instead of the AWS URI above)
+      # uri: "s3://minio.example.com/my-bucket/checkpoints"
+
+      # Optional: credentials secret
+      credentialsSecretRef: "s3-creds"
+```
+
+### OCI Registry (Planned - Not Yet Implemented)
+
+> ⚠️ **Note:** OCI registry storage backend is defined in the API but not yet fully implemented.
+
+Container registry storage support is planned for a future release. The configuration will look like:
+
+```yaml
+checkpoint:
+  storage:
+    type: oci  # Not yet supported
+    oci:
+      uri: "oci://myregistry.io/checkpoints"
+      credentialsSecretRef: "registry-creds"  # Docker config secret
+```
+
+## Checkpoint Modes
+
+### Auto Mode (Recommended)
+
+The operator automatically creates a `DynamoCheckpoint` CR if one doesn't exist:
+
+```yaml
+checkpoint:
+  enabled: true
+  mode: Auto
+  identity:
+    model: "meta-llama/Llama-3-8B"
+    backendFramework: "vllm"
+    tensorParallelSize: 1
+```
+
+### Reference Mode
+
+Reference an existing `DynamoCheckpoint` CR by its 16-character hash using `checkpointRef`:
+
+```yaml
+checkpoint:
+  enabled: true
+  checkpointRef: "e5962d34ba272638"  # 16-char hash of DynamoCheckpoint CR
+```
+
+This is useful when:
+- You want to **pre-warm checkpoints** before creating DGDs
+- You want **explicit control** over which checkpoint to use
+
+**Flow:**
+1. Create a `DynamoCheckpoint` CR (see [DynamoCheckpoint CRD](#dynamocheckpoint-crd) section)
+2. Wait for it to become `Ready`
+3. Reference it in your DGD using `checkpointRef` with the hash
+
+```bash
+# Check checkpoint status (using 16-char hash name)
+kubectl get dynamocheckpoint e5962d34ba272638 -n dynamo-system
+NAME                MODEL                   BACKEND  PHASE  HASH              AGE
+e5962d34ba272638    meta-llama/Llama-3-8B  vllm     Ready  e5962d34ba272638  5m
+
+# Now create DGD referencing it
+kubectl apply -f my-dgd.yaml
+```
+
+## Checkpoint Identity
+
+Checkpoints are uniquely identified by a **16-character SHA256 hash** (64 bits) of configuration that affects runtime state:
+
+| Field | Required | Affects Hash | Example |
+|-------|----------|-------------|---------|
+| `model` | ✓ | ✓ | `meta-llama/Llama-3-8B` |
+| `backendFramework` | ✓ | ✓ | `vllm`, `sglang`, `trtllm` |
+| `dynamoVersion` | | ✓ | `0.9.0`, `1.0.0` |
+| `tensorParallelSize` | | ✓ | `1`, `2`, `4`, `8` (default: 1) |
+| `pipelineParallelSize` | | ✓ | `1`, `2` (default: 1) |
+| `dtype` | | ✓ | `float16`, `bfloat16`, `fp8` |
+| `maxModelLen` | | ✓ | `4096`, `8192` |
+| `extraParameters` | | ✓ | Custom key-value pairs |
+
+**Not included in hash** (don't invalidate checkpoint):
+- `replicas`
+- `nodeSelector`, `affinity`, `tolerations`
+- `resources` (requests/limits)
+- Logging/observability config
+
+**Example with all fields:**
+```yaml
+checkpoint:
+  enabled: true
+  mode: Auto
+  identity:
+    model: "meta-llama/Llama-3-8B"
+    backendFramework: "vllm"
+    dynamoVersion: "0.9.0"
+    tensorParallelSize: 1
+    pipelineParallelSize: 1
+    dtype: "bfloat16"
+    maxModelLen: 8192
+    extraParameters:
+      enableChunkedPrefill: "true"
+      quantization: "awq"
+```
+
+**Checkpoint Naming:** The `DynamoCheckpoint` CR is automatically named using the 16-character identity hash (e.g., `e5962d34ba272638`).
+
+**Checkpoint Sharing:** Multiple DGDs with the same identity automatically share the same checkpoint.
+
+## DynamoCheckpoint CRD
+
+The `DynamoCheckpoint` (shortname: `dckpt`) is a Kubernetes Custom Resource that manages checkpoint lifecycle.
+
+**When to create a DynamoCheckpoint directly:**
+- **Pre-warming:** Create checkpoints before deploying DGDs for instant startup
+- **Explicit control:** Manage checkpoint lifecycle independently from DGDs
+
+**Note:** With the new hash-based naming, checkpoint names are automatically generated (16-character hash). The operator handles checkpoint discovery and reuse automatically in `auto` mode.
+
+**Create a checkpoint:**
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoCheckpoint
+metadata:
+  name: e5962d34ba272638  # Use the computed 16-char hash
+spec:
+  identity:
+    model: "meta-llama/Llama-3-8B"
+    backendFramework: "vllm"
+    tensorParallelSize: 1
+    dtype: "bfloat16"
+
+  job:
+    activeDeadlineSeconds: 3600
+    podTemplateSpec:
+      spec:
+        containers:
+          - name: main
+            image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
+            command: ["python3", "-m", "dynamo.vllm"]
+            args: ["--model", "meta-llama/Llama-3-8B"]
+            resources:
+              limits:
+                nvidia.com/gpu: "1"
+            env:
+              - name: HF_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: HF_TOKEN
+```
+
+**Note:** You can compute the hash yourself, or use `auto` mode to let the operator compute it and create the `DynamoCheckpoint` for you.
+
+**Check status:**
+
+```bash
+# List all checkpoints
+kubectl get dynamocheckpoint -n dynamo-system
+# Or use shortname
+kubectl get dckpt -n dynamo-system
+
+NAME                MODEL                          BACKEND  PHASE    HASH              AGE
+e5962d34ba272638    meta-llama/Llama-3-8B         vllm     Ready    e5962d34ba272638  5m
+a7b4f89c12de3456    meta-llama/Llama-3-70B        vllm     Creating a7b4f89c12de3456  2m
+```
+
+**Phases:**
+
+| Phase | Description |
+|-------|-------------|
+| `Pending` | CR created, waiting for job to start |
+| `Creating` | Checkpoint job is running |
+| `Ready` | Checkpoint available for use |
+| `Failed` | Checkpoint creation failed |
+
+**Detailed status:**
+
+```bash
+kubectl describe dckpt e5962d34ba272638 -n dynamo-system
+```
+
+```yaml
+Status:
+  Phase: Ready
+  IdentityHash: e5962d34ba272638
+  Location: /checkpoints/e5962d34ba272638
+  StorageType: pvc
+  CreatedAt: 2026-01-29T10:05:00Z
+```
+
+**Reference from DGD:**
+
+Once the checkpoint is `Ready`, you can reference it by hash:
+
+```yaml
+spec:
+  services:
+    VllmWorker:
+      checkpoint:
+        enabled: true
+        checkpointRef: "e5962d34ba272638"  # 16-char hash
+```
+
+Or use `auto` mode and the operator will find/create it automatically.
+
+## Limitations
+
+⚠️ **Important**: ChReK has significant limitations that impact production readiness:
+
+### Security Considerations
+- **🔴 Privileged mode required**: Restore pods **must run in privileged mode** for CRIU to function
+- Privileged containers have elevated host access, which may violate security policies in many production environments
+- This requirement applies to all worker pods that restore from checkpoints
+
+### Technical Limitations
+- **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
+- **Single-node only**: Checkpoints must be created and restored on the same node
+- **Single-GPU only**: Multi-GPU configurations are not yet supported
+- **Network state**: Active TCP connections are closed during restore (handled with `tcp-close` CRIU option)
+- **Storage**: Only PVC backend currently implemented (S3/OCI planned)
+
+### Recommendation
+ChReK is **experimental/beta** and best suited for:
+- ✅ Development and testing environments
+- ✅ Research and experimentation
+- ✅ Controlled production environments with appropriate security controls
+- ❌ Security-sensitive production workloads without proper risk assessment
+
+## Troubleshooting
+
+### Checkpoint Not Creating
+
+1. Check the checkpoint job:
+   ```bash
+   kubectl get jobs -l nvidia.com/checkpoint-source=true -n dynamo-system
+   kubectl logs job/checkpoint-<name> -n dynamo-system
+   ```
+
+2. Check the DaemonSet:
+   ```bash
+   kubectl logs daemonset/chrek-agent -n dynamo-system
+   ```
+
+3. Verify storage access:
+   ```bash
+   kubectl exec -it <checkpoint-agent-pod> -n dynamo-system -- ls -la /checkpoints
+   ```
+
+### Restore Failing
+
+1. Check pod logs:
+   ```bash
+   kubectl logs <worker-pod> -n dynamo-system
+   ```
+
+2. Verify checkpoint file exists:
+   ```bash
+   # For PVC
+   kubectl exec -it <any-pod-with-pvc> -n dynamo-system -- ls -la /checkpoints/
+
+   # For S3 (planned; not yet implemented)
+   aws s3 ls s3://my-bucket/checkpoints/
+   ```
+
+3. Check environment variables:
+   ```bash
+   kubectl exec <worker-pod> -n dynamo-system -- env | grep DYN_CHECKPOINT
+   ```
+
+### Cold Start Despite Checkpoint
+
+Pods fall back to cold start if:
+- Checkpoint file doesn't exist yet (still being created)
+- Checkpoint file is corrupted
+- CRIU restore fails
+
+Check the pod logs for the "Falling back to cold start" message.
+
+## Best Practices
+
+1. **Use RWX PVCs** for multi-node deployments (PVC is currently the only implemented storage backend)
+2. **Pre-warm checkpoints** before scaling up
+3. **Monitor checkpoint size** - large models create large checkpoints
+4. **Clean up old checkpoints** to save storage
+
+## Environment Variables
+
+| Variable | Description |
+|----------|-------------|
+| `DYN_CHECKPOINT_STORAGE_TYPE` | Backend: `pvc`, `s3`, `oci` (only `pvc` is currently implemented) |
+| `DYN_CHECKPOINT_LOCATION` | Source location (URI) |
+| `DYN_CHECKPOINT_PATH` | Local path to tar file |
+| `DYN_CHECKPOINT_HASH` | Identity hash (debugging) |
+| `DYN_CHECKPOINT_SIGNAL_FILE` | Signal file (creation mode only) |
+
+## Complete Example
+
+Create a checkpoint and use it in a DGD:
+
+```yaml
+# 1. Create the DynamoCheckpoint CR
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoCheckpoint
+metadata:
+  name: e5962d34ba272638  # 16-char hash (computed from identity)
+  namespace: dynamo-system
+spec:
+  identity:
+    model: "meta-llama/Meta-Llama-3-8B-Instruct"
+    backendFramework: "vllm"
+    tensorParallelSize: 1
+    dtype: "bfloat16"
+  job:
+    activeDeadlineSeconds: 3600
+    backoffLimit: 3
+    podTemplateSpec:
+      spec:
+        containers:
+          - name: main
+            image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
+            command: ["python3", "-m", "dynamo.vllm"]
+            args:
+              - "--model"
+              - "meta-llama/Meta-Llama-3-8B-Instruct"
+              - "--tensor-parallel-size"
+              - "1"
+              - "--dtype"
+              - "bfloat16"
+            env:
+              - name: HF_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: HF_TOKEN
+            resources:
+              limits:
+                nvidia.com/gpu: "1"
+        restartPolicy: Never
+---
+# 2. Wait for Ready: kubectl get dckpt e5962d34ba272638 -n dynamo-system -w
+---
+# 3. Reference the checkpoint in your DGD
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: my-llm
+  namespace: dynamo-system
+spec:
+  services:
+    VllmWorker:
+      replicas: 2
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
+      resources:
+        limits:
+          nvidia.com/gpu: "1"
+      checkpoint:
+        enabled: true
+        checkpointRef: "e5962d34ba272638"  # Reference by hash
+```
+
+## Related Documentation
+
+- [ChReK Overview](README.md) - ChReK architecture and use cases
+- [ChReK Standalone Usage Guide](standalone.md) - Use ChReK without Dynamo Platform
+- [ChReK Helm Chart README](../../../deploy/helm/charts/chrek/README.md) - Chart configuration
+- [Installation Guide](../installation_guide.md) - Platform installation
+- [API Reference](../api_reference.md) - Complete CRD specifications
+
--- a/docs/kubernetes/chrek/standalone.md
+++ b/docs/kubernetes/chrek/standalone.md
+# ChReK Standalone Usage Guide
+
+> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. Review the [security implications](#security-considerations) before deploying.
+
+This guide explains how to use **ChReK** (Checkpoint/Restore for Kubernetes) as a standalone component without deploying the full Dynamo platform. This is useful if you want to add checkpoint/restore capabilities to your own GPU workloads.
+
+## Table of Contents
+
+- [Overview](#overview)
+- [Prerequisites](#prerequisites)
+- [Step 1: Deploy ChReK](#step-1-deploy-chrek)
+- [Step 2: Build Checkpoint-Enabled Images](#step-2-build-checkpoint-enabled-images)
+- [Step 3: Create Checkpoint Jobs](#step-3-create-checkpoint-jobs)
+- [Step 4: Restore from Checkpoints](#step-4-restore-from-checkpoints)
+- [Environment Variables Reference](#environment-variables-reference)
+- [Checkpoint Flow Explained](#checkpoint-flow-explained)
+- [Troubleshooting](#troubleshooting)
+
+---
+
+## Overview
+
+When using ChReK standalone, you are responsible for:
+
+1. **Deploying the ChReK Helm chart** (DaemonSet + PVC)
+2. **Building checkpoint-enabled container images** with the restore entrypoint
+3. **Creating checkpoint jobs** with the correct environment variables
+4. **Creating restore pods** that detect and use the checkpoints
+
+The ChReK DaemonSet handles the actual CRIU checkpoint/restore operations automatically once your pods are configured correctly.
+
+---
+
+## Prerequisites
+
+- Kubernetes cluster with:
+  - NVIDIA GPUs with checkpoint support
+  - **Privileged security context allowed** (⚠️ required for CRIU - see [Security Considerations](#security-considerations))
+  - PVC storage (ReadWriteMany recommended for multi-node)
+- Docker or compatible container runtime for building images
+- Access to the ChReK source code: `deploy/chrek/`
+
+### Security Considerations
+
+⚠️ **Important**: ChReK restore operations **require privileged mode**, which has significant security implications:
+
+- **Privileged containers** can access all host devices and bypass most security restrictions
+- This may violate security policies in production environments
+- Privileged containers, if compromised, can potentially compromise node security
+
+**Recommended for:**
+- ✅ Development and testing environments
+- ✅ Research and experimentation
+- ✅ Controlled production environments with appropriate security controls
+
+**Not recommended for:**
+- ❌ Multi-tenant clusters without proper isolation
+- ❌ Security-sensitive production workloads without risk assessment
+- ❌ Environments with strict security compliance requirements
+
+### Technical Limitations
+
+⚠️ **Current Restrictions:**
+- **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
+- **Single-node only**: Checkpoints must be created and restored on the same node
+- **Single-GPU only**: Multi-GPU configurations are not yet supported
+- **Network state**: Active TCP connections are closed during restore
+- **Storage**: Only PVC backend currently implemented (S3/OCI planned)
+
+---
+
+## Step 1: Deploy ChReK
+
+### Install the Helm Chart
+
+```bash
+# Clone the repository
+git clone https://github.com/ai-dynamo/dynamo.git
+cd dynamo
+
+# Install ChReK in your namespace
+helm install chrek ./deploy/helm/charts/chrek \
+  --namespace my-app \
+  --create-namespace \
+  --set storage.pvc.size=100Gi \
+  --set storage.pvc.storageClass=your-storage-class
+```
+
+### Verify Installation
+
+```bash
+# Check the DaemonSet is running
+kubectl get daemonset -n my-app
+# NAME          DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE
+# chrek-agent   3         3         3       3            3
+
+# Check the PVC is bound
+kubectl get pvc -n my-app
+# NAME        STATUS   VOLUME     CAPACITY   ACCESS MODES   STORAGECLASS
+# chrek-pvc   Bound    pvc-xyz    100Gi      RWX            your-storage-class
+```
+
+---
+
+## Step 2: Build Checkpoint-Enabled Images
+
+ChReK provides a convenient `placeholder` target in its Dockerfile that automatically injects checkpoint/restore capabilities into your existing container images.
+
+### Quick Start: Using the Placeholder Target (Recommended)
+
+```bash
+cd deploy/chrek
+
+# Define your images
+export BASE_IMAGE="your-app:latest"           # Your existing application image
+export RESTORE_IMAGE="your-app:checkpoint-enabled"  # Output checkpoint-enabled image
+
+# Build using the placeholder target
+docker build \
+  --target placeholder \
+  --build-arg BASE_IMAGE="$BASE_IMAGE" \
+  -t "$RESTORE_IMAGE" \
+  .
+
+# Push to your registry
+docker push "$RESTORE_IMAGE"
+```
+
+**Example with a Dynamo vLLM image:**
+
+```bash
+cd deploy/chrek
+
+export DYNAMO_IMAGE="nvidia/dynamo-vllm:v1.2.0"
+export RESTORE_IMAGE="nvidia/dynamo-vllm:v1.2.0-checkpoint"
+
+docker build \
+  --target placeholder \
+  --build-arg BASE_IMAGE="$DYNAMO_IMAGE" \
+  -t "$RESTORE_IMAGE" \
+  .
+```
+
+### What the Placeholder Target Does
+
+The ChReK Dockerfile's `placeholder` stage automatically:
+
+- ✅ Builds the restore-entrypoint binary
+- ✅ Injects it into `/usr/local/bin/restore-entrypoint`
+- ✅ Adds `smart-entrypoint.sh` to `/usr/local/bin/`
+- ✅ Sets executable permissions
+- ✅ Configures the entrypoint to detect and restore checkpoints
+- ✅ Preserves your original application CMD
+
+### Alternative: Manual Multi-Stage Build
+
+If you need more control, you can create your own Dockerfile:
+
+```dockerfile
+# Stage 1: Build restore-entrypoint
+FROM golang:1.23-alpine AS restore-builder
+WORKDIR /build
+COPY deploy/chrek/cmd/restore-entrypoint ./cmd/restore-entrypoint
+COPY deploy/chrek/pkg ./pkg
+COPY deploy/chrek/go.mod deploy/chrek/go.sum ./
+
+RUN go build -o /restore-entrypoint ./cmd/restore-entrypoint
+
+# Stage 2: Your application image
+FROM your-base-image:latest
+
+# Copy restore-entrypoint
+COPY --from=restore-builder /restore-entrypoint /usr/local/bin/restore-entrypoint
+
+# Copy smart-entrypoint.sh
+COPY deploy/chrek/scripts/smart-entrypoint.sh /usr/local/bin/smart-entrypoint.sh
+RUN chmod +x /usr/local/bin/smart-entrypoint.sh /usr/local/bin/restore-entrypoint
+
+# Set smart-entrypoint as the default entrypoint
+ENTRYPOINT ["/usr/local/bin/smart-entrypoint.sh"]
+
+# Your application command (becomes CMD, can be overridden)
+CMD ["python", "your_app.py"]
+```
+
+> **💡 Tip**: Using the `placeholder` target is the recommended approach as it's maintained with the ChReK codebase and ensures compatibility.
+
+---
+
+## Step 3: Create Checkpoint Jobs
+
+A checkpoint job loads your application, waits for the ChReK DaemonSet to checkpoint it, and then exits.
+
+### Required Environment Variables
+
+Your checkpoint job MUST set these environment variables:
+
+| Variable | Description | Example |
+|----------|-------------|---------|
+| `DYN_CHECKPOINT_SIGNAL_FILE` | Path where DaemonSet writes completion signal | `/checkpoint-signal/my-checkpoint.done` |
+| `DYN_CHECKPOINT_READY_FILE` | Path where your app signals it's ready | `/tmp/checkpoint-ready` |
+| `DYN_CHECKPOINT_HASH` | Unique identifier for this checkpoint | `abc123def456` |
+| `DYN_CHECKPOINT_LOCATION` | Directory where checkpoint is stored | `/checkpoints/abc123def456` |
+| `DYN_CHECKPOINT_STORAGE_TYPE` | Storage backend type | `pvc` |
+
+### Required Labels
+
+Add this label to enable DaemonSet checkpoint detection:
+
+```yaml
+labels:
+  nvidia.com/checkpoint-source: "true"
+```
+
+### Example Checkpoint Job
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: checkpoint-my-model
+  namespace: my-app
+spec:
+  template:
+    metadata:
+      labels:
+        nvidia.com/checkpoint-source: "true"  # Required for DaemonSet detection
+    spec:
+      restartPolicy: Never
+
+      # Init container to clean up stale signal files
+      initContainers:
+      - name: cleanup-signal-file
+        image: busybox:latest
+        command:
+        - sh
+        - -c
+        - |
+          rm -f /checkpoint-signal/my-checkpoint.done || true
+          echo "Signal file cleanup complete"
+        volumeMounts:
+        - name: checkpoint-signal
+          mountPath: /checkpoint-signal
+
+      containers:
+      - name: main
+        image: my-app:checkpoint-enabled
+
+        # Security context required for CRIU
+        securityContext:
+          privileged: true
+          capabilities:
+            add: ["SYS_ADMIN", "SYS_PTRACE", "SYS_CHROOT"]
+
+        # Readiness probe: Pod becomes Ready when model is loaded
+        # This is what triggers the DaemonSet to start checkpointing
+        readinessProbe:
+          exec:
+            command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"]
+          initialDelaySeconds: 15
+          periodSeconds: 2
+
+        # Remove liveness/startup probes for checkpoint jobs
+        # Model loading can take several minutes
+        livenessProbe: null
+        startupProbe: null
+
+        # Checkpoint-related environment variables
+        env:
+        - name: DYN_CHECKPOINT_SIGNAL_FILE
+          value: "/checkpoint-signal/my-checkpoint.done"
+        - name: DYN_CHECKPOINT_READY_FILE
+          value: "/tmp/checkpoint-ready"
+        - name: DYN_CHECKPOINT_HASH
+          value: "abc123def456"
+        - name: DYN_CHECKPOINT_LOCATION
+          value: "/checkpoints/abc123def456"
+        - name: DYN_CHECKPOINT_STORAGE_TYPE
+          value: "pvc"
+
+        # GPU request
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+
+        # Required volume mounts
+        volumeMounts:
+        - name: checkpoint-storage
+          mountPath: /checkpoints
+        - name: checkpoint-signal
+          mountPath: /checkpoint-signal
+        - name: tmp
+          mountPath: /tmp
+
+      volumes:
+      - name: checkpoint-storage
+        persistentVolumeClaim:
+          claimName: chrek-pvc
+      - name: checkpoint-signal
+        hostPath:
+          path: /var/lib/chrek/signals
+          type: DirectoryOrCreate
+      - name: tmp
+        emptyDir: {}
+```
+
+### Application Code Requirements
+
+Your application must implement the checkpoint flow. Here's the pattern used by Dynamo vLLM:
+
+```python
+import os
+import time
+
+def main():
+    # 1. Check for checkpoint mode
+    signal_file = os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE")
+    ready_file = os.environ.get("DYN_CHECKPOINT_READY_FILE")
+    restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored")
+
+    is_checkpoint_mode = signal_file is not None
+
+    if is_checkpoint_mode:
+        print("Checkpoint mode detected")
+
+        # 2. Load your model/application
+        model = load_model()
+
+        # 3. Optional: Put model to sleep to reduce memory footprint
+        # model.sleep()
+
+        # 4. Write ready file (checked by the Pod's readiness probe; Pod Ready triggers the checkpoint)
+        if ready_file:
+            with open(ready_file, "w") as f:
+                f.write("ready")
+            print(f"Wrote checkpoint ready file: {ready_file}")
+
+        # 5. Log readiness messages (helps debugging)
+        print("CHECKPOINT_READY: Model loaded, ready for container checkpoint")
+        print(f"CHECKPOINT_READY: Waiting for signal file: {signal_file}")
+        print(f"CHECKPOINT_READY: Or restore marker file: {restore_marker}")
+
+        # 6. Wait for checkpoint completion OR restore detection
+        while True:
+            # Check if we've been restored (marker file created by restore entrypoint)
+            if os.path.exists(restore_marker):
+                print(f"Detected restore from checkpoint (marker: {restore_marker})")
+                # Continue with normal application flow
+                break
+
+            # Check if checkpoint is complete (signal file created by DaemonSet)
+            if os.path.exists(signal_file):
+                print(f"Checkpoint signal file detected: {signal_file}")
+                print("Checkpoint complete, exiting")
+                return  # Exit gracefully
+
+            time.sleep(1)
+
+    # Normal application flow (or post-restore flow)
+    run_application()
+```
+
+**Important Notes:**
+
+1. **Ready File & Readiness Probe**: The checkpoint job must have a readiness probe that checks for the ready file:
+   ```yaml
+   readinessProbe:
+     exec:
+       command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"]
+     initialDelaySeconds: 15
+     periodSeconds: 2
+   ```
+   The ChReK DaemonSet triggers checkpointing when:
+   - Pod has `nvidia.com/checkpoint-source: "true"` label
+   - Pod status is `Ready` (readiness probe passes = ready file exists)
+
+2. **Restore Marker**: Created by `restore-entrypoint` before CRIU restore; it allows the restored process to detect that it was restored.
+
+3. **Two Exit Paths** (out of the wait loop):
+   - **Signal file found**: Checkpoint complete; the process exits gracefully
+   - **Restore marker found**: Process was restored; it leaves the loop and continues with the normal application flow
+
+
+---
+
+## Step 4: Restore from Checkpoints
+
+Restore pods automatically detect and restore from checkpoints if they exist.
+
+### Example Restore Pod
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: my-app-restored
+  namespace: my-app
+spec:
+  restartPolicy: Never
+
+  containers:
+  - name: main
+    image: my-app:checkpoint-enabled
+
+    # Security context required for CRIU restore
+    securityContext:
+      privileged: true
+      capabilities:
+        add: ["SYS_ADMIN", "SYS_PTRACE", "SYS_CHROOT"]
+
+    # Set checkpoint environment variables
+    env:
+    - name: DYN_CHECKPOINT_HASH
+      value: "abc123def456"  # Must match checkpoint job
+    - name: DYN_CHECKPOINT_PATH
+      value: "/checkpoints"  # Base path (hash appended automatically)
+
+    # Optional: Customize restore marker file path
+    # - name: DYN_RESTORE_MARKER_FILE
+    #   value: "/tmp/dynamo-restored"
+
+    # GPU request
+    resources:
+      limits:
+        nvidia.com/gpu: 1
+
+    # Mount checkpoint storage (READ-ONLY is fine for restore)
+    volumeMounts:
+    - name: checkpoint-storage
+      mountPath: /checkpoints
+      readOnly: true
+    - name: checkpoint-signal
+      mountPath: /checkpoint-signal
+
+  volumes:
+  - name: checkpoint-storage
+    persistentVolumeClaim:
+      claimName: chrek-pvc
+  - name: checkpoint-signal
+    hostPath:
+      path: /var/lib/chrek/signals
+      type: DirectoryOrCreate
+```
+
+### How Restore Works
+
+1. **Smart Entrypoint Detects Checkpoint**: The `smart-entrypoint.sh` checks if a checkpoint exists at `/checkpoints/${DYN_CHECKPOINT_HASH}/`
+2. **Calls Restore Entrypoint**: If found, calls `/usr/local/bin/restore-entrypoint` which invokes CRIU
+3. **CRIU Restores Process**: The entire process tree is restored from the checkpoint, including GPU state
+4. **Application Continues**: Your application resumes exactly where it was checkpointed
+
+---
+
+## Environment Variables Reference
+
+### Checkpoint Jobs
+
+| Variable | Required | Description |
+|----------|----------|-------------|
+| `DYN_CHECKPOINT_SIGNAL_FILE` | Yes | Full path to signal file (e.g., `/checkpoint-signal/my-checkpoint.done`) |
+| `DYN_CHECKPOINT_READY_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/checkpoint-ready`) |
+| `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (alphanumeric string) |
+| `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123`) |
+| `DYN_CHECKPOINT_STORAGE_TYPE` | Yes | Storage backend: `pvc`, `s3`, or `oci` (only `pvc` is currently implemented) |
+
+### Restore Pods
+
+| Variable | Required | Description |
+|----------|----------|-------------|
+| `DYN_CHECKPOINT_HASH` | Yes | Checkpoint identifier (must match checkpoint job) |
+| `DYN_CHECKPOINT_PATH` | Yes | Base checkpoint directory (hash appended automatically) |
+| `DYN_RESTORE_MARKER_FILE` | No | Path for restore marker file (default: `/tmp/dynamo-restored`) |
+
+### Optional CRIU Tuning (Advanced)
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `CRIU_TIMEOUT` | `0` (unlimited) | CRIU operation timeout in seconds |
+| `CRIU_LOG_LEVEL` | `4` | CRIU log verbosity (0-4) |
+| `CRIU_WORK_DIR` | `/tmp` | CRIU working directory |
+| `CUDA_PLUGIN_DIR` | `/usr/local/lib/criu` | Path to CRIU CUDA plugin |
+| `CRIU_SKIP_IN_FLIGHT` | `false` | Skip in-flight TCP connections |
+| `CRIU_AUTO_DEDUP` | `false` | Enable auto-deduplication |
+| `CRIU_LAZY_PAGES` | `false` | Enable lazy page migration (experimental) |
+| `WAIT_FOR_CHECKPOINT` | `false` | Wait for checkpoint to appear before starting |
+| `RESTORE_WAIT_TIMEOUT` | `300` | Max seconds to wait for checkpoint |
+| `DEBUG` | `false` | Enable debug mode (sleeps 300s on error) |
+
+---
+
+## Checkpoint Flow Explained
+
+### 1. Checkpoint Creation Flow
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ 1. Pod starts with nvidia.com/checkpoint-source=true label  │
+└──────────────────────┬──────────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 2. Application loads model and creates ready file           │
+│    /tmp/checkpoint-ready                                     │
+└──────────────────────┬──────────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 3. Pod becomes Ready (kubelet readiness probe passes)       │
+└──────────────────────┬──────────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 4. ChReK DaemonSet detects:                                 │
+│    - Pod is Ready                                            │
+│    - Has checkpoint-source label                             │
+│    - Ready file exists: /tmp/checkpoint-ready               │
+└──────────────────────┬──────────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 5. DaemonSet executes CRIU checkpoint via runc:             │
+│    - Freezes container process                               │
+│    - Dumps memory (CPU + GPU)                                │
+│    - Saves to /checkpoints/${HASH}/                          │
+└──────────────────────┬──────────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 6. DaemonSet writes signal file:                            │
+│    /checkpoint-signal/${HASH}.done                           │
+└──────────────────────┬──────────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 7. Application detects signal file and exits gracefully     │
+└─────────────────────────────────────────────────────────────┘
+```
+
+### 2. Restore Flow
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ 1. Pod starts with DYN_CHECKPOINT_HASH set                  │
+└──────────────────────┬──────────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 2. smart-entrypoint.sh checks for checkpoint:               │
+│    /checkpoints/${DYN_CHECKPOINT_HASH}/checkpoint.done      │
+└──────────────────────┬──────────────────────────────────────┘
+                       │
+                       ├─ Not Found ─────────────────┐
+                       │                              │
+                       ▼                              ▼
+           ┌───────────────────────┐    ┌──────────────────────┐
+           │ Checkpoint exists     │    │ Cold start           │
+           └──────────┬────────────┘    │ Run original CMD     │
+                      │                 └──────────────────────┘
+                      ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 3. Call restore-entrypoint with checkpoint path             │
+└──────────────────────┬──────────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 4. restore-entrypoint extracts checkpoint and calls CRIU:   │
+│    criu restore --images-dir /checkpoints/${HASH}/images    │
+└──────────────────────┬──────────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 5. CRIU restores process from checkpoint                    │
+│    - Restores memory (CPU + GPU)                             │
+│    - Restores file descriptors                               │
+│    - Resumes process execution                               │
+└──────────────────────┬──────────────────────────────────────┘
+                       │
+                       ▼
+┌─────────────────────────────────────────────────────────────┐
+│ 6. Application continues from checkpointed state            │
+│    (Model already loaded, GPU memory initialized)           │
+└─────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## Troubleshooting
+
+### Checkpoint Not Created
+
+**Symptom**: Job runs but no checkpoint appears in `/checkpoints/`
+
+**Checks**:
+1. Verify the pod has the label:
+   ```bash
+   kubectl get pod <pod-name> -o jsonpath='{.metadata.labels.nvidia\.com/checkpoint-source}'
+   ```
+
+2. Check pod readiness:
+   ```bash
+   kubectl get pod <pod-name> -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
+   ```
+
+3. Check ready file was created:
+   ```bash
+   kubectl exec <pod-name> -- ls -la /tmp/checkpoint-ready
+   ```
+
+4. Check DaemonSet logs:
+   ```bash
+   kubectl logs -n my-app daemonset/chrek-agent --all-containers
+   ```
+
+### Restore Fails
+
+**Symptom**: Pod fails to restore from checkpoint
+
+**Checks**:
+1. Verify checkpoint files exist:
+   ```bash
+   kubectl exec <pod-name> -- ls -la /checkpoints/${DYN_CHECKPOINT_HASH}/
+   ```
+
+2. Check privileged mode is enabled:
+   ```bash
+   kubectl get pod <pod-name> -o jsonpath='{.spec.containers[0].securityContext.privileged}'
+   ```
+
+3. Check CRIU logs in `/tmp/criu-restore.log`:
+   ```bash
+   kubectl exec <pod-name> -- cat /tmp/criu-restore.log
+   ```
+
+4. Ensure the checkpoint job and the restore pod use the same:
+   - Container image
+   - GPU count
+   - Volume mounts
+   - Environment variables (except POD_NAME, POD_IP, etc.)
+
+### Permission Denied Errors
+
+**Symptom**: `CRIU: Permission denied` or `Operation not permitted`
+
+**Solution**: Ensure pod has:
+```yaml
+securityContext:
+  privileged: true
+  capabilities:
+    add:
+    - SYS_ADMIN
+    - SYS_PTRACE
+    - SYS_CHROOT
+```
+
+### Signal File Not Appearing
+
+**Symptom**: Application waits forever for signal file
+
+**Checks**:
+1. Verify hostPath mount is correct:
+   ```bash
+   kubectl get pod <pod-name> -o jsonpath='{.spec.volumes[?(@.name=="checkpoint-signal")]}'
+   ```
+
+2. Check DaemonSet has access to the same path:
+   ```bash
+   kubectl get daemonset -n my-app chrek-agent -o jsonpath='{.spec.template.spec.volumes[?(@.name=="signal-dir")]}'
+   ```
+
+3. Verify paths match exactly:
+   - Pod: `/var/lib/chrek/signals`
+   - DaemonSet: `/var/lib/chrek/signals`
+
+---
+
+## Additional Resources
+
+- [ChReK Helm Chart Values](../../../deploy/helm/charts/chrek/values.yaml)
+- [Smart Entrypoint Script](../../../deploy/chrek/scripts/smart-entrypoint.sh)
+- [CRIU Documentation](https://criu.org/Main_Page)
+- [CUDA Checkpoint Plugin](https://docs.nvidia.com/cuda/cuda-checkpoint-plugin/)
+
+---
+
+## Getting Help
+
+If you encounter issues:
+
+1. Check the [Troubleshooting](#troubleshooting) section
+2. Review DaemonSet logs: `kubectl logs -n <namespace> daemonset/chrek-agent`
+3. Open an issue on [GitHub](https://github.com/ai-dynamo/dynamo/issues)