feat: introducing ChReK (Checkpoint Restore in K8s) (#4978)

Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>

feat: introducing ChReK (Checkpoint Restore in K8s) (#4978)
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
f3aa1e01 · Julien Mancuso · GitHub · 44986bf5 · f3aa1e01 · f3aa1e01
Unverified Commit f3aa1e01 authored Feb 03, 2026 by Julien Mancuso Committed by GitHub Feb 03, 2026
20 changed files
--- a/deploy/chrek/pkg/restore/criu.go
+++ b/deploy/chrek/pkg/restore/criu.go
+// criu provides CRIU-specific configuration and utilities for restore operations.
+package restore
+import (
+	"os"
+	criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+	"google.golang.org/protobuf/proto"
+	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
+)
+// CRIURestoreConfig holds the per-restore inputs that BuildRestoreCRIUOpts copies into the CRIU options.
+// All other options set by BuildRestoreCRIUOpts are hardcoded; for the FD fields "unset" is a negative value, not zero.
+type CRIURestoreConfig struct {
+	ImageDirFD   int32  // FD of the checkpoint image directory (sent as ImagesDirFd)
+	RootPath     string // root filesystem path (sent as Root)
+	LogLevel     int32  // CRIU log verbosity (sent as LogLevel)
+	LogFile      string // CRIU log file name (sent as LogFile)
+	WorkDirFD    int32  // FD of the CRIU work directory; only sent when >= 0
+	NetNsFD      int32  // FD inherited by CRIU under the key "extNetNs"; only sent when >= 0
+	ExtMountMaps []*criurpc.ExtMountMap // external mount mappings (sent as ExtMnt)
+}
+// OpenImageDir opens the checkpoint image directory so that its descriptor can
+// be handed to CRIU. It delegates to common.OpenDirForCRIU (presumably the
+// place where CLOEXEC is cleared — confirm there) and returns the open file
+// together with its FD. The caller owns the file and must close it.
+func OpenImageDir(checkpointPath string) (*os.File, int32, error) {
+	dir, fd, err := common.OpenDirForCRIU(checkpointPath)
+	return dir, fd, err
+}
+// OpenNetworkNamespace opens the network namespace handle at nsPath that the
+// restored process should join. It reuses common.OpenDirForCRIU and returns
+// the open file together with its FD. The caller owns the file and must close it.
+// NOTE(review): nsPath is a namespace file, not a directory — confirm that
+// common.OpenDirForCRIU does not require a directory.
+func OpenNetworkNamespace(nsPath string) (*os.File, int32, error) {
+	ns, fd, err := common.OpenDirForCRIU(nsPath)
+	return ns, fd, err
+}
+// OpenWorkDir opens a work directory for CRIU and clears CLOEXEC.
+// Returns the opened file and its FD, or nil/-1 if workDir is empty or fails.
+func OpenWorkDir(workDir string, log *logrus.Entry) (*os.File, int32) {
+	if workDir == "" {
+		return nil, -1
+	}
+	// Ensure work directory exists
+	if err := os.MkdirAll(workDir, 0755); err != nil {
+		log.WithError(err).Warn("Failed to create CRIU work directory, using default")
+		return nil, -1
+	}
+	workDirFile, err := os.Open(workDir)
+	if err != nil {
+		log.WithError(err).Warn("Failed to open CRIU work directory, using default")
+		return nil, -1
+	}
+	if _, err := unix.FcntlInt(workDirFile.Fd(), unix.F_SETFD, 0); err != nil {
+		log.WithError(err).Warn("Failed to clear CLOEXEC on work dir")
+		workDirFile.Close()
+		return nil, -1
+	}
+	log.WithField("path", workDir).Info("Using custom CRIU work directory")
+	return workDirFile, int32(workDirFile.Fd())
+}
+// BuildRestoreCRIUOpts translates a CRIURestoreConfig into the CRIU RPC
+// options used for restore.
+//
+// Taken from cfg: image dir FD, root path, log level/file, external mount
+// maps, and (only when their FD is >= 0) the network namespace and the work
+// directory.
+//
+// Hardcoded for K8s:
+//   - ShellJob: containers are often session leaders
+//   - TcpClose: pod IPs change on restore/migration
+//   - FileLocks: applications use file locks
+//   - ExtUnixSk: containers have external Unix sockets
+//   - ManageCgroups with mode IGNORE: let K8s manage cgroups
+//   - RstSibling, MntnsCompatMode, EvasiveDevices, ForceIrmap
+func BuildRestoreCRIUOpts(cfg CRIURestoreConfig) *criurpc.CriuOpts {
+	ignoreCgroups := criurpc.CriuCgMode_IGNORE
+	opts := &criurpc.CriuOpts{
+		// Inputs supplied by the caller.
+		ImagesDirFd: proto.Int32(cfg.ImageDirFD),
+		Root:        proto.String(cfg.RootPath),
+		LogLevel:    proto.Int32(cfg.LogLevel),
+		LogFile:     proto.String(cfg.LogFile),
+		ExtMnt:      cfg.ExtMountMaps,
+		// NOTE(review): the original comment called this "detached mode"; CRIU
+		// documents rst_sibling as restoring the root task as a sibling of the
+		// CRIU process — confirm which is intended.
+		RstSibling: proto.Bool(true),
+		// Mount namespace compatibility mode for cross-container restore.
+		MntnsCompatMode: proto.Bool(true),
+		// Always-on for K8s environments.
+		ShellJob:  proto.Bool(true),
+		TcpClose:  proto.Bool(true),
+		FileLocks: proto.Bool(true),
+		ExtUnixSk: proto.Bool(true),
+		// Cgroup handling is enabled but in IGNORE mode to avoid conflicts.
+		ManageCgroups:     proto.Bool(true),
+		ManageCgroupsMode: &ignoreCgroups,
+		// Device and inode handling.
+		EvasiveDevices: proto.Bool(true),
+		ForceIrmap:     proto.Bool(true),
+	}
+	// A negative FD means "not provided" for both optional descriptors.
+	if cfg.NetNsFD >= 0 {
+		netNs := &criurpc.InheritFd{
+			Key: proto.String("extNetNs"),
+			Fd:  proto.Int32(cfg.NetNsFD),
+		}
+		opts.InheritFd = []*criurpc.InheritFd{netNs}
+	}
+	if cfg.WorkDirFD >= 0 {
+		opts.WorkDirFd = proto.Int32(cfg.WorkDirFD)
+	}
+	return opts
+}
--- a/deploy/chrek/pkg/restore/filesystem.go
+++ b/deploy/chrek/pkg/restore/filesystem.go
+package restore
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"github.com/sirupsen/logrus"
+)
+// File names looked up inside a checkpoint directory by the filesystem restore steps.
+const (
+	// RootfsDiffFilename is the tar archive that ApplyRootfsDiff extracts.
+	RootfsDiffFilename = "rootfs-diff.tar"
+	// DeletedFilesFilename is the JSON list that ApplyDeletedFiles reads.
+	DeletedFilesFilename = "deleted-files.json"
+)
+// ApplyRootfsDiff extracts <checkpointPath>/rootfs-diff.tar into targetRoot by
+// running the external "tar" command, restoring filesystem changes captured
+// from the original container.
+//
+// A missing archive is not an error (the step is skipped). tar exit code 1 is
+// logged and treated as success; any other failure is returned together with
+// tar's combined output.
+func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error {
+	archive := filepath.Join(checkpointPath, RootfsDiffFilename)
+	if _, statErr := os.Stat(archive); os.IsNotExist(statErr) {
+		log.Info("No rootfs-diff.tar found, skipping filesystem restoration")
+		return nil
+	}
+	log.WithField("path", archive).Info("Applying rootfs diff")
+	tarArgs := []string{
+		// Never overwrite files that already exist (they may be mounted).
+		"--keep-old-files",
+		// Paths that are typically mounted read-only by the container runtime.
+		"--exclude=./run/secrets",
+		"--exclude=./etc/resolv.conf",
+		"--exclude=./etc/hostname",
+		"--exclude=./etc/hosts",
+		"-C", targetRoot,
+		"-xf", archive,
+	}
+	out, runErr := exec.Command("tar", tarArgs...).CombinedOutput()
+	if runErr == nil {
+		log.Info("Rootfs diff applied successfully")
+		return nil
+	}
+	// Exit code 1 is tolerated as "some files were skipped".
+	// NOTE(review): GNU tar documents --keep-old-files as treating existing
+	// files as errors (exit status 2); verify which tar the image ships and
+	// whether exit code 1 really covers the "file exists" case.
+	exitErr, isExit := runErr.(*exec.ExitError)
+	if isExit && exitErr.ExitCode() == 1 {
+		log.WithField("output", string(out)).Info("Rootfs diff applied (some files may have been skipped due to mounts)")
+		return nil
+	}
+	return fmt.Errorf("failed to extract rootfs diff: %w (output: %s)", runErr, string(out))
+}
+// ApplyDeletedFiles removes, under targetRoot, every path listed in
+// <checkpointPath>/deleted-files.json (a JSON array of strings recording files
+// deleted in the original container).
+//
+// A missing list is not an error. Read or parse failures are returned.
+// Individual removal failures are logged at debug level and skipped, so the
+// function is best-effort per entry.
+//
+// Each entry is confined to targetRoot: ".." segments cannot climb above it,
+// and an entry that resolves to the root itself is refused rather than passed
+// to os.RemoveAll.
+func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) error {
+	deletedFilesPath := filepath.Join(checkpointPath, DeletedFilesFilename)
+	data, err := os.ReadFile(deletedFilesPath)
+	if os.IsNotExist(err) {
+		log.Debug("No deleted-files.json found")
+		return nil
+	}
+	if err != nil {
+		return fmt.Errorf("failed to read deleted files list: %w", err)
+	}
+	log.Info("Applying deleted files from whiteout list")
+	// Parse JSON array of deleted file paths
+	var deletedFiles []string
+	if err := json.Unmarshal(data, &deletedFiles); err != nil {
+		return fmt.Errorf("failed to parse deleted files JSON: %w", err)
+	}
+	rootOnly := string(filepath.Separator)
+	deletedCount := 0
+	for _, filePath := range deletedFiles {
+		if filePath == "" {
+			continue
+		}
+		// Anchoring the entry at "/" before joining makes Clean collapse any
+		// leading ".." segments, so the result can never leave targetRoot.
+		confined := filepath.Join(rootOnly, filePath)
+		if confined == rootOnly {
+			// "/", "." or ".." would otherwise wipe the whole target root.
+			log.WithField("path", filePath).Warn("Refusing to delete target root listed in deleted files")
+			continue
+		}
+		targetPath := filepath.Join(targetRoot, confined)
+		// Lstat (not Stat) so that a dangling symlink still counts as present
+		// and gets removed.
+		if _, err := os.Lstat(targetPath); os.IsNotExist(err) {
+			continue
+		}
+		if err := os.RemoveAll(targetPath); err != nil {
+			log.WithError(err).WithField("path", targetPath).Debug("Could not delete file")
+			continue
+		}
+		deletedCount++
+	}
+	log.WithField("count", deletedCount).Info("Deleted files applied")
+	return nil
+}
+// CheckpointFilesExist reports whether checkpointPath contains at least one
+// file matching "core-*.img". A glob error is reported as false.
+func CheckpointFilesExist(checkpointPath string) bool {
+	pattern := filepath.Join(checkpointPath, "core-*.img")
+	coreImages, globErr := filepath.Glob(pattern)
+	return globErr == nil && len(coreImages) > 0
+}
--- a/deploy/chrek/pkg/restore/mounts.go
+++ b/deploy/chrek/pkg/restore/mounts.go
+package restore
+import (
+	"fmt"
+	criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
+	"google.golang.org/protobuf/proto"
+	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
+)
+// GenerateExtMountMaps generates external mount mappings for CRIU restore.
+// It parses /proc/1/mountinfo (the restore container's mounts) and adds
+// mappings for all mount points plus masked/readonly paths from common.
+//
+// If meta is nil or doesn't have OCI-derived paths, falls back to defaults.
+func GenerateExtMountMaps(meta *common.CheckpointMetadata) ([]*criurpc.ExtMountMap, error) {
+	var maps []*criurpc.ExtMountMap
+	addedMounts := make(map[string]bool)
+	// Add root filesystem mapping first
+	maps = append(maps, &criurpc.ExtMountMap{
+		Key: proto.String("/"),
+		Val: proto.String("."),
+	})
+	addedMounts["/"] = true
+	// Parse /proc/1/mountinfo for all current mount points
+	mountPoints, err := common.GetMountPointPaths("/proc/1/mountinfo")
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse mountinfo: %w", err)
+	}
+	for _, mountPoint := range mountPoints {
+		if addedMounts[mountPoint] || mountPoint == "/" {
+			continue
+		}
+		maps = append(maps, &criurpc.ExtMountMap{
+			Key: proto.String(mountPoint),
+			Val: proto.String(mountPoint),
+		})
+		addedMounts[mountPoint] = true
+	}
+	// Use masked paths from checkpoint metadata (OCI spec derived)
+	// Fall back to defaults for backwards compatibility
+	maskedPaths := common.DefaultMaskedPaths()
+	if meta != nil && len(meta.MaskedPaths) > 0 {
+		maskedPaths = meta.MaskedPaths
+	}
+	for _, path := range maskedPaths {
+		if addedMounts[path] {
+			continue
+		}
+		maps = append(maps, &criurpc.ExtMountMap{
+			Key: proto.String(path),
+			Val: proto.String(path),
+		})
+		addedMounts[path] = true
+	}
+	// Also add readonly paths from metadata if available
+	if meta != nil {
+		for _, path := range meta.ReadonlyPaths {
+			if addedMounts[path] {
+				continue
+			}
+			maps = append(maps, &criurpc.ExtMountMap{
+				Key: proto.String(path),
+				Val: proto.String(path),
+			})
+			addedMounts[path] = true
+		}
+	}
+	return maps, nil
+}
+// AddExtMountMap builds a single ExtMountMap entry mapping key to val.
+// Despite the name it does not add the entry anywhere; it only returns it.
+func AddExtMountMap(key, val string) *criurpc.ExtMountMap {
+	entry := new(criurpc.ExtMountMap)
+	entry.Key = proto.String(key)
+	entry.Val = proto.String(val)
+	return entry
+}
--- a/deploy/chrek/pkg/restore/notify.go
+++ b/deploy/chrek/pkg/restore/notify.go
+package restore
+import (
+	criu "github.com/checkpoint-restore/go-criu/v7"
+	"github.com/sirupsen/logrus"
+)
+// RestoreNotify implements criu.Notify for restore callbacks.
+// It records the PID passed to PostRestore and logs PreRestore, PostRestore and PostResume when a logger is set.
+type RestoreNotify struct {
+	criu.NoNotify // Embed no-op implementation for all methods not overridden on RestoreNotify
+	// RestoredPID is the PID received by the PostRestore callback; it stays 0 until that callback runs
+	RestoredPID int32
+	// log is the logger for notification events; the callbacks skip logging when it is nil
+	log *logrus.Entry
+}
+// NewRestoreNotify returns a RestoreNotify that logs through log.
+// A nil log is allowed and disables logging in the callbacks.
+func NewRestoreNotify(log *logrus.Entry) *RestoreNotify {
+	handler := new(RestoreNotify)
+	handler.log = log
+	return handler
+}
+// PreRestore is the callback CRIU invokes before starting the restore.
+// It only emits a debug log (when a logger is set) and never fails.
+func (n *RestoreNotify) PreRestore() error {
+	if n.log == nil {
+		return nil
+	}
+	n.log.Debug("CRIU pre-restore notification")
+	return nil
+}
+// PostRestore is the callback CRIU invokes once the restore has completed.
+// It stores pid, the PID of the restored process, in RestoredPID and logs it
+// when a logger is set. It never fails.
+func (n *RestoreNotify) PostRestore(pid int32) error {
+	n.RestoredPID = pid
+	if n.log == nil {
+		return nil
+	}
+	n.log.WithField("pid", pid).Info("CRIU post-restore notification: process restored")
+	return nil
+}
+// PostResume is the callback CRIU invokes after the restored process has
+// resumed execution. It only emits a debug log (when a logger is set) and
+// never fails.
+func (n *RestoreNotify) PostResume() error {
+	if n.log == nil {
+		return nil
+	}
+	n.log.Debug("CRIU post-resume notification")
+	return nil
+}
--- a/deploy/chrek/pkg/restore/options.go
+++ b/deploy/chrek/pkg/restore/options.go
+// Package restore provides CRIU restore operations for self-restoring placeholder containers.
+package restore
+import (
+	"context"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+	criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
+	"github.com/sirupsen/logrus"
+	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
+)
+// Config holds the configuration for the restore entrypoint.
+// ConfigFromEnv fills every field from the environment variable named in that field's comment.
+type Config struct {
+	// CheckpointPath is the base directory containing checkpoints (default: /checkpoints)
+	// Env: DYN_CHECKPOINT_PATH
+	CheckpointPath string
+	// CheckpointHash is the ID/hash of the checkpoint to restore; ShouldRestore appends it to CheckpointPath
+	// Env: DYN_CHECKPOINT_HASH
+	CheckpointHash string
+	// RestoreTrigger is the path to a file whose contents name the checkpoint to restore (Env: RESTORE_TRIGGER, default: /tmp/restore-trigger)
+	RestoreTrigger string
+	// WaitForCheckpoint indicates whether to wait for a checkpoint to appear (Env: WAIT_FOR_CHECKPOINT, true only for "1")
+	WaitForCheckpoint bool
+	// WaitTimeout is the maximum time to wait for a checkpoint to become available (Env: RESTORE_WAIT_TIMEOUT, default: 300s)
+	WaitTimeout time.Duration
+	// CRIULogLevel is the CRIU verbosity level (0-4, default: 4; Env: CRIU_LOG_LEVEL)
+	CRIULogLevel int32
+	// DefaultCmd is the command to run if no checkpoint is available (Env: DEFAULT_CMD)
+	DefaultCmd string
+	// Debug enables debug logging (Env: DEBUG, true only for "1")
+	Debug bool
+	// EmbeddedCheckpointPath is the path to an embedded checkpoint within the image (Env: EMBEDDED_CHECKPOINT_PATH)
+	// When unset it defaults to DefaultEmbeddedCheckpointPath, so ShouldRestore always probes it first
+	EmbeddedCheckpointPath string
+	// SkipInFlightConnections skips in-flight TCP connections during restore (Env: CRIU_SKIP_IN_FLIGHT, true only for "1")
+	SkipInFlightConnections bool
+	// AutoDedup enables auto-deduplication of memory pages (Env: CRIU_AUTO_DEDUP, true only for "1")
+	AutoDedup bool
+	// LazyPages enables lazy page migration (experimental) (Env: CRIU_LAZY_PAGES, true only for "1")
+	LazyPages bool
+	// CRIUWorkDir is an alternative work directory for CRIU (instead of /tmp); empty means unset (Env: CRIU_WORK_DIR)
+	// Useful when /tmp has mount issues
+	CRIUWorkDir string
+	// CUDAPluginDir is the path to CRIU CUDA plugin directory (e.g., /usr/local/lib/criu) (Env: CUDA_PLUGIN_DIR)
+	// When set, a CRIU config file is created with libdir for CUDA plugin discovery during restore.
+	CUDAPluginDir string
+	// CRIUTimeout is the CRIU timeout in seconds (required for CUDA restores) (Env: CRIU_TIMEOUT, default: 0)
+	CRIUTimeout uint32
+	// RestoreMarkerFile is the path to a marker file created before CRIU restore (Env: DYN_RESTORE_MARKER_FILE, default: /tmp/dynamo-restored).
+	// The restored process can check for this file to detect it was restored.
+	RestoreMarkerFile string
+}
+// DefaultEmbeddedCheckpointPath is where an embedded (baked into the image)
+// checkpoint is looked for when EMBEDDED_CHECKPOINT_PATH is not set.
+const DefaultEmbeddedCheckpointPath = "/embedded-checkpoint"
+// ConfigFromEnv builds a Config from the process environment.
+//
+// String settings fall back to their defaults when the variable is unset or
+// empty. Boolean settings are true only when the variable is exactly "1".
+// Numeric settings fall back to their defaults when they cannot be parsed.
+// A negative CRIU_TIMEOUT is treated as unset (0) instead of being converted
+// to uint32, where it would wrap around to a multi-billion-second timeout.
+func ConfigFromEnv() *Config {
+	criuTimeout := parseIntOrDefault("CRIU_TIMEOUT", 0)
+	if criuTimeout < 0 {
+		criuTimeout = 0
+	}
+	cfg := &Config{
+		CheckpointPath:          getEnvOrDefault("DYN_CHECKPOINT_PATH", "/checkpoints"),
+		CheckpointHash:          os.Getenv("DYN_CHECKPOINT_HASH"),
+		RestoreTrigger:          getEnvOrDefault("RESTORE_TRIGGER", "/tmp/restore-trigger"),
+		WaitForCheckpoint:       os.Getenv("WAIT_FOR_CHECKPOINT") == "1",
+		WaitTimeout:             parseDurationOrDefault("RESTORE_WAIT_TIMEOUT", 300*time.Second),
+		CRIULogLevel:            parseIntOrDefault("CRIU_LOG_LEVEL", 4),
+		DefaultCmd:              os.Getenv("DEFAULT_CMD"),
+		Debug:                   os.Getenv("DEBUG") == "1",
+		EmbeddedCheckpointPath:  getEnvOrDefault("EMBEDDED_CHECKPOINT_PATH", DefaultEmbeddedCheckpointPath),
+		SkipInFlightConnections: os.Getenv("CRIU_SKIP_IN_FLIGHT") == "1",
+		AutoDedup:               os.Getenv("CRIU_AUTO_DEDUP") == "1",
+		LazyPages:               os.Getenv("CRIU_LAZY_PAGES") == "1",
+		CRIUWorkDir:             getEnvOrDefault("CRIU_WORK_DIR", ""),
+		CUDAPluginDir:           os.Getenv("CUDA_PLUGIN_DIR"), // For CUDA plugin discovery during restore
+		CRIUTimeout:             uint32(criuTimeout),
+		RestoreMarkerFile:       getEnvOrDefault("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored"),
+	}
+	return cfg
+}
+// RestoreOptions holds the options for a CRIU restore operation.
+// Most CRIU options are hardcoded with safe K8s defaults; DefaultRestoreOptions supplies the defaults for these fields.
+type RestoreOptions struct {
+	// CheckpointPath is the path to the checkpoint directory
+	CheckpointPath string
+	// RootPath is the root filesystem path for restore (default "/")
+	RootPath string
+	// PidFile is the path read as a fallback source of the restored process PID (default /tmp/restored.pid)
+	PidFile string
+	// LogFile is the name of the CRIU restore log file (default restore.log)
+	LogFile string
+	// LogLevel is the CRIU logging verbosity (0-4, default 4)
+	LogLevel int32
+	// ExtMountMaps contains external mount mappings for CRIU; nil means "not generated yet"
+	ExtMountMaps []*criurpc.ExtMountMap
+	// WorkDir is an alternative work directory for CRIU (instead of /tmp); empty means unset
+	WorkDir string
+	// LibDir is the path to CRIU plugin directory (e.g., /usr/local/lib/criu)
+	// When set, a CRIU config file is created with libdir for CUDA plugin discovery.
+	LibDir string
+	// Timeout is the CRIU timeout in seconds (required for CUDA restores, i.e. whenever LibDir is set)
+	Timeout uint32
+}
+// DefaultRestoreOptions returns RestoreOptions for checkpointPath with the
+// package defaults: root "/", PID file /tmp/restored.pid, log file
+// restore.log and CRIU log level 4. All remaining fields are left at their
+// zero values.
+func DefaultRestoreOptions(checkpointPath string) *RestoreOptions {
+	opts := new(RestoreOptions)
+	opts.CheckpointPath = checkpointPath
+	opts.RootPath = "/"
+	opts.PidFile = "/tmp/restored.pid"
+	opts.LogFile = "restore.log"
+	opts.LogLevel = 4
+	return opts
+}
+// LoadRestoreOptions returns the default RestoreOptions for checkpointPath
+// with LogLevel overridden by logLevel and, when possible, ExtMountMaps
+// pre-generated from the checkpoint metadata (OCI-derived masked/readonly
+// paths).
+//
+// Metadata problems are deliberately not fatal: if the metadata cannot be
+// loaded or the mount maps cannot be generated, ExtMountMaps stays nil and the
+// options are still returned. The returned error is currently always nil.
+func LoadRestoreOptions(checkpointPath string, logLevel int32) (*RestoreOptions, error) {
+	opts := DefaultRestoreOptions(checkpointPath)
+	opts.LogLevel = logLevel
+	meta, metaErr := common.LoadMetadata(checkpointPath)
+	if metaErr == nil {
+		// Only a successful generation replaces the nil default.
+		if extMounts, genErr := GenerateExtMountMaps(meta); genErr == nil {
+			opts.ExtMountMaps = extMounts
+		}
+	}
+	return opts, nil
+}
+// ShouldRestore checks if a restore should be performed.
+// Returns the checkpoint path and true if restore should proceed, or "" and
+// false otherwise. Sources are probed in priority order:
+//
+//  0. an embedded checkpoint (metadata file present under EmbeddedCheckpointPath),
+//  1. CheckpointPath/CheckpointHash with a checkpoint.done marker,
+//  2. the path stored in the RestoreTrigger file, with a checkpoint.done marker.
+//
+// IMPORTANT: We check for checkpoint.done marker (not just metadata.json or inventory.img) because
+// checkpoint.done is written LAST in the checkpoint process, after rootfs-diff.tar completes.
+// Order: metadata.json -> CRIU dump (*.img files) -> rootfs-diff.tar -> checkpoint.done
+//
+// The trigger file contents are trimmed of surrounding whitespace, so a file
+// written with a trailing newline (e.g. by `echo`) still resolves correctly.
+func ShouldRestore(cfg *Config, log *logrus.Entry) (string, bool) {
+	// Method 0: Embedded checkpoint in image (highest priority)
+	// This is for self-contained checkpoint images where data is baked in
+	if cfg.EmbeddedCheckpointPath != "" {
+		metadataPath := cfg.EmbeddedCheckpointPath + "/" + common.MetadataFilename
+		if _, err := os.Stat(metadataPath); err == nil {
+			log.WithField("path", cfg.EmbeddedCheckpointPath).Info("Embedded checkpoint found in image")
+			return cfg.EmbeddedCheckpointPath, true
+		}
+	}
+	// Method 1: DYN_CHECKPOINT_HASH is set and checkpoint is fully complete
+	if cfg.CheckpointHash != "" {
+		checkpointPath := cfg.CheckpointPath + "/" + cfg.CheckpointHash
+		// Check for checkpoint.done marker (written LAST after rootfs-diff.tar completes)
+		donePath := checkpointPath + "/checkpoint.done"
+		if _, err := os.Stat(donePath); err == nil {
+			log.WithField("path", checkpointPath).Info("Checkpoint found (checkpoint.done marker present)")
+			return checkpointPath, true
+		}
+		// Fallback: check for metadata.json but warn about potential race condition
+		metadataPath := checkpointPath + "/" + common.MetadataFilename
+		if _, err := os.Stat(metadataPath); err == nil {
+			log.WithFields(logrus.Fields{
+				"path":    checkpointPath,
+				"warning": "checkpoint.done marker not found, checkpoint may be incomplete",
+			}).Warn("Checkpoint metadata found but checkpoint.done missing - checkpoint may still be in progress")
+			// Don't return true here - wait for checkpoint.done
+		}
+	}
+	// Method 2: Restore trigger file exists with checkpoint path
+	if cfg.RestoreTrigger != "" {
+		data, err := os.ReadFile(cfg.RestoreTrigger)
+		if err == nil {
+			// Untrimmed, a trailing newline would become part of the directory
+			// name and the checkpoint.done lookup could never succeed.
+			checkpointPath := strings.TrimSpace(string(data))
+			if checkpointPath != "" {
+				donePath := checkpointPath + "/checkpoint.done"
+				if _, err := os.Stat(donePath); err == nil {
+					log.WithField("path", checkpointPath).Info("Restore triggered via file (checkpoint.done marker present)")
+					return checkpointPath, true
+				}
+			}
+		}
+	}
+	return "", false
+}
+// WaitForCheckpoint polls ShouldRestore once per second until a checkpoint is
+// available, ctx is cancelled, or cfg.WaitTimeout has elapsed.
+//
+// It returns the checkpoint path on success, ctx.Err() on cancellation and
+// context.DeadlineExceeded on timeout. The first poll happens after the first
+// tick, and a progress line is logged roughly every 30 seconds.
+func WaitForCheckpoint(ctx context.Context, cfg *Config, log *logrus.Entry) (string, error) {
+	log.WithField("timeout", cfg.WaitTimeout).Info("Waiting for checkpoint")
+	started := time.Now()
+	deadline := started.Add(cfg.WaitTimeout)
+	poll := time.NewTicker(time.Second)
+	defer poll.Stop()
+	lastProgress := time.Now()
+	for {
+		select {
+		case <-ctx.Done():
+			return "", ctx.Err()
+		case <-poll.C:
+		}
+		if path, ready := ShouldRestore(cfg, log); ready {
+			return path, nil
+		}
+		// Log progress every 30 seconds
+		if time.Since(lastProgress) >= 30*time.Second {
+			log.WithField("elapsed", time.Since(started)).Info("Still waiting for checkpoint...")
+			lastProgress = time.Now()
+		}
+		if time.Now().After(deadline) {
+			return "", context.DeadlineExceeded
+		}
+	}
+}
+// Helper functions
+// getEnvOrDefault returns the value of the environment variable key, or
+// defaultValue when the variable is unset or set to the empty string.
+func getEnvOrDefault(key, defaultValue string) string {
+	value, present := os.LookupEnv(key)
+	if !present || value == "" {
+		return defaultValue
+	}
+	return value
+}
+// parseDurationOrDefault reads a duration from the environment variable key.
+//
+// Two formats are accepted: a plain integer, interpreted as seconds (the
+// historical format), and a Go duration string such as "90s" or "5m" as
+// understood by time.ParseDuration. defaultValue is returned when the
+// variable is unset, empty, or matches neither format.
+func parseDurationOrDefault(key string, defaultValue time.Duration) time.Duration {
+	value := os.Getenv(key)
+	if value == "" {
+		return defaultValue
+	}
+	// Integer seconds take precedence so existing configurations keep their meaning.
+	if seconds, err := strconv.Atoi(value); err == nil {
+		return time.Duration(seconds) * time.Second
+	}
+	if parsed, err := time.ParseDuration(value); err == nil {
+		return parsed
+	}
+	return defaultValue
+}
+// parseIntOrDefault reads a base-10 int32 from the environment variable key.
+// defaultValue is returned when the variable is unset, empty, not a number,
+// or outside the int32 range (out-of-range values are rejected instead of
+// being silently truncated).
+func parseIntOrDefault(key string, defaultValue int32) int32 {
+	value := os.Getenv(key)
+	if value == "" {
+		return defaultValue
+	}
+	// bitSize 32 makes ParseInt report a range error for anything that does
+	// not fit in an int32.
+	parsed, err := strconv.ParseInt(value, 10, 32)
+	if err != nil {
+		return defaultValue
+	}
+	return int32(parsed)
+}
--- a/deploy/chrek/pkg/restore/process.go
+++ b/deploy/chrek/pkg/restore/process.go
+package restore
+import (
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+	"os/signal"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+	"github.com/sirupsen/logrus"
+)
+// MonitorProcess monitors the restored process and returns its exit code.
+// It blocks until the process exits. Does not forward stdout/stderr.
+// For output forwarding, use ForwardProcessOutput instead.
+func MonitorProcess(pid int, log *logrus.Entry) int {
+	log.WithField("pid", pid).Info("Monitoring restored process")
+	for {
+		// Check if process still exists by sending signal 0
+		proc, err := os.FindProcess(pid)
+		if err != nil {
+			log.WithError(err).Error("Failed to find process")
+			return 1
+		}
+		err = proc.Signal(syscall.Signal(0))
+		if err != nil {
+			// Process has exited
+			log.WithField("pid", pid).Info("Restored process exited")
+			// Try to read exit status from /proc/<pid>/stat
+			// If process is gone, assume exit code 0
+			exitCode := getExitCode(pid)
+			log.WithField("exit_code", exitCode).Info("Restored process exit status")
+			return exitCode
+		}
+		time.Sleep(time.Second)
+	}
+}
+// ForwardProcessOutput mirrors the restored process's stdout and stderr onto
+// our own by reading /proc/<pid>/fd/1 and /proc/<pid>/fd/2 in two background
+// goroutines, so that the restored process's logs appear in kubectl logs.
+// It blocks until the process exits, then stops the goroutines, waits 100ms
+// for remaining output to flush, and returns the process's exit code.
+func ForwardProcessOutput(pid int, log *logrus.Entry) int {
+	log.WithField("pid", pid).Info("Forwarding output from restored process")
+	// Closed once the process has exited to stop both copy goroutines.
+	done := make(chan struct{})
+	streams := []struct {
+		fd   int
+		dst  *os.File
+		name string
+	}{
+		{1, os.Stdout, "stdout"},
+		{2, os.Stderr, "stderr"},
+	}
+	for _, s := range streams {
+		go forwardFD(fmt.Sprintf("/proc/%d/fd/%d", pid, s.fd), s.dst, s.name, log, done)
+	}
+	exitCode := waitForProcess(pid, log)
+	close(done)
+	// Give goroutines a moment to flush any remaining output
+	time.Sleep(100 * time.Millisecond)
+	return exitCode
+}
+// forwardFD copies everything readable from fdPath to dst until done is
+// closed, the source reports EOF, or a non-timeout read error occurs.
+//
+// Failures to open or stat the path are logged at debug level and end the
+// forwarding silently. name only labels log messages. Write errors on dst and
+// errors from setting the read deadline are ignored.
+func forwardFD(fdPath string, dst io.Writer, name string, log *logrus.Entry, done <-chan struct{}) {
+	src, openErr := os.Open(fdPath)
+	if openErr != nil {
+		log.WithError(openErr).WithField("path", fdPath).Debug("Could not open process FD for forwarding")
+		return
+	}
+	defer src.Close()
+	info, statErr := src.Stat()
+	if statErr != nil {
+		log.WithError(statErr).WithField("path", fdPath).Debug("Could not stat process FD")
+		return
+	}
+	log.WithFields(logrus.Fields{
+		"name": name,
+		"mode": info.Mode().String(),
+		"path": fdPath,
+	}).Debug("Forwarding process output")
+	chunk := make([]byte, 4096)
+	for {
+		// Non-blocking stop check before every read.
+		select {
+		case <-done:
+			return
+		default:
+		}
+		// A short deadline keeps the loop returning to the stop check.
+		src.SetReadDeadline(time.Now().Add(100 * time.Millisecond))
+		n, readErr := src.Read(chunk)
+		if n > 0 {
+			dst.Write(chunk[:n])
+		}
+		if readErr == nil || os.IsTimeout(readErr) {
+			continue
+		}
+		if readErr != io.EOF {
+			log.WithError(readErr).WithField("name", name).Debug("Error reading from process FD")
+		}
+		return
+	}
+}
+// waitForProcess waits for a process to exit and returns its exit code.
+func waitForProcess(pid int, log *logrus.Entry) int {
+	for {
+		// Check if process still exists by sending signal 0
+		proc, err := os.FindProcess(pid)
+		if err != nil {
+			log.WithError(err).Error("Failed to find process")
+			return 1
+		}
+		err = proc.Signal(syscall.Signal(0))
+		if err != nil {
+			// Process has exited
+			log.WithField("pid", pid).Info("Restored process exited")
+			// Try to get exit status
+			exitCode := getExitCode(pid)
+			log.WithField("exit_code", exitCode).Info("Restored process exit status")
+			return exitCode
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+// getExitCode makes one non-blocking attempt to collect the exit status of pid.
+//
+// It returns the exit status when the process is a child of ours that exited
+// normally, 128+signal when it was terminated by a signal, and 0 in every
+// other case (not our child, still running, or the wait failed). A result of
+// 0 therefore does not prove a clean exit.
+func getExitCode(pid int) int {
+	if _, err := os.FindProcess(pid); err != nil {
+		return 0
+	}
+	// WNOHANG: never block; this only succeeds for our own children.
+	var status syscall.WaitStatus
+	reaped, err := syscall.Wait4(pid, &status, syscall.WNOHANG, nil)
+	if err != nil || reaped != pid {
+		return 0
+	}
+	switch {
+	case status.Exited():
+		return status.ExitStatus()
+	case status.Signaled():
+		return 128 + int(status.Signal())
+	}
+	return 0
+}
+// SetupSignalForwarding relays SIGTERM, SIGINT and SIGQUIT received by this
+// process to the restored process pid.
+//
+// Every received signal is forwarded until the returned cleanup function is
+// called (previously only the first signal was forwarded and later ones were
+// dropped). The cleanup function unregisters the handler and stops the
+// forwarding goroutine; it must be called exactly once.
+func SetupSignalForwarding(pid int, log *logrus.Entry) func() {
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGTERM, syscall.SIGINT, syscall.SIGQUIT)
+	done := make(chan struct{})
+	go func() {
+		for {
+			select {
+			case sig := <-sigChan:
+				log.WithFields(logrus.Fields{
+					"signal": sig,
+					"pid":    pid,
+				}).Info("Forwarding signal to restored process")
+				proc, err := os.FindProcess(pid)
+				if err != nil {
+					continue
+				}
+				// Delivery is best-effort; a failure (e.g. the process is
+				// already gone) is only worth a debug line.
+				if err := proc.Signal(sig); err != nil {
+					log.WithError(err).WithField("pid", pid).Debug("Could not forward signal to restored process")
+				}
+			case <-done:
+				return
+			}
+		}
+	}()
+	return func() {
+		signal.Stop(sigChan)
+		close(done)
+	}
+}
+// WaitForPidFile polls pidFile every 100ms until it contains a positive
+// integer (surrounding whitespace is ignored) and returns that PID.
+// Unreadable or unparsable contents are retried. When timeout elapses first,
+// it returns 0 and an error. The log parameter is currently unused.
+func WaitForPidFile(pidFile string, timeout time.Duration, log *logrus.Entry) (int, error) {
+	for deadline := time.Now().Add(timeout); time.Now().Before(deadline); time.Sleep(100 * time.Millisecond) {
+		raw, readErr := os.ReadFile(pidFile)
+		if readErr != nil {
+			continue
+		}
+		pid, convErr := strconv.Atoi(strings.TrimSpace(string(raw)))
+		if convErr != nil || pid <= 0 {
+			continue
+		}
+		return pid, nil
+	}
+	return 0, fmt.Errorf("timeout waiting for PID file %s after %v", pidFile, timeout)
+}
+// RunDefault runs the fallback command when no checkpoint is available.
+// The first applicable choice is handed to execCommand:
+//  1. cfg.DefaultCmd, when set,
+//  2. /docker-entrypoint.sh with nginx arguments, when that script exists,
+//  3. nginx in the foreground, when nginx is found on PATH,
+//  4. "sleep infinity" otherwise.
+//
+// It returns whatever execCommand returns.
+func RunDefault(cfg *Config, log *logrus.Entry) error {
+	fileExists := func(path string) bool {
+		_, err := os.Stat(path)
+		return err == nil
+	}
+	onPath := func(name string) bool {
+		_, err := exec.LookPath(name)
+		return err == nil
+	}
+	switch {
+	case cfg.DefaultCmd != "":
+		log.WithField("cmd", cfg.DefaultCmd).Info("Running default command")
+		return execCommand(cfg.DefaultCmd)
+	case fileExists("/docker-entrypoint.sh"):
+		log.Info("Running docker-entrypoint.sh")
+		return execCommand("/docker-entrypoint.sh nginx -g 'daemon off;'")
+	case onPath("nginx"):
+		log.Info("Running nginx")
+		return execCommand("nginx -g 'daemon off;'")
+	default:
+		log.Warn("No default command specified and no known entrypoint found, sleeping")
+		return execCommand("sleep infinity")
+	}
+}
+// execCommand replaces the current process with the command in cmdLine.
+//
+// A plain command line is split on whitespace and executed directly when its
+// first word resolves on PATH. The command line is run through `/bin/sh -c`
+// instead when the first word cannot be resolved, or when it contains quoting
+// characters (single quote, double quote, backslash). Whitespace splitting
+// cannot honour quoting, so an argument such as 'daemon off;' would otherwise
+// be torn into two words with the quote characters left in.
+//
+// It returns an error for an empty command line or when exec fails; on
+// success it does not return.
+func execCommand(cmdLine string) error {
+	parts := strings.Fields(cmdLine)
+	if len(parts) == 0 {
+		return fmt.Errorf("empty command")
+	}
+	args := parts
+	path, err := exec.LookPath(parts[0])
+	if err != nil || strings.ContainsAny(cmdLine, `'"\`) {
+		// Let the shell parse anything the simple split cannot handle.
+		path = "/bin/sh"
+		args = []string{"sh", "-c", cmdLine}
+	}
+	// Replace current process with the command
+	return syscall.Exec(path, args, os.Environ())
+}
--- a/deploy/chrek/pkg/restore/restore.go
+++ b/deploy/chrek/pkg/restore/restore.go
+package restore
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+	criu "github.com/checkpoint-restore/go-criu/v7"
+	"github.com/sirupsen/logrus"
+	"google.golang.org/protobuf/proto"
+)
+// Restore performs the CRIU restore operation using go-criu.
+//
+// It opens the checkpoint image directory and the /proc/1/ns/net network
+// namespace, optionally opens opts.WorkDir, builds the CRIU options with
+// BuildRestoreCRIUOpts and, when opts.LibDir is set, writes a
+// restore-criu.conf file into the checkpoint directory before invoking CRIU.
+// If opts.ExtMountMaps is nil it is populated in place from
+// GenerateExtMountMaps.
+//
+// Returns the PID of the restored process, taken from the restore
+// notification or, failing that, from opts.PidFile. A nil opts is reported as
+// an error rather than causing a nil-pointer panic.
+func Restore(ctx context.Context, opts *RestoreOptions, log *logrus.Entry) (int, error) {
+	if opts == nil {
+		return 0, fmt.Errorf("restore options must not be nil")
+	}
+	log.WithField("checkpoint", opts.CheckpointPath).Info("Starting CRIU restore")
+	// 1. Open checkpoint directory
+	imageDir, imageDirFD, err := OpenImageDir(opts.CheckpointPath)
+	if err != nil {
+		return 0, fmt.Errorf("failed to open checkpoint directory %q: %w", opts.CheckpointPath, err)
+	}
+	defer imageDir.Close()
+	log.WithField("fd", imageDirFD).Debug("Opened checkpoint directory")
+	// 2. Generate external mount mappings if not already set
+	if opts.ExtMountMaps == nil {
+		extMounts, err := GenerateExtMountMaps(nil)
+		if err != nil {
+			return 0, fmt.Errorf("failed to generate mount maps: %w", err)
+		}
+		opts.ExtMountMaps = extMounts
+	}
+	log.WithField("mount_count", len(opts.ExtMountMaps)).Debug("External mount maps ready")
+	// 3. Open target network namespace
+	netNsFile, netNsFD, err := OpenNetworkNamespace("/proc/1/ns/net")
+	if err != nil {
+		return 0, fmt.Errorf("failed to open target network namespace: %w", err)
+	}
+	defer netNsFile.Close()
+	log.WithField("fd", netNsFD).Debug("Opened target network namespace")
+	// 4. Open work directory if specified; -1 tells the option builder that
+	// no work directory FD is available.
+	var workDirFile *os.File
+	var workDirFD int32 = -1
+	if opts.WorkDir != "" {
+		workDirFile, workDirFD = OpenWorkDir(opts.WorkDir, log)
+		if workDirFile != nil {
+			defer workDirFile.Close()
+		}
+	}
+	// 5. Build CRIU options
+	cfg := CRIURestoreConfig{
+		ImageDirFD:   imageDirFD,
+		RootPath:     opts.RootPath,
+		LogLevel:     opts.LogLevel,
+		LogFile:      opts.LogFile,
+		WorkDirFD:    workDirFD,
+		NetNsFD:      netNsFD,
+		ExtMountMaps: opts.ExtMountMaps,
+	}
+	criuOpts := BuildRestoreCRIUOpts(cfg)
+	// 6. Create CRIU config file for CUDA plugin if libdir is specified
+	if opts.LibDir != "" {
+		if opts.Timeout == 0 {
+			return 0, fmt.Errorf("CRIU_TIMEOUT environment variable must be set for CUDA restores")
+		}
+		configPath := filepath.Join(opts.CheckpointPath, "restore-criu.conf")
+		configContent := fmt.Sprintf(`enable-external-masters
+libdir %s
+tcp-close
+link-remap
+timeout %d
+allow-uprobes
+skip-in-flight
+`, opts.LibDir, opts.Timeout)
+		// Best effort: a failed write is only logged and the restore proceeds
+		// without the config file.
+		if err := os.WriteFile(configPath, []byte(configContent), 0644); err != nil {
+			log.WithError(err).Warn("Failed to write CRIU config file for restore")
+		} else {
+			criuOpts.ConfigFile = proto.String(configPath)
+			log.WithFields(logrus.Fields{
+				"config_path": configPath,
+				"lib_dir":     opts.LibDir,
+			}).Info("Created CRIU config file with libdir for CUDA plugin")
+		}
+	}
+	// 7. Execute CRIU restore
+	c := criu.MakeCriu()
+	notify := NewRestoreNotify(log)
+	log.Info("Executing CRIU restore")
+	criuExecStart := time.Now()
+	if err := c.Restore(criuOpts, notify); err != nil {
+		log.WithField("duration", time.Since(criuExecStart)).Error("CRIU c.Restore failed")
+		logCRIUErrors(opts.CheckpointPath, opts.LogFile, log)
+		return 0, fmt.Errorf("CRIU restore failed: %w", err)
+	}
+	log.WithFields(logrus.Fields{
+		"pid":      notify.RestoredPID,
+		"duration": time.Since(criuExecStart),
+	}).Info("CRIU c.Restore completed successfully")
+	// 8. Get restored PID
+	if notify.RestoredPID > 0 {
+		return int(notify.RestoredPID), nil
+	}
+	// Fallback: try to read from PID file
+	if opts.PidFile != "" {
+		pid, err := WaitForPidFile(opts.PidFile, 10*time.Second, log)
+		if err != nil {
+			return 0, fmt.Errorf("failed to get restored PID: %w", err)
+		}
+		return pid, nil
+	}
+	return 0, fmt.Errorf("could not determine restored process PID")
+}
+// logCRIUErrors reads the CRIU log file at checkpointPath/logFile and re-emits
+// every non-empty line at error level, framed by start/end banners.
+//
+// When the CRIU_LOG_DIR environment variable is set, the raw log is also
+// copied to <CRIU_LOG_DIR>/restore-<unix-seconds>.log. Any failure along the
+// way is logged as a warning; the function never returns an error.
+func logCRIUErrors(checkpointPath, logFile string, log *logrus.Entry) {
+	logPath := filepath.Join(checkpointPath, logFile)
+	data, err := os.ReadFile(logPath)
+	if err != nil {
+		log.WithError(err).Warn("Could not read CRIU log file")
+		return
+	}
+	log.Error("=== CRIU RESTORE LOG START ===")
+	for _, line := range strings.Split(string(data), "\n") {
+		if line != "" {
+			log.Error(line)
+		}
+	}
+	log.Error("=== CRIU RESTORE LOG END ===")
+	// Copy log to shared directory if CRIU_LOG_DIR is set
+	logDir := os.Getenv("CRIU_LOG_DIR")
+	if logDir == "" {
+		return
+	}
+	// Report copy failures instead of dropping them silently, so a missing
+	// log in the shared directory can be diagnosed.
+	if err := os.MkdirAll(logDir, 0755); err != nil {
+		log.WithError(err).WithField("path", logDir).Warn("Could not create CRIU log directory")
+		return
+	}
+	destPath := filepath.Join(logDir, fmt.Sprintf("restore-%d.log", time.Now().Unix()))
+	if err := os.WriteFile(destPath, data, 0644); err != nil {
+		log.WithError(err).WithField("path", destPath).Warn("Could not copy CRIU log to shared directory")
+		return
+	}
+	log.WithField("path", destPath).Info("CRIU log copied to shared directory")
+}
+// Run is the main entry point for the restore entrypoint.
+//
+// It checks that CRIU is available, decides whether a checkpoint should be
+// restored (optionally waiting for one), applies the rootfs diff and deleted
+// files, loads the restore options, writes the restore marker file and calls
+// Restore. Whenever a restore is not possible or fails, it falls back to
+// RunDefault and returns its result.
+//
+// On a successful restore Run does not return: it forwards signals and output
+// for the restored process and terminates the current process with that
+// process's exit code.
+func Run(ctx context.Context, cfg *Config, log *logrus.Entry) error {
+	log.Info("=== Self-Restoring Placeholder Entrypoint ===")
+	log.WithFields(logrus.Fields{
+		"checkpoint_path":          cfg.CheckpointPath,
+		"checkpoint_hash":          cfg.CheckpointHash,
+		"embedded_checkpoint_path": cfg.EmbeddedCheckpointPath,
+		"wait_for_checkpoint":      cfg.WaitForCheckpoint,
+		"restore_marker_file":      cfg.RestoreMarkerFile,
+	}).Info("Configuration")
+	// Check CRIU availability
+	c := criu.MakeCriu()
+	version, err := c.GetCriuVersion()
+	if err != nil {
+		log.WithError(err).Error("CRIU is not available")
+		log.Info("Falling back to default command")
+		return RunDefault(cfg, log)
+	}
+	log.WithField("version", version).Info("CRIU version")
+	// Determine checkpoint path
+	var checkpointPath string
+	var shouldRestore bool
+	// Check if we should restore immediately
+	checkpointPath, shouldRestore = ShouldRestore(cfg, log)
+	// If not and we're configured to wait, wait for checkpoint
+	if !shouldRestore && cfg.WaitForCheckpoint {
+		log.Info("Waiting for checkpoint...")
+		var err error
+		checkpointPath, err = WaitForCheckpoint(ctx, cfg, log)
+		if err != nil {
+			log.WithError(err).Info("No checkpoint received, running default command")
+			return RunDefault(cfg, log)
+		}
+		shouldRestore = true
+	}
+	// If no checkpoint, run default command
+	if !shouldRestore {
+		log.Info("No checkpoint configured, running default command")
+		return RunDefault(cfg, log)
+	}
+	// Perform restore
+	log.WithField("checkpoint", checkpointPath).Info("Checkpoint available, starting restore")
+	restoreStart := time.Now()
+	// Apply filesystem changes; failures are logged and the restore continues.
+	rootfsDiffStart := time.Now()
+	if err := ApplyRootfsDiff(checkpointPath, "/", log); err != nil {
+		log.WithError(err).Error("Failed to apply rootfs diff")
+	}
+	log.WithField("duration", time.Since(rootfsDiffStart)).Info("ApplyRootfsDiff completed")
+	deletedFilesStart := time.Now()
+	if err := ApplyDeletedFiles(checkpointPath, "/", log); err != nil {
+		log.WithError(err).Error("Failed to apply deleted files")
+	}
+	log.WithField("duration", time.Since(deletedFilesStart)).Info("ApplyDeletedFiles completed")
+	// Load restore options from metadata
+	loadOptsStart := time.Now()
+	opts, err := LoadRestoreOptions(checkpointPath, cfg.CRIULogLevel)
+	if err != nil {
+		log.WithError(err).Warn("Could not load restore options from metadata, using defaults")
+	}
+	log.WithField("duration", time.Since(loadOptsStart)).Info("LoadRestoreOptions completed")
+	// Without options there is nothing to hand to CRIU; dereferencing a nil
+	// opts below would panic, so fall back to the default command instead.
+	if opts == nil {
+		log.Error("No restore options available, falling back to default command")
+		return RunDefault(cfg, log)
+	}
+	// Apply additional config options
+	if cfg.CRIUWorkDir != "" {
+		opts.WorkDir = cfg.CRIUWorkDir
+	}
+	// Set CUDA plugin directory and timeout for restore config file
+	if cfg.CUDAPluginDir != "" {
+		if cfg.CRIUTimeout == 0 {
+			return fmt.Errorf("CRIU_TIMEOUT environment variable must be set for CUDA restores")
+		}
+		opts.LibDir = cfg.CUDAPluginDir
+		opts.Timeout = cfg.CRIUTimeout
+		log.WithFields(logrus.Fields{
+			"lib_dir": cfg.CUDAPluginDir,
+			"timeout": cfg.CRIUTimeout,
+		}).Info("CUDA plugin directory and timeout configured for restore")
+	}
+	// Write restore marker file before CRIU restore
+	// This allows the restored process to detect it's been restored
+	markerWritten := false
+	if cfg.RestoreMarkerFile != "" {
+		if err := os.WriteFile(cfg.RestoreMarkerFile, []byte("restored"), 0644); err != nil {
+			log.WithError(err).Warn("Failed to write restore marker file")
+		} else {
+			markerWritten = true
+			log.WithField("path", cfg.RestoreMarkerFile).Info("Wrote restore marker file")
+		}
+	}
+	// Perform CRIU restore (CUDA plugin handles CUDA state automatically)
+	criuRestoreStart := time.Now()
+	pid, err := Restore(ctx, opts, log)
+	if err != nil {
+		log.WithField("duration", time.Since(criuRestoreStart)).WithError(err).Error("Restore failed, falling back to default command")
+		// The default command is a cold start, so it must not find a marker
+		// claiming that the process was restored.
+		if markerWritten {
+			if rmErr := os.Remove(cfg.RestoreMarkerFile); rmErr != nil {
+				log.WithError(rmErr).Warn("Failed to remove restore marker file after failed restore")
+			}
+		}
+		if cfg.Debug {
+			log.Info("DEBUG mode: sleeping 300s to allow log collection...")
+			time.Sleep(300 * time.Second)
+		}
+		return RunDefault(cfg, log)
+	}
+	criuRestoreDuration := time.Since(criuRestoreStart)
+	log.WithField("duration", criuRestoreDuration).Info("CRIU Restore completed (CUDA state restored by plugin)")
+	totalDuration := time.Since(restoreStart)
+	log.WithFields(logrus.Fields{
+		"total_duration":        totalDuration,
+		"criu_restore_duration": criuRestoreDuration,
+	}).Info("=== Restore operation completed ===")
+	// Set up signal forwarding and forward stdout/stderr from restored process
+	cleanup := SetupSignalForwarding(pid, log)
+	// Use ForwardProcessOutput to ensure restored process logs appear in kubectl logs
+	exitCode := ForwardProcessOutput(pid, log)
+	// os.Exit terminates immediately without running deferred functions, so
+	// the signal-forwarding cleanup has to be invoked explicitly first.
+	cleanup()
+	os.Exit(exitCode)
+	return nil
+}
--- a/deploy/chrek/pkg/watcher/watcher.go
+++ b/deploy/chrek/pkg/watcher/watcher.go
+// Package watcher provides Kubernetes pod watching for automatic checkpointing.
+package watcher
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"os"
+	"path/filepath"
+	"sync"
+	"time"
+	"github.com/sirupsen/logrus"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/client-go/informers"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/cache"
+	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
+	checkpointk8s "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint/k8s"
+)
+// Label and environment variable names the watcher reads from pods.
+const (
+	// LabelCheckpointSource marks a pod as a checkpoint source; pods carrying
+	// it with the value "true" are selected for automatic checkpointing.
+	LabelCheckpointSource = "nvidia.com/checkpoint-source"
+	// LabelCheckpointHash carries the checkpoint identity hash, which the
+	// watcher uses as the checkpoint ID.
+	LabelCheckpointHash = "nvidia.com/checkpoint-hash"
+	// EnvCheckpointSignalFile names the container env var holding the path of
+	// the signal file inside the pod.
+	EnvCheckpointSignalFile = "DYN_CHECKPOINT_SIGNAL_FILE"
+)
+// SignalFile is the JSON document written both as the per-pod signal file and
+// as the checkpoint.done marker. It records which checkpoint was attempted,
+// where it lives, when the record was written and whether it succeeded.
+type SignalFile struct {
+	// CheckpointID identifies the checkpoint the record refers to.
+	CheckpointID string `json:"checkpoint_id"`
+	// CheckpointPath is the checkpoint directory (may be empty on failure).
+	CheckpointPath string `json:"checkpoint_path"`
+	// Timestamp is the time the record was created.
+	Timestamp time.Time `json:"timestamp"`
+	// Success reports whether the checkpoint completed successfully.
+	Success bool `json:"success"`
+	// Error holds the failure message; omitted from the JSON when empty.
+	Error string `json:"error,omitempty"`
+}
+// Config carries the settings a Watcher is created with: which node it
+// serves, where checkpoints are stored, and the CRIU/CUDA options that are
+// copied into every checkpoint request.
+type Config struct {
+	// NodeName is the node whose pods this watcher handles.
+	NodeName string
+	// CheckpointDir is the base directory under which checkpoints are stored.
+	CheckpointDir string
+	// HostProc is the path prefix used to reach /proc/<pid>/root of a container.
+	HostProc string
+	// ListenAddr is the HTTP server address for health checks (e.g., ":8080").
+	// An empty value disables the health server.
+	ListenAddr string
+	// RestrictedNamespace optionally restricts watching to one namespace
+	// (empty = cluster-wide).
+	RestrictedNamespace string
+	// GPU/CUDA checkpoint options (passed to checkpoint.Options)
+	CUDAPluginDir  string   // Path to CRIU CUDA plugin directory
+	GhostLimit     uint32   // Ghost file size limit in bytes (default: 512MB for GPU)
+	Timeout        uint32   // CRIU timeout in seconds
+	ExternalMounts []string // Additional external mount mappings
+}
+// Watcher observes pods labelled as checkpoint sources on its node and
+// triggers a checkpoint once such a pod becomes Ready.
+type Watcher struct {
+	config          Config
+	clientset       kubernetes.Interface
+	discoveryClient *checkpointk8s.DiscoveryClient
+	checkpointer    *checkpoint.Checkpointer
+	log             *logrus.Entry
+	// checkpointed maps "namespace/name" to the pod's checkpoint status:
+	// "in_progress", "completed", or absent (not started or failed).
+	// Access is guarded by checkpointedMu.
+	checkpointed   map[string]string
+	checkpointedMu sync.RWMutex
+	// stopCh is closed to stop the informers.
+	stopCh chan struct{}
+}
+// NewWatcher builds a Watcher from cfg using the in-cluster Kubernetes
+// configuration. It returns an error when the in-cluster configuration cannot
+// be obtained or the Kubernetes client cannot be constructed.
+func NewWatcher(cfg Config, discoveryClient *checkpointk8s.DiscoveryClient, checkpointer *checkpoint.Checkpointer) (*Watcher, error) {
+	// The watcher only works from inside a cluster.
+	inClusterCfg, err := rest.InClusterConfig()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get in-cluster config: %w", err)
+	}
+	kubeClient, err := kubernetes.NewForConfig(inClusterCfg)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create kubernetes client: %w", err)
+	}
+	watcher := &Watcher{
+		config:          cfg,
+		clientset:       kubeClient,
+		discoveryClient: discoveryClient,
+		checkpointer:    checkpointer,
+		log:             logrus.WithField("component", "watcher"),
+		checkpointed:    make(map[string]string),
+		stopCh:          make(chan struct{}),
+	}
+	return watcher, nil
+}
+// Start begins watching for pods and starts the health check server.
+//
+// It creates a pod informer filtered by the checkpoint-source label (and by
+// namespace when RestrictedNamespace is set), waits for the informer cache to
+// sync, and then blocks until ctx is cancelled or Stop is called. The health
+// server, when configured, is shut down on return.
+//
+// Returns an error only when the informer cache fails to sync.
+func (w *Watcher) Start(ctx context.Context) error {
+	w.log.WithFields(logrus.Fields{
+		"node":            w.config.NodeName,
+		"label":           LabelCheckpointSource,
+		"signal_file_env": EnvCheckpointSignalFile,
+	}).Info("Starting pod watcher")
+	// Start health check HTTP server if address is configured
+	if w.config.ListenAddr != "" {
+		httpServer := w.startHealthServer(ctx)
+		defer func() {
+			shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+			defer cancel()
+			if err := httpServer.Shutdown(shutdownCtx); err != nil {
+				w.log.WithError(err).Warn("Health check server shutdown failed")
+			}
+		}()
+	}
+	// Create informer factory with label selector and optional namespace restriction
+	labelSelector := labels.SelectorFromSet(labels.Set{
+		LabelCheckpointSource: "true",
+	}).String()
+	factoryOptions := []informers.SharedInformerOption{
+		informers.WithTweakListOptions(func(opts *metav1.ListOptions) {
+			opts.LabelSelector = labelSelector
+		}),
+	}
+	// If namespace is specified, restrict watching to that namespace
+	if w.config.RestrictedNamespace != "" {
+		w.log.WithField("namespace", w.config.RestrictedNamespace).Info("Restricting pod watching to namespace")
+		factoryOptions = append(factoryOptions, informers.WithNamespace(w.config.RestrictedNamespace))
+	} else {
+		w.log.Info("Watching pods cluster-wide (all namespaces)")
+	}
+	factory := informers.NewSharedInformerFactoryWithOptions(
+		w.clientset,
+		30*time.Second,
+		factoryOptions...,
+	)
+	podInformer := factory.Core().V1().Pods().Informer()
+	// Add event handlers. The checked type assertions keep an unexpected
+	// object type from panicking inside the informer's callback goroutine.
+	podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: func(obj interface{}) {
+			if pod, ok := obj.(*corev1.Pod); ok {
+				w.handlePodEvent(ctx, pod)
+			}
+		},
+		UpdateFunc: func(oldObj, newObj interface{}) {
+			if pod, ok := newObj.(*corev1.Pod); ok {
+				w.handlePodEvent(ctx, pod)
+			}
+		},
+	})
+	// Start informer
+	go factory.Start(w.stopCh)
+	// Wait for cache sync
+	if !cache.WaitForCacheSync(w.stopCh, podInformer.HasSynced) {
+		return fmt.Errorf("failed to sync informer cache")
+	}
+	w.log.Info("Pod watcher started and cache synced")
+	// Wait for context cancellation or an explicit Stop.
+	select {
+	case <-ctx.Done():
+		// Stop may already have closed stopCh; closing a closed channel
+		// panics, so only close it when it is still open.
+		// NOTE(review): this check-then-close is not safe against a Stop
+		// racing concurrently with cancellation; a sync.Once on the struct
+		// would be needed for that.
+		select {
+		case <-w.stopCh:
+		default:
+			close(w.stopCh)
+		}
+	case <-w.stopCh:
+		// Stop was called; the informers are already stopping.
+	}
+	return nil
+}
+// HealthResponse is the JSON body returned by the /health endpoint.
+type HealthResponse struct {
+	// Status is the reported health state.
+	Status string `json:"status"`
+	// NodeName is the node the responding watcher is configured for.
+	NodeName string `json:"node_name"`
+}
+// startHealthServer starts an HTTP server for health checks on
+// w.config.ListenAddr and returns it so the caller can shut it down.
+//
+// The server exposes GET /health, which answers with a JSON HealthResponse;
+// other methods receive 405. Serving happens in a background goroutine and
+// any listen/serve error other than http.ErrServerClosed is logged.
+func (w *Watcher) startHealthServer(ctx context.Context) *http.Server {
+	mux := http.NewServeMux()
+	mux.HandleFunc("/health", func(rw http.ResponseWriter, r *http.Request) {
+		if r.Method != http.MethodGet {
+			http.Error(rw, "Method not allowed", http.StatusMethodNotAllowed)
+			return
+		}
+		rw.Header().Set("Content-Type", "application/json")
+		// The status line may already be on the wire when encoding fails, so
+		// the error can only be logged, not turned into an HTTP error.
+		if err := json.NewEncoder(rw).Encode(HealthResponse{
+			Status:   "healthy",
+			NodeName: w.config.NodeName,
+		}); err != nil {
+			w.log.WithError(err).Warn("Failed to write health check response")
+		}
+	})
+	server := &http.Server{
+		Addr:         w.config.ListenAddr,
+		Handler:      mux,
+		ReadTimeout:  10 * time.Second,
+		WriteTimeout: 10 * time.Second,
+		IdleTimeout:  60 * time.Second,
+	}
+	go func() {
+		w.log.WithField("addr", w.config.ListenAddr).Info("Starting health check server")
+		if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+			w.log.WithError(err).Error("Health check server error")
+		}
+	}()
+	return server
+}
+// Stop stops the watcher by closing its stop channel.
+//
+// Calling Stop again, or after the channel has already been closed, is a
+// no-op instead of a "close of closed channel" panic.
+// NOTE(review): the check-then-close is not safe against concurrent callers;
+// a sync.Once on the struct would be needed for that.
+func (w *Watcher) Stop() {
+	select {
+	case <-w.stopCh:
+		// Already closed.
+	default:
+		close(w.stopCh)
+	}
+}
+// handlePodEvent reacts to an add/update event for pod.
+//
+// Pods on other nodes, pods that are not Ready and pods without a
+// checkpoint-hash label are ignored. For the remaining pods a checkpoint is
+// started in a new goroutine unless one is already in progress or completed
+// for the same namespace/name.
+func (w *Watcher) handlePodEvent(ctx context.Context, pod *corev1.Pod) {
+	// Only Ready pods scheduled on this node are of interest.
+	if pod.Spec.NodeName != w.config.NodeName || !w.isPodReady(pod) {
+		return
+	}
+	podKey := pod.Namespace + "/" + pod.Name
+	// The checkpoint hash label doubles as the checkpoint ID; a missing label
+	// reads as the empty string.
+	checkpointID := pod.Labels[LabelCheckpointHash]
+	if checkpointID == "" {
+		w.log.WithField("pod", podKey).Warn("Pod has checkpoint label but no checkpoint-hash label")
+		return
+	}
+	// Claim the pod under the lock so that concurrent events cannot start a
+	// second checkpoint for it.
+	w.checkpointedMu.Lock()
+	switch w.checkpointed[podKey] {
+	case "completed", "in_progress":
+		w.checkpointedMu.Unlock()
+		return
+	}
+	w.checkpointed[podKey] = "in_progress"
+	w.checkpointedMu.Unlock()
+	fields := logrus.Fields{
+		"pod":           podKey,
+		"checkpoint_id": checkpointID,
+	}
+	w.log.WithFields(fields).Info("Pod ready, triggering checkpoint")
+	go w.doCheckpoint(ctx, pod, checkpointID, podKey)
+}
+// isPodReady reports whether pod is in the Running phase and carries a
+// PodReady condition whose status is True.
+func (w *Watcher) isPodReady(pod *corev1.Pod) bool {
+	if pod.Status.Phase != corev1.PodRunning {
+		return false
+	}
+	for i := range pod.Status.Conditions {
+		cond := &pod.Status.Conditions[i]
+		if cond.Type != corev1.PodReady {
+			continue
+		}
+		if cond.Status == corev1.ConditionTrue {
+			return true
+		}
+	}
+	return false
+}
+// doCheckpoint performs the checkpoint for pod and records the outcome.
+//
+// It looks up the signal file path (from the container's env) and the
+// container ID (from the pod status) of the container named "main", or of the
+// only container when there is just one, resolves the container and runs the
+// checkpoint. On success it writes the checkpoint.done marker and the signal
+// file and marks the pod "completed". On any failure the pod's in-progress
+// status is cleared so a later pod event can retry; a failed checkpoint also
+// gets a failure marker and signal file.
+func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointID, podKey string) {
+	log := w.log.WithFields(logrus.Fields{
+		"pod":           podKey,
+		"checkpoint_id": checkpointID,
+	})
+	// clearInProgress drops the pod's in_progress status so that the
+	// checkpoint can be retried.
+	clearInProgress := func() {
+		w.checkpointedMu.Lock()
+		delete(w.checkpointed, podKey)
+		w.checkpointedMu.Unlock()
+	}
+	// Find the main container and get signal file path from env
+	var containerID string
+	var signalFilePath string
+	for _, container := range pod.Spec.Containers {
+		if container.Name == "main" || len(pod.Spec.Containers) == 1 {
+			// Get signal file path from environment
+			for _, env := range container.Env {
+				if env.Name == EnvCheckpointSignalFile {
+					signalFilePath = env.Value
+					break
+				}
+			}
+			break
+		}
+	}
+	// Get container ID from status
+	const containerdPrefix = "containerd://"
+	for _, cs := range pod.Status.ContainerStatuses {
+		if cs.Name == "main" || len(pod.Status.ContainerStatuses) == 1 {
+			// Remove the containerd:// prefix. An ID consisting of nothing but
+			// the prefix becomes empty and is rejected below.
+			containerID = cs.ContainerID
+			if len(containerID) >= len(containerdPrefix) && containerID[:len(containerdPrefix)] == containerdPrefix {
+				containerID = containerID[len(containerdPrefix):]
+			}
+			break
+		}
+	}
+	if containerID == "" {
+		log.Error("Could not find container ID")
+		clearInProgress()
+		return
+	}
+	if signalFilePath == "" {
+		log.Warn("No DYN_CHECKPOINT_SIGNAL_FILE env var found, signal file will not be written")
+	}
+	log.WithFields(logrus.Fields{
+		"container_id":     containerID,
+		"signal_file_path": signalFilePath,
+	}).Info("Found container, starting checkpoint")
+	// Resolve container to get PID for signal file writing
+	containerInfo, err := w.discoveryClient.ResolveContainer(ctx, containerID)
+	if err != nil {
+		log.WithError(err).Error("Failed to resolve container")
+		clearInProgress()
+		return
+	}
+	// Perform checkpoint
+	opts := checkpoint.Options{
+		ContainerID:    containerID,
+		CheckpointID:   checkpointID,
+		CheckpointDir:  w.config.CheckpointDir,
+		NodeName:       w.config.NodeName,
+		PodName:        pod.Name,
+		PodNamespace:   pod.Namespace,
+		CUDAPluginDir:  w.config.CUDAPluginDir,
+		GhostLimit:     w.config.GhostLimit,
+		Timeout:        w.config.Timeout,
+		ExternalMounts: w.config.ExternalMounts,
+	}
+	result, err := w.checkpointer.Checkpoint(ctx, opts)
+	if err != nil {
+		log.WithError(err).Error("Checkpoint failed")
+		// Write failure marker to PVC so restore pods know checkpoint failed
+		checkpointDir := filepath.Join(w.config.CheckpointDir, checkpointID)
+		w.writeCheckpointDoneMarker(checkpointDir, checkpointID, false, err.Error(), log)
+		if signalFilePath != "" {
+			w.writeSignalFileToPod(int(containerInfo.PID), signalFilePath, checkpointID, "", false, err.Error())
+		}
+		// Clear the in_progress status so checkpoint can be retried
+		clearInProgress()
+		return
+	}
+	log.WithField("checkpoint_dir", result.CheckpointDir).Info("Checkpoint completed successfully")
+	// Write checkpoint.done marker to PVC for cross-node restore detection
+	// This is written AFTER rootfs-diff.tar is complete, so it's safe to use as a completion marker
+	w.writeCheckpointDoneMarker(result.CheckpointDir, checkpointID, true, "", log)
+	// Write signal file to pod's hostPath for checkpoint job pod to exit
+	if signalFilePath != "" {
+		w.writeSignalFileToPod(int(containerInfo.PID), signalFilePath, checkpointID, result.CheckpointDir, true, "")
+	}
+	// Mark as completed so we don't checkpoint again
+	w.checkpointedMu.Lock()
+	w.checkpointed[podKey] = "completed"
+	w.checkpointedMu.Unlock()
+}
+// writeSignalFileToPod writes a signal file to the checkpointed pod's filesystem
+// via <HostProc>/<pid>/root to indicate checkpoint completion.
+//
+// signalFilePath is the path inside the pod. It is normalised to a rooted,
+// cleaned path first, so a relative path is still placed under the pod root
+// and ".." components cannot climb out of it. Failures are logged; nothing is
+// returned.
+func (w *Watcher) writeSignalFileToPod(pid int, signalFilePath, checkpointID, checkpointPath string, success bool, errMsg string) {
+	signal := SignalFile{
+		CheckpointID:   checkpointID,
+		CheckpointPath: checkpointPath,
+		Timestamp:      time.Now().UTC(),
+		Success:        success,
+		Error:          errMsg,
+	}
+	data, err := json.MarshalIndent(signal, "", "  ")
+	if err != nil {
+		w.log.WithError(err).Error("Failed to marshal signal file")
+		return
+	}
+	// signalFilePath comes from the pod spec. Cleaning it as a rooted path
+	// guarantees a leading "/" and resolves any ".." lexically inside the pod
+	// root before it is appended to the host-side prefix.
+	// NOTE(review): symlinks inside the pod filesystem are presumably still
+	// resolved by the kernel relative to this process's root — confirm
+	// whether that needs hardening.
+	podPath := filepath.Clean("/" + signalFilePath)
+	// Write to the pod's filesystem via /proc/<pid>/root
+	// podPath is the path inside the pod (e.g., /var/lib/dynamo-checkpoint/signal.done)
+	hostSignalPath := fmt.Sprintf("%s/%d/root%s", w.config.HostProc, pid, podPath)
+	// Ensure signal directory exists in pod's filesystem
+	signalDir := filepath.Dir(hostSignalPath)
+	if err := os.MkdirAll(signalDir, 0755); err != nil {
+		w.log.WithError(err).WithField("path", signalDir).Error("Failed to create signal directory in pod")
+		return
+	}
+	if err := os.WriteFile(hostSignalPath, data, 0644); err != nil {
+		w.log.WithError(err).WithField("path", hostSignalPath).Error("Failed to write signal file to pod")
+		return
+	}
+	w.log.WithFields(logrus.Fields{
+		"host_path": hostSignalPath,
+		"pod_path":  signalFilePath,
+		"pid":       pid,
+		"success":   success,
+	}).Info("Signal file written to pod filesystem")
+}
+// writeCheckpointDoneMarker writes a checkpoint.done marker file to the checkpoint directory on shared PVC.
+// This file is written AFTER all checkpoint steps complete (including rootfs-diff.tar).
+// Restore pods on ANY node check for this file to know the checkpoint is complete and safe to restore.
+// This is separate from writeSignalFileToPod which signals the checkpoint job pod to exit.
+//
+// The marker is written to a temporary file and then renamed into place, so a
+// reader never observes a checkpoint.done that exists but is only partially
+// written. Failures are logged; nothing is returned.
+func (w *Watcher) writeCheckpointDoneMarker(checkpointDir, checkpointID string, success bool, errMsg string, log *logrus.Entry) {
+	markerPath := filepath.Join(checkpointDir, "checkpoint.done")
+	marker := SignalFile{
+		CheckpointID:   checkpointID,
+		CheckpointPath: checkpointDir,
+		Timestamp:      time.Now().UTC(),
+		Success:        success,
+		Error:          errMsg,
+	}
+	data, err := json.MarshalIndent(marker, "", "  ")
+	if err != nil {
+		log.WithError(err).Error("Failed to marshal checkpoint.done marker")
+		return
+	}
+	// Stage the content next to the final path so the rename stays within
+	// one directory.
+	tmpPath := markerPath + ".tmp"
+	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
+		log.WithError(err).WithField("path", markerPath).Error("Failed to write checkpoint.done marker")
+		return
+	}
+	if err := os.Rename(tmpPath, markerPath); err != nil {
+		// Best effort: do not leave the staging file behind.
+		_ = os.Remove(tmpPath)
+		log.WithError(err).WithField("path", markerPath).Error("Failed to write checkpoint.done marker")
+		return
+	}
+	log.WithFields(logrus.Fields{
+		"path":    markerPath,
+		"success": success,
+	}).Info("checkpoint.done marker written to PVC")
+}
--- a/deploy/chrek/scripts/smart-entrypoint.sh
+++ b/deploy/chrek/scripts/smart-entrypoint.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Smart entrypoint wrapper for CRIU checkpoint/restore
+# Automatically detects checkpoints and falls back to cold start if not found
+#
+# Behavior:
+# 1. If DYN_CHECKPOINT_HASH is set and checkpoint exists -> restore
+# 2. If WAIT_FOR_CHECKPOINT=1 -> wait for checkpoint (restore-entrypoint handles this)
+# 3. Otherwise -> execute provided command (cold start)
+set -e
+# Enable debug output if DEBUG=1
+if [ "${DEBUG:-0}" = "1" ]; then
+  set -x
+fi
+# Configuration from environment
+CHECKPOINT_PATH="${DYN_CHECKPOINT_PATH:-/checkpoints}"
+CHECKPOINT_HASH="${DYN_CHECKPOINT_HASH:-}"
+WAIT_FOR_CHECKPOINT="${WAIT_FOR_CHECKPOINT:-0}"
+# log MESSAGE... - print the arguments, joined by spaces and prefixed with
+# the script tag, on stderr.
+log() {
+  printf '%s\n' "[smart-entrypoint] $*" >&2
+}
+# Check if a checkpoint exists and should be restored.
+# Returns 0 when restore-entrypoint should be invoked, 1 for a cold start.
+should_restore_checkpoint() {
+  # If WAIT_FOR_CHECKPOINT is set, always use restore-entrypoint
+  # (it will wait for the checkpoint to appear)
+  if [ "$WAIT_FOR_CHECKPOINT" = "1" ]; then
+    log "WAIT_FOR_CHECKPOINT=1, will wait for checkpoint via restore-entrypoint"
+    return 0
+  fi
+  # If checkpoint hash is not set, no restore
+  if [ -z "$CHECKPOINT_HASH" ]; then
+    log "DYN_CHECKPOINT_HASH not set, no checkpoint to restore"
+    return 1
+  fi
+  # Check if checkpoint directory exists
+  CHECKPOINT_DIR="$CHECKPOINT_PATH/$CHECKPOINT_HASH"
+  if [ ! -d "$CHECKPOINT_DIR" ]; then
+    log "Checkpoint directory not found: $CHECKPOINT_DIR"
+    return 1
+  fi
+  # Check for checkpoint.done marker which is written LAST in the checkpoint process
+  # This is more reliable than inventory.img (created by CRIU) or rootfs-diff.tar (may be mid-write)
+  # Order: metadata.json -> CRIU dump (*.img) -> rootfs-diff.tar -> checkpoint.done
+  DONE_MARKER="$CHECKPOINT_DIR/checkpoint.done"
+  if [ ! -f "$DONE_MARKER" ]; then
+    log "Checkpoint incomplete - checkpoint.done not found in: $CHECKPOINT_DIR"
+    log "Checkpoint may still be in progress..."
+    return 1
+  fi
+  # The marker is also written for FAILED checkpoints, as JSON with
+  # "success": false. Its mere presence is therefore not enough: refuse to
+  # restore when it explicitly reports a failure. Markers without a success
+  # field are still accepted.
+  local marker_content failure_re
+  failure_re='"success"[[:space:]]*:[[:space:]]*false'
+  marker_content="$(cat "$DONE_MARKER" 2>/dev/null || true)"
+  if [[ "$marker_content" =~ $failure_re ]]; then
+    log "Checkpoint failed - checkpoint.done reports an unsuccessful checkpoint in: $CHECKPOINT_DIR"
+    return 1
+  fi
+  log "Checkpoint found: $CHECKPOINT_HASH (checkpoint.done marker present)"
+  return 0
+}
+# Main logic: hand over to restore-entrypoint when a restore is possible,
+# otherwise run the provided command as a cold start.
+rule="=========================================="
+if should_restore_checkpoint; then
+  for line in \
+    "$rule" \
+    "CHECKPOINT RESTORE MODE" \
+    "$rule" \
+    "Checkpoint: $CHECKPOINT_HASH" \
+    "Location: $CHECKPOINT_PATH/$CHECKPOINT_HASH" \
+    "Invoking restore-entrypoint..." \
+    "$rule"; do
+    log "$line"
+  done
+  # exec replaces this shell, so nothing below runs in restore mode.
+  # Any args passed to this script are forwarded (though restore-entrypoint ignores them)
+  exec /restore-entrypoint "$@"
+fi
+log "$rule"
+log "COLD START MODE"
+log "$rule"
+# No checkpoint found or not requested - fall back to cold start
+if [ $# -eq 0 ]; then
+  # Without a command there is nothing to run - this is likely an error
+  log "ERROR: No checkpoint to restore and no command provided"
+  log "Set DYN_CHECKPOINT_HASH to restore a checkpoint, or provide a command to run"
+  exit 1
+fi
+log "No checkpoint to restore"
+log "Executing command: $*"
+log "$rule"
+# Execute the provided command
+exec "$@"
--- a/deploy/helm/charts/chrek/Chart.yaml
+++ b/deploy/helm/charts/chrek/Chart.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: v2
+name: chrek
+description: Checkpoint/Restore infrastructure for Dynamo (PVC + DaemonSet + CRIU Agent)
+type: application
+version: 0.1.0
+appVersion: "1.0"
+keywords:
+  - nvidia
+  - dynamo
+  - checkpoint
+  - criu
+  - gpu
+home: https://github.com/ai-dynamo/dynamo
+sources:
+  - https://github.com/ai-dynamo/dynamo
+maintainers:
+  - name: NVIDIA
--- a/deploy/helm/charts/chrek/README.md
+++ b/deploy/helm/charts/chrek/README.md
+# ChReK Helm Chart
+> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. See [Prerequisites](#prerequisites) for security considerations.
+This Helm chart deploys the checkpoint/restore infrastructure for NVIDIA Dynamo, including:
+- Persistent Volume Claim (PVC) for checkpoint storage
+- DaemonSet running the CRIU checkpoint agent
+- RBAC resources (ServiceAccount, Role, RoleBinding)
+- Seccomp profile for blocking io_uring syscalls
+**Note:**
+- Each namespace gets its own isolated checkpoint infrastructure with namespace-scoped RBAC
+- **Currently only supports vLLM backend** (SGLang and TensorRT-LLM support planned)
+## Prerequisites
+⚠️ **Security Warning**: ChReK restore operations require **privileged mode**, which grants containers elevated host access. This may violate security policies in production environments. Only deploy in environments where privileged containers are acceptable.
+- Kubernetes 1.21+
+- GPU nodes with NVIDIA runtime (`nvidia` runtime class)
+- CRIU support in the container runtime (containerd with CRIU plugin)
+- NVIDIA Dynamo operator installed (cluster-wide or namespace-scoped)
+- RWX (ReadWriteMany) storage class for multi-node deployments
+- **Security clearance for privileged pods** (required for restore operations)
+## Installation
+> **Note:** The ChReK Helm chart is not yet published to a public Helm repository. For now, you must build and deploy from source.
+### Building from Source
+```bash
+# Set environment
+export NAMESPACE=my-team  # Your target namespace
+export DOCKER_SERVER=your-registry.com  # Your container registry (no trailing slash)
+export IMAGE_TAG=latest
+# Build ChReK agent image
+cd deploy/chrek
+docker build --target agent -t $DOCKER_SERVER/chrek-agent:$IMAGE_TAG .
+docker push $DOCKER_SERVER/chrek-agent:$IMAGE_TAG
+cd -
+# Install ChReK chart with custom image
+helm install chrek ./deploy/helm/charts/chrek/ \
+  --namespace ${NAMESPACE} \
+  --create-namespace \
+  --set daemonset.image.repository=${DOCKER_SERVER}/chrek-agent \
+  --set daemonset.image.tag=${IMAGE_TAG} \
+  --set daemonset.imagePullSecrets[0].name=your-registry-secret
+```
+## Configuration
+See `values.yaml` for all configuration options.
+### Key Configuration Options
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `storage.type` | Storage type: `pvc` (only supported), `s3` and `oci` planned | `pvc` |
+| `storage.pvc.create` | Create a new PVC | `true` |
+| `storage.pvc.name` | PVC name (must match operator config) | `chrek-pvc` |
+| `storage.pvc.size` | PVC size | `100Gi` |
+| `storage.pvc.storageClass` | Storage class name | `""` (default) |
+| `storage.signalHostPath` | Host path for signal files | `/var/lib/chrek/signals` |
+| `daemonset.image.repository` | DaemonSet image repository | `nvidia/chrek-agent` |
+| `daemonset.nodeSelector` | Node selector for GPU nodes | `nvidia.com/gpu.present: "true"` |
+| `daemonset.runtimeClassName` | Runtime class for GPU access | `nvidia` |
+| `daemonset.criu.timeout` | CRIU timeout in seconds | `"21600"` (6 hours) |
+| `daemonset.criu.ghostLimit` | CRIU ghost file size limit | `"512MB"` |
+| `rbac.namespaceRestricted` | Use namespace-scoped RBAC | `true` |
+## Usage
+After installing this chart, enable checkpointing in your DynamoGraphDeployment:
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: my-model
+  namespace: my-team
+spec:
+  services:
+    worker:
+      checkpoint:
+        enabled: true
+        mode: auto
+        identity:
+          model: Qwen/Qwen3-0.6B
+          backendFramework: vllm
+```
+## Multi-Namespace Deployment
+To enable checkpointing in multiple namespaces, install this chart in each namespace:
+```bash
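+# Namespace A (installed from source; the chart is not yet published to a Helm repository)
+helm install chrek ./deploy/helm/charts/chrek/ -n team-a
+# Namespace B
+helm install chrek ./deploy/helm/charts/chrek/ -n team-b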
+```
+Each namespace will have its own isolated checkpoint storage.
+## Verification
+```bash
+# Check PVC
+kubectl get pvc chrek-pvc -n my-team
+# Check DaemonSet
+kubectl get daemonset -n my-team
+# Check DaemonSet pods are running
+kubectl get pods -n my-team -l app.kubernetes.io/name=chrek
+```
+## Uninstallation
+```bash
+helm uninstall chrek -n my-team
+```
+**Note:** This will NOT delete the PVC by default. To delete the PVC:
+```bash
+kubectl delete pvc chrek-pvc -n my-team
+```
+## Troubleshooting
+### DaemonSet pods not starting
+Check if GPU nodes have the correct labels and runtime class:
+```bash
+kubectl get nodes -l nvidia.com/gpu.present=true
+kubectl get runtimeclass nvidia
+```
+If nodes don't have the `nvidia.com/gpu.present` label, you can add it:
+```bash
+kubectl label node <node-name> nvidia.com/gpu.present=true
+```
+### Checkpoint job fails
+Check DaemonSet logs:
+```bash
+kubectl logs -n my-team -l app.kubernetes.io/name=checkpoint-agent
+```
+### PVC not mounting
+Check PVC status and events:
+```bash
+kubectl describe pvc chrek-pvc -n my-team
+```
+Ensure your storage class supports `ReadWriteMany` access mode for multi-node deployments.
+## Related Documentation
+- [ChReK Overview](../../../../docs/kubernetes/chrek/README.md) - ChReK architecture and use cases
+- [ChReK with Dynamo Platform](../../../../docs/kubernetes/chrek/dynamo.md) - Integration guide
+- [ChReK Standalone Usage](../../../../docs/kubernetes/chrek/standalone.md) - Use ChReK without Dynamo Platform
+## License
+Apache License 2.0
--- a/deploy/helm/charts/chrek/templates/_helpers.tpl
+++ b/deploy/helm/charts/chrek/templates/_helpers.tpl
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+{{/*
+Chart name used in resource labels: nameOverride when provided, otherwise the
+chart's own name, cut to 63 characters with any trailing "-" stripped.
+*/}}
+{{- define "chrek.name" -}}
+{{- .Values.nameOverride | default .Chart.Name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{/*
+Create a default fully qualified app name.
+Resolution order:
+  1. fullnameOverride, when set.
+  2. The release name alone, when it already contains the chart name
+     (or nameOverride).
+  3. "<release name>-<chart name or nameOverride>" otherwise.
+Every result is truncated to 63 characters and a trailing "-" is removed.
+*/}}
+{{- define "chrek.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+{{/*
+Create chart name and version as used by the chart label.
+Renders "<chart name>-<chart version>", replaces "+" with "_", truncates the
+result to 63 characters and removes a trailing "-".
+*/}}
+{{- define "chrek.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{/*
+Common labels shared by the chart's resources: chart name/version, the selector
+labels, the app version (when the chart defines one) and the managing service.
+The app.kubernetes.io/component label is intentionally NOT emitted here: each
+template that includes this helper sets its own component value right after the
+include, so emitting it here as well would produce a duplicate mapping key in
+the rendered manifest.
+*/}}
+{{- define "chrek.labels" -}}
+helm.sh/chart: {{ include "chrek.chart" . }}
+{{ include "chrek.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+{{/*
+Selector labels: the chart name (see "chrek.name") and the release name.
+Included by "chrek.labels".
+NOTE(review): daemonset.yaml does not use this helper for its selector or pod
+labels; it hard-codes app.kubernetes.io/name: checkpoint-agent instead.
+*/}}
+{{- define "chrek.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "chrek.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+{{/*
+Create the name of the service account to use.
+When serviceAccount.create is true: serviceAccount.name, falling back to the
+chart's full name. Otherwise: serviceAccount.name, falling back to "default".
+*/}}
+{{- define "chrek.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create }}
+{{- default (include "chrek.fullname" . ) .Values.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.serviceAccount.name }}
+{{- end }}
+{{- end }}
--- a/deploy/helm/charts/chrek/templates/daemonset.yaml
+++ b/deploy/helm/charts/chrek/templates/daemonset.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: {{ include "chrek.fullname" . }}-agent
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "chrek.labels" . | nindent 4 }}
+    app.kubernetes.io/component: checkpoint-agent
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: checkpoint-agent
+      app.kubernetes.io/instance: {{ .Release.Name }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: checkpoint-agent
+        app.kubernetes.io/instance: {{ .Release.Name }}
+        {{- with .Values.daemonset.podLabels }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- with .Values.daemonset.podAnnotations }}
+      annotations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+    spec:
+      serviceAccountName: {{ include "chrek.serviceAccountName" . }}
+      hostPID: true
+      hostIPC: true
+      hostNetwork: true
+      {{- with .Values.daemonset.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      tolerations:
+        # Default: tolerate all taints (allow running on any node)
+        - operator: Exists
+        {{- with .Values.daemonset.tolerations }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- with .Values.daemonset.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- if .Values.daemonset.runtimeClassName }}
+      # Use specified runtime class for GPU access (e.g., nvidia for CUDA checkpointing)
+      runtimeClassName: {{ .Values.daemonset.runtimeClassName }}
+      {{- end }}
+      {{- if .Values.seccomp.deploy }}
+      initContainers:
+        # Deploy seccomp profile to host before starting the agent
+        # This profile blocks io_uring syscalls that CRIU doesn't support
+        - name: deploy-seccomp
+          image: busybox:latest
+          command:
+            - sh
+            - -c
+            - |
+              mkdir -p /host-seccomp/profiles
+              cp /seccomp-profiles/block-iouring.json /host-seccomp/profiles/block-iouring.json
+              echo "Deployed seccomp profile to /var/lib/kubelet/seccomp/profiles/block-iouring.json"
+          volumeMounts:
+            - name: seccomp-profiles
+              mountPath: /seccomp-profiles
+              readOnly: true
+            - name: host-seccomp
+              mountPath: /host-seccomp
+      {{- end }}
+      containers:
+        - name: agent
+          image: "{{ .Values.daemonset.image.repository }}:{{ .Values.daemonset.image.tag }}"
+          imagePullPolicy: {{ .Values.daemonset.image.pullPolicy }}
+          securityContext:
+            privileged: true
+          env:
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            # Agent mode: use "watcher" to watch for labeled pods
+            - name: CHECKPOINT_SIGNAL_FROM
+              value: "watcher"
+            {{- if .Values.rbac.namespaceRestricted }}
+            # Restrict pod watching to this namespace (namespace-scoped RBAC)
+            - name: RESTRICTED_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+            {{- end }}
+            # Checkpoint storage directory
+            - name: CHECKPOINT_DIR
+              value: {{ .Values.storage.pvc.basePath | quote }}
+            # Host proc mount point for CRIU operations
+            - name: HOST_PROC
+              value: "/host/proc"
+            # Containerd socket path (quoted like the other values-driven env
+            # entries so the rendered value is always a YAML string)
+            - name: CONTAINERD_SOCKET
+              value: {{ .Values.daemonset.containerRuntimeSocket | quote }}
+            {{- if .Values.daemonset.criu.cudaPluginDir }}
+            # CUDA plugin directory for GPU checkpoint support
+            - name: CUDA_PLUGIN_DIR
+              value: {{ .Values.daemonset.criu.cudaPluginDir | quote }}
+            {{- end }}
+            {{- if .Values.daemonset.criu.ghostLimit }}
+            # CRIU ghost file size limit in bytes
+            - name: CRIU_GHOST_LIMIT
+              value: {{ .Values.daemonset.criu.ghostLimit | quote }}
+            {{- end }}
+            {{- if .Values.daemonset.criu.timeout }}
+            # CRIU timeout in seconds
+            - name: CRIU_TIMEOUT
+              value: {{ .Values.daemonset.criu.timeout | quote }}
+            {{- end }}
+            # Storage type (for future S3/OCI support)
+            - name: DYN_CHECKPOINT_STORAGE_TYPE
+              value: {{ .Values.storage.type | quote }}
+          volumeMounts:
+            {{- if eq .Values.storage.type "pvc" }}
+            # Mount the checkpoint PVC (only for PVC storage type)
+            - name: checkpoints
+              mountPath: {{ .Values.storage.pvc.basePath }}
+            {{- end }}
+            # Mount containerd runtime directory for checkpoint operations
+            - name: containerd-run
+              mountPath: /run/containerd
+            # Mount kubelet pods directory for volume discovery
+            - name: kubelet-pods
+              mountPath: /var/lib/kubelet/pods
+              readOnly: true
+            # Mount containerd storage for filesystem info
+            - name: containerd-storage
+              mountPath: /var/lib/containerd
+              readOnly: true
+            # Mount host proc for CRIU and signal file writing
+            - name: host-proc
+              mountPath: /host/proc
+            # Mount host cgroup for CRIU
+            - name: host-cgroup
+              mountPath: /sys/fs/cgroup
+              readOnly: true
+            {{- if and (eq .Values.storage.type "oci") .Values.storage.oci.credentialsSecretRef }}
+            # Mount docker config for OCI registry auth
+            - name: docker-config
+              mountPath: /root/.docker
+              readOnly: true
+            {{- end }}
+          {{- if and (eq .Values.storage.type "s3") .Values.storage.s3.credentialsSecretRef }}
+          envFrom:
+            - secretRef:
+                name: {{ .Values.storage.s3.credentialsSecretRef }}
+          {{- end }}
+          resources:
+            {{- toYaml .Values.daemonset.resources | nindent 12 }}
+      volumes:
+        {{- if .Values.seccomp.deploy }}
+        # Seccomp profile ConfigMap (used by initContainer)
+        - name: seccomp-profiles
+          configMap:
+            name: {{ include "chrek.fullname" . }}-seccomp
+        # Host seccomp directory (for deploying the profile)
+        - name: host-seccomp
+          hostPath:
+            path: /var/lib/kubelet/seccomp
+            type: DirectoryOrCreate
+        {{- end }}
+        {{- if eq .Values.storage.type "pvc" }}
+        - name: checkpoints
+          persistentVolumeClaim:
+            claimName: {{ .Values.storage.pvc.name }}
+        {{- end }}
+        # Containerd runtime directory (read-write for checkpoint operations)
+        - name: containerd-run
+          hostPath:
+            path: /run/containerd
+            type: Directory
+        # Kubelet pods directory (for volume discovery)
+        - name: kubelet-pods
+          hostPath:
+            path: /var/lib/kubelet/pods
+            type: Directory
+        # Containerd storage directory (for filesystem info)
+        - name: containerd-storage
+          hostPath:
+            path: /var/lib/containerd
+            type: Directory
+        # Host proc (for CRIU and signal files - needs write access)
+        - name: host-proc
+          hostPath:
+            path: /proc
+            type: Directory
+        # Host cgroup (for CRIU)
+        - name: host-cgroup
+          hostPath:
+            path: /sys/fs/cgroup
+            type: Directory
+        {{- if and (eq .Values.storage.type "oci") .Values.storage.oci.credentialsSecretRef }}
+        - name: docker-config
+          secret:
+            secretName: {{ .Values.storage.oci.credentialsSecretRef }}
+        {{- end }}
+      {{- with .Values.daemonset.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
--- a/deploy/helm/charts/chrek/templates/pvc.yaml
+++ b/deploy/helm/charts/chrek/templates/pvc.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+{{- if and (eq .Values.storage.type "pvc") .Values.storage.pvc.create }}
+# Checkpoint storage claim. Rendered only for PVC storage when
+# storage.pvc.create is true; the agent DaemonSet mounts it by name.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ .Values.storage.pvc.name }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ include "chrek.name" . }}
+    app.kubernetes.io/instance: {{ .Release.Name }}
+    app.kubernetes.io/component: storage
+  annotations:
+    # Keep the claim (and the checkpoints stored on it) when the release is
+    # uninstalled; it must then be removed explicitly with kubectl delete pvc.
+    helm.sh/resource-policy: keep
+spec:
+  accessModes:
+    - {{ .Values.storage.pvc.accessMode }}
+  resources:
+    requests:
+      storage: {{ .Values.storage.pvc.size }}
+  {{- if .Values.storage.pvc.storageClass }}
+  storageClassName: {{ .Values.storage.pvc.storageClass }}
+  {{- end }}
+{{- end }}
--- a/deploy/helm/charts/chrek/templates/role.yaml
+++ b/deploy/helm/charts/chrek/templates/role.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+{{- if .Values.rbac.create }}
+{{- if .Values.rbac.namespaceRestricted }}
+# Namespace-scoped mode: Role limited to the release namespace
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ include "chrek.fullname" . }}-agent
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "chrek.labels" . | nindent 4 }}
+    app.kubernetes.io/component: checkpoint-agent
+rules:
+  # Watch pods in this namespace to detect checkpoint-source pods becoming ready
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list", "watch"]
+{{- else }}
+# Cluster-wide mode: ClusterRole with the same read-only pod permissions
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "chrek.fullname" . }}-agent
+  labels:
+    {{- include "chrek.labels" . | nindent 4 }}
+    app.kubernetes.io/component: checkpoint-agent
+rules:
+  # Watch pods cluster-wide to detect checkpoint-source pods on assigned nodes
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list", "watch"]
+{{- end }}
+{{- end }}
--- a/deploy/helm/charts/chrek/templates/rolebinding.yaml
+++ b/deploy/helm/charts/chrek/templates/rolebinding.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+{{- if .Values.rbac.create }}
+{{- if .Values.rbac.namespaceRestricted }}
+# Namespace-scoped mode: bind the agent Role to the agent service account
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ include "chrek.fullname" . }}-agent
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "chrek.labels" . | nindent 4 }}
+    app.kubernetes.io/component: checkpoint-agent
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: {{ include "chrek.fullname" . }}-agent
+subjects:
+  - kind: ServiceAccount
+    name: {{ include "chrek.serviceAccountName" . }}
+    namespace: {{ .Release.Namespace }}
+{{- else }}
+# Cluster-wide mode: bind the agent ClusterRole to the agent service account
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "chrek.fullname" . }}-agent
+  labels:
+    {{- include "chrek.labels" . | nindent 4 }}
+    app.kubernetes.io/component: checkpoint-agent
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "chrek.fullname" . }}-agent
+subjects:
+  - kind: ServiceAccount
+    name: {{ include "chrek.serviceAccountName" . }}
+    namespace: {{ .Release.Namespace }}
+{{- end }}
+{{- end }}
--- a/deploy/helm/charts/chrek/templates/seccomp-configmap.yaml
+++ b/deploy/helm/charts/chrek/templates/seccomp-configmap.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+{{- if .Values.seccomp.deploy }}
+# ConfigMap carrying the seccomp profile; rendered only when seccomp.deploy is true.
+# The DaemonSet's deploy-seccomp init container copies block-iouring.json from
+# this ConfigMap into the host seccomp directory.
+# The profile allows every syscall by default and returns an error for the three
+# io_uring syscalls listed below.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "chrek.fullname" . }}-seccomp
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "chrek.labels" . | nindent 4 }}
+    app.kubernetes.io/component: seccomp
+data:
+  block-iouring.json: |
+    {
+      "defaultAction": "SCMP_ACT_ALLOW",
+      "architectures": ["SCMP_ARCH_X86_64", "SCMP_ARCH_X86", "SCMP_ARCH_X32"],
+      "syscalls": [
+        {
+          "names": ["io_uring_setup", "io_uring_enter", "io_uring_register"],
+          "action": "SCMP_ACT_ERRNO",
+          "comment": "Block io_uring syscalls - CRIU doesn't support io_uring memory mappings"
+        }
+      ]
+    }
+{{- end }}
--- a/deploy/helm/charts/chrek/templates/serviceaccount.yaml
+++ b/deploy/helm/charts/chrek/templates/serviceaccount.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+{{- if .Values.serviceAccount.create }}
+# Service account used by the agent DaemonSet; rendered only when
+# serviceAccount.create is true. Optional annotations come from
+# serviceAccount.annotations.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "chrek.serviceAccountName" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "chrek.labels" . | nindent 4 }}
+    app.kubernetes.io/component: checkpoint-agent
+  {{- with .Values.serviceAccount.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+{{- end }}
--- a/deploy/helm/charts/chrek/values.yaml
+++ b/deploy/helm/charts/chrek/values.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Chrek - Checkpoint/Restore Infrastructure
+# This chart deploys the checkpoint storage (PVC) and CRIU agent (DaemonSet)
+# in a single namespace. Install this chart in each namespace where you want
+# to enable checkpoint/restore functionality for DynamoGraphDeployments.
+# Storage configuration for checkpoints
+storage:
+  # Storage type: pvc (default, the only type currently supported); s3 and oci are planned
+  type: pvc
+  # PVC configuration (when type=pvc)
+  pvc:
+    # Create a new PVC (set to false if using existing PVC)
+    create: true
+    # PVC name - must match operator configuration
+    name: chrek-pvc
+    # PVC size
+    size: 100Gi
+    # Storage class (leave empty for default)
+    storageClass: ""
+    # Access mode - ReadWriteMany required for multi-pod access
+    accessMode: ReadWriteMany
+    # Base path for checkpoints (mounted in pods)
+    basePath: /checkpoints
+  # S3 configuration (when type=s3)
+  s3:
+    # S3 URI (e.g., s3://my-bucket/checkpoints)
+    uri: ""
+    # Credentials are expected via IRSA or mounted secrets
+    # Optional: name of a Secret whose keys are injected into the agent
+    # container as environment variables (envFrom). Leave empty to skip.
+    credentialsSecretRef: ""
+  # OCI configuration (when type=oci)
+  oci:
+    # OCI URI (e.g., oci://registry.io/repo/checkpoints)
+    uri: ""
+    # Optional: name of a Secret mounted read-only at /root/.docker in the
+    # agent container for registry authentication. Leave empty to skip.
+    credentialsSecretRef: ""
+  # Host path for signal files (inter-pod communication)
+  signalHostPath: /var/lib/chrek/signals
+# DaemonSet configuration for chrek (checkpoint/restore) agent
+daemonset:
+  # Container image
+  image:
+    repository: nvcr.io/nvidian/dynamo-dev/chrek-agent
+    tag: latest
+    pullPolicy: Always
+  # Image pull secrets
+  imagePullSecrets:
+    - name: ngc-secret
+  # Resource limits and requests
+  resources:
+    limits:
+      cpu: 2
+      memory: 4Gi
+    requests:
+      cpu: 500m
+      memory: 1Gi
+  # Node selector - target GPU nodes
+  nodeSelector:
+    nvidia.com/gpu.present: "true"
+  # Tolerations for GPU nodes
+  tolerations:
+    - key: nvidia.com/gpu
+      operator: Exists
+      effect: NoSchedule
+    - key: dedicated
+      operator: Exists
+      effect: NoSchedule
+  # Runtime class name for GPU access
+  runtimeClassName: nvidia
+  # Pod labels
+  podLabels: {}
+  # Pod annotations
+  podAnnotations: {}
+  # Affinity rules
+  affinity: {}
+  # CRIU configuration
+  criu:
+    # CUDA plugin directory
+    cudaPluginDir: /usr/local/lib/criu
+    # CRIU timeout in seconds (6 hours)
+    timeout: "21600"
+    # Ghost file size limit in bytes
+    # 512MB is recommended for GPU workloads with large memory allocations
+    ghostLimit: "536870912"
+  # Container runtime socket path
+  containerRuntimeSocket: /run/containerd/containerd.sock
+# Seccomp profile configuration
+seccomp:
+  # Deploy seccomp profile for blocking io_uring (required for CRIU)
+  deploy: true
+# Service account configuration
+serviceAccount:
+  # Create service account
+  create: true
+  # Service account name (generated if not set)
+  name: ""
+  # Annotations for service account (e.g., for IRSA)
+  annotations: {}
+# RBAC configuration
+rbac:
+  # Create RBAC resources
+  create: true
+  # Namespace-scoped RBAC (recommended, required for PVC storage)
+  # - true (default): Creates Role/RoleBinding, agent watches pods in chart's namespace only
+  # - false: Creates ClusterRole/ClusterRoleBinding, agent watches all pods on assigned nodes
+  # Note: PVC storage requires namespace-scoped mode (true) as PVCs are namespace-scoped
+  namespaceRestricted: true
--- a/deploy/helm/charts/crds/templates/nvidia.com_dynamocheckpoints.yaml
+++ b/deploy/helm/charts/crds/templates/nvidia.com_dynamocheckpoints.yaml