feat(chrek): config refactor, /dev/shm support, and mount-policy rewrite (#5946)

d381e6ff · Schwinn Saereesitthipitak · GitHub · b6824ae0 · d381e6ff · b6824ae0
Unverified Commit d381e6ff authored Feb 11, 2026 by Schwinn Saereesitthipitak Committed by GitHub Feb 12, 2026
20 changed files
--- a/deploy/chrek/pkg/common/criu.go
+++ b/deploy/chrek/pkg/common/criu.go
@@ -10,10 +10,10 @@ import (
 	"golang.org/x/sys/unix"
 )
-// OpenDirForCRIU opens a directory and clears the CLOEXEC flag so the FD
+// OpenPathForCRIU opens a path (directory or file) and clears the CLOEXEC flag
-// can be inherited by CRIU child processes.
+// so the FD can be inherited by CRIU child processes.
 // Returns the opened file and its FD. Caller must close the file when done.
-func OpenDirForCRIU(path string) (*os.File, int32, error) {
+func OpenPathForCRIU(path string) (*os.File, int32, error) {
 	dir, err := os.Open(path)
 	if err != nil {
 		return nil, 0, fmt.Errorf("failed to open %s: %w", path, err)
@@ -30,41 +30,6 @@ func OpenDirForCRIU(path string) (*os.File, int32, error) {
 	return dir, int32(dir.Fd()), nil
 }
-// DefaultMaskedPaths returns the standard OCI masked paths.
-// These paths are typically masked (made inaccessible) in containers.
-// Used as fallback when checkpoint metadata doesn't include OCI-derived paths.
-func DefaultMaskedPaths() []string {
-	return []string{
-		"/proc/bus",
-		"/proc/fs",
-		"/proc/irq",
-		"/proc/sys",
-		"/proc/sysrq-trigger",
-		"/proc/acpi",
-		"/proc/kcore",
-		"/proc/keys",
-		"/proc/latency_stats",
-		"/proc/timer_list",
-		"/proc/scsi",
-		"/proc/interrupts",
-		"/proc/asound",
-		"/sys/firmware",
-		"/sys/devices/virtual/powercap",
-	}
-}
-// DefaultReadonlyPaths returns the standard OCI readonly paths.
-// These paths are typically mounted read-only in containers.
-func DefaultReadonlyPaths() []string {
-	return []string{
-		"/proc/bus",
-		"/proc/fs",
-		"/proc/irq",
-		"/proc/sys",
-		"/proc/sysrq-trigger",
-	}
-}
 // CRIUMountPoint represents a parsed mount point from /proc/pid/mountinfo.
 type CRIUMountPoint struct {
 	MountID   string // Mount ID

--- a/deploy/chrek/pkg/common/metadata.go
+++ b/deploy/chrek/pkg/common/metadata.go
-// metadata.go handles checkpoint metadata for cross-node restore operations.
-package common
-import (
-	"encoding/json"
-	"fmt"
-	"os"
-	"path/filepath"
-	"time"
-)
-const (
-	// MetadataFilename is the name of the metadata file in checkpoint directories
-	MetadataFilename = "metadata.json"
-	// DescriptorsFilename is the name of the file descriptors file
-	DescriptorsFilename = "descriptors.json"
-)
-// CheckpointMetadata stores information needed for cross-node restore
-type CheckpointMetadata struct {
-	// Checkpoint identification
-	CheckpointID string    `json:"checkpoint_id"`
-	CreatedAt    time.Time `json:"created_at"`
-	// Source information
-	SourceNode   string `json:"source_node"`
-	SourcePodIP  string `json:"source_pod_ip,omitempty"` // For cross-node TCP detection
-	ContainerID  string `json:"container_id"`
-	PodName      string `json:"pod_name"`
-	PodNamespace string `json:"pod_namespace"`
-	Image        string `json:"image"`
-	// Process information
-	PID int `json:"pid"`
-	// Filesystem information
-	RootfsDiffPath  string `json:"rootfs_diff_path,omitempty"`   // Path to rootfs-diff.tar
-	UpperDir        string `json:"upper_dir,omitempty"`          // Original overlay upperdir
-	HasRootfsDiff   bool   `json:"has_rootfs_diff"`              // Whether rootfs diff was captured
-	HasDeletedFiles bool   `json:"has_deleted_files"`            // Whether deleted files were tracked
-	// Mount mappings from original container
-	Mounts []MountMetadata `json:"mounts"`
-	// OCI spec derived paths (populated from containerd, used at restore)
-	// These replace hardcoded values with runtime-discovered configuration
-	MaskedPaths    []string `json:"masked_paths,omitempty"`     // From OCI spec Linux.MaskedPaths
-	ReadonlyPaths  []string `json:"readonly_paths,omitempty"`   // From OCI spec Linux.ReadonlyPaths
-	BindMountDests []string `json:"bind_mount_dests,omitempty"` // Destinations of bind mounts (for tar exclusions)
-	// Namespace information
-	Namespaces []NamespaceMetadata `json:"namespaces"`
-	// CRIU options used during checkpoint (for restore compatibility)
-	CRIUOptions CRIUOptionsMetadata `json:"criu_options"`
-}
-// CRIUOptionsMetadata stores CRIU options used during checkpoint.
-// This allows restore to use compatible options.
-// Note: In our implementation, most options are hardcoded as always-on for K8s,
-// but we store them for compatibility and debugging purposes.
-type CRIUOptionsMetadata struct {
-	TcpEstablished bool `json:"tcp_established"`
-	TcpClose       bool `json:"tcp_close"`
-	ShellJob       bool `json:"shell_job"`
-	FileLocks      bool `json:"file_locks"`
-	LeaveRunning   bool `json:"leave_running"`
-	LinkRemap      bool `json:"link_remap"`
-	ExtMasters     bool `json:"ext_masters"`
-}
-// MountMetadata stores information about a mount for remapping during restore
-type MountMetadata struct {
-	ContainerPath string   `json:"container_path"`           // Path inside container (e.g., /usr/share/nginx/html)
-	HostPath      string   `json:"host_path"`                // Original host path from mountinfo
-	OCISource     string   `json:"oci_source,omitempty"`     // Source path from OCI spec (may differ from HostPath)
-	OCIType       string   `json:"oci_type,omitempty"`       // Mount type from OCI spec (bind, tmpfs, etc.)
-	OCIOptions    []string `json:"oci_options,omitempty"`    // Mount options from OCI spec
-	VolumeType    string   `json:"volume_type"`              // emptyDir, pvc, configMap, secret, hostPath (best-effort)
-	VolumeName    string   `json:"volume_name"`              // Kubernetes volume name (best-effort from path parsing)
-	FSType        string   `json:"fs_type"`                  // Filesystem type from mountinfo
-	ReadOnly      bool     `json:"read_only"`                // Whether mount is read-only
-}
-// NamespaceMetadata stores namespace information
-type NamespaceMetadata struct {
-	Type       string `json:"type"`        // net, pid, mnt, etc.
-	Inode      uint64 `json:"inode"`       // Namespace inode
-	IsExternal bool   `json:"is_external"` // Whether namespace is external (shared)
-}
-// NewCheckpointMetadata creates a new metadata instance
-func NewCheckpointMetadata(checkpointID string) *CheckpointMetadata {
-	return &CheckpointMetadata{
-		CheckpointID: checkpointID,
-		CreatedAt:    time.Now().UTC(),
-		Mounts:       make([]MountMetadata, 0),
-		Namespaces:   make([]NamespaceMetadata, 0),
-	}
-}
-// SaveMetadata writes metadata to a JSON file in the checkpoint directory
-func SaveMetadata(checkpointDir string, meta *CheckpointMetadata) error {
-	data, err := json.MarshalIndent(meta, "", "  ")
-	if err != nil {
-		return fmt.Errorf("failed to marshal metadata: %w", err)
-	}
-	metadataPath := filepath.Join(checkpointDir, MetadataFilename)
-	if err := os.WriteFile(metadataPath, data, 0644); err != nil {
-		return fmt.Errorf("failed to write metadata file: %w", err)
-	}
-	return nil
-}
-// LoadMetadata reads metadata from a checkpoint directory
-func LoadMetadata(checkpointDir string) (*CheckpointMetadata, error) {
-	metadataPath := filepath.Join(checkpointDir, MetadataFilename)
-	data, err := os.ReadFile(metadataPath)
-	if err != nil {
-		return nil, fmt.Errorf("failed to read metadata file: %w", err)
-	}
-	var meta CheckpointMetadata
-	if err := json.Unmarshal(data, &meta); err != nil {
-		return nil, fmt.Errorf("failed to unmarshal metadata: %w", err)
-	}
-	return &meta, nil
-}
-// SaveDescriptors writes file descriptor information to the checkpoint directory
-func SaveDescriptors(checkpointDir string, descriptors []string) error {
-	data, err := json.Marshal(descriptors)
-	if err != nil {
-		return fmt.Errorf("failed to marshal descriptors: %w", err)
-	}
-	descriptorsPath := filepath.Join(checkpointDir, DescriptorsFilename)
-	if err := os.WriteFile(descriptorsPath, data, 0600); err != nil {
-		return fmt.Errorf("failed to write descriptors file: %w", err)
-	}
-	return nil
-}
-// LoadDescriptors reads file descriptor information from checkpoint directory
-func LoadDescriptors(checkpointDir string) ([]string, error) {
-	descriptorsPath := filepath.Join(checkpointDir, DescriptorsFilename)
-	data, err := os.ReadFile(descriptorsPath)
-	if err != nil {
-		return nil, fmt.Errorf("failed to read descriptors file: %w", err)
-	}
-	var descriptors []string
-	if err := json.Unmarshal(data, &descriptors); err != nil {
-		return nil, fmt.Errorf("failed to unmarshal descriptors: %w", err)
-	}
-	return descriptors, nil
-}
-// GetCheckpointDir returns the path to a checkpoint directory
-func GetCheckpointDir(baseDir, checkpointID string) string {
-	return filepath.Join(baseDir, checkpointID)
-}
-// ListCheckpoints returns all checkpoint IDs in the base directory
-func ListCheckpoints(baseDir string) ([]string, error) {
-	entries, err := os.ReadDir(baseDir)
-	if err != nil {
-		return nil, fmt.Errorf("failed to read checkpoint directory: %w", err)
-	}
-	var checkpoints []string
-	for _, entry := range entries {
-		if !entry.IsDir() {
-			continue
-		}
-		// Check if metadata file exists
-		metadataPath := filepath.Join(baseDir, entry.Name(), MetadataFilename)
-		if _, err := os.Stat(metadataPath); err == nil {
-			checkpoints = append(checkpoints, entry.Name())
-		}
-	}
-	return checkpoints, nil
-}
-// GetCheckpointInfo returns metadata for a specific checkpoint
-func GetCheckpointInfo(baseDir, checkpointID string) (*CheckpointMetadata, error) {
-	checkpointDir := GetCheckpointDir(baseDir, checkpointID)
-	return LoadMetadata(checkpointDir)
-}
-// DeleteCheckpoint removes a checkpoint directory
-func DeleteCheckpoint(baseDir, checkpointID string) error {
-	checkpointDir := GetCheckpointDir(baseDir, checkpointID)
-	return os.RemoveAll(checkpointDir)
-}
--- a/deploy/chrek/pkg/http_api_server/handlers.go
+++ b/deploy/chrek/pkg/http_api_server/handlers.go
+// handlers.go provides HTTP handlers for the checkpoint agent server.
+package httpApiServer
+import (
+	"encoding/json"
+	"fmt"
+	"log"
+	"net/http"
+	"os"
+	"path/filepath"
+	"time"
+	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
+)
+// Handlers holds dependencies for HTTP handlers.
+type Handlers struct {
+	// cfg is the server configuration; the handlers read NodeName and
+	// CheckpointSpec from it.
+	cfg          ServerConfig
+	// checkpointer performs the checkpoint requested through HandleCheckpoint.
+	checkpointer *checkpoint.Checkpointer
+}
+// NewHandlers builds a Handlers value wired to the given server
+// configuration and checkpointer.
+func NewHandlers(cfg ServerConfig, checkpointer *checkpoint.Checkpointer) *Handlers {
+	h := new(Handlers)
+	h.cfg = cfg
+	h.checkpointer = checkpointer
+	return h
+}
+// HandleHealth serves GET /health. Any other method is answered with
+// 405 Method Not Allowed; a GET is answered with 200 and a HealthResponse
+// carrying the status "healthy" and the configured node name.
+func (h *Handlers) HandleHealth(w http.ResponseWriter, r *http.Request) {
+	switch r.Method {
+	case http.MethodGet:
+		writeJSON(w, http.StatusOK, HealthResponse{
+			Status:   "healthy",
+			NodeName: h.cfg.NodeName,
+		})
+	default:
+		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+	}
+}
+// HandleCheckpoint handles POST /checkpoint requests.
+//
+// The JSON request body must carry a container_id. checkpoint_id is optional:
+// when empty, one is generated from the current time. A supplied
+// checkpoint_id must be a single path element, because checkpoint IDs are
+// joined onto the checkpoint base path when checkpoints are listed.
+//
+// Responses are CheckpointResponse JSON documents, except for the plain-text
+// 405 sent for non-POST methods:
+//   - 400: undecodable body, missing container_id, or invalid checkpoint_id
+//   - 500: the checkpoint failed, or the checkpoint.done marker could not be written
+//   - 200: the checkpoint and its checkpoint.done marker were written
+func (h *Handlers) HandleCheckpoint(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodPost {
+		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+		return
+	}
+	var req CheckpointRequest
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		writeJSON(w, http.StatusBadRequest, CheckpointResponse{
+			Success: false,
+			Error:   fmt.Sprintf("Invalid request body: %v", err),
+		})
+		return
+	}
+	if req.ContainerID == "" {
+		writeJSON(w, http.StatusBadRequest, CheckpointResponse{
+			Success: false,
+			Error:   "container_id is required",
+		})
+		return
+	}
+	if req.CheckpointID == "" {
+		req.CheckpointID = fmt.Sprintf("ckpt-%d", time.Now().UnixNano())
+	} else if id := req.CheckpointID; id == "." || id == ".." || filepath.Base(id) != id {
+		// The ID comes from the client and is used as a directory name, so it
+		// must not contain separators or refer to a parent directory.
+		writeJSON(w, http.StatusBadRequest, CheckpointResponse{
+			Success: false,
+			Error:   "checkpoint_id must be a single path element",
+		})
+		return
+	}
+	// Build checkpoint params
+	params := checkpoint.CheckpointRequest{
+		ContainerID:   req.ContainerID,
+		ContainerName: req.ContainerName,
+		CheckpointID:  req.CheckpointID,
+		CheckpointDir: h.cfg.CheckpointSpec.BasePath,
+		NodeName:      h.cfg.NodeName,
+		PodName:       req.PodName,
+		PodNamespace:  req.PodNamespace,
+	}
+	// Copy checkpoint spec and disable CUDA if requested.
+	checkpointSpec := *h.cfg.CheckpointSpec
+	if req.DisableCUDA {
+		checkpointSpec.CRIU.LibDir = ""
+	}
+	ctx := r.Context()
+	result, err := h.checkpointer.Checkpoint(ctx, params, &checkpointSpec)
+	if err != nil {
+		log.Printf("Checkpoint failed: %v", err)
+		writeJSON(w, http.StatusInternalServerError, CheckpointResponse{
+			Success: false,
+			Error:   err.Error(),
+		})
+		return
+	}
+	// Write checkpoint.done marker so restore-entrypoint can detect this checkpoint.
+	// The restore side parses the marker as JSON and only proceeds when
+	// "success" is true, so the marker must be a JSON object rather than a
+	// bare timestamp. The completion time is kept as an extra field.
+	doneMarker := struct {
+		Success     bool   `json:"success"`
+		CompletedAt string `json:"completed_at"`
+	}{
+		Success:     true,
+		CompletedAt: time.Now().Format(time.RFC3339),
+	}
+	checkpointDonePath := filepath.Join(result.CheckpointDir, checkpoint.CheckpointDoneFilename)
+	markerData, err := json.Marshal(doneMarker)
+	if err == nil {
+		err = os.WriteFile(checkpointDonePath, markerData, 0644)
+	}
+	if err != nil {
+		log.Printf("Failed to write checkpoint.done marker: %v", err)
+		writeJSON(w, http.StatusInternalServerError, CheckpointResponse{
+			Success: false,
+			Error:   fmt.Sprintf("Checkpoint succeeded but failed to write done marker: %v", err),
+		})
+		return
+	}
+	log.Printf("Wrote checkpoint.done marker: %s", checkpointDonePath)
+	log.Printf("Checkpoint successful: %s", result.CheckpointID)
+	writeJSON(w, http.StatusOK, CheckpointResponse{
+		Success:      true,
+		CheckpointID: result.CheckpointID,
+		Message:      fmt.Sprintf("Checkpoint created successfully at %s", result.CheckpointDir),
+	})
+}
+// HandleListCheckpoints handles GET /checkpoints requests.
+//
+// It lists the checkpoint IDs under the configured base path, reads each
+// checkpoint's manifest, and replies 200 with a ListCheckpointsResponse.
+// Checkpoints whose manifest cannot be read are logged and left out of the
+// listing. The "checkpoints" field is always a JSON array, never null.
+// A failure to list the base path is answered with 500 and an "error" field;
+// methods other than GET get a plain-text 405.
+func (h *Handlers) HandleListCheckpoints(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodGet {
+		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+		return
+	}
+	checkpointIDs, err := checkpoint.ListCheckpoints(h.cfg.CheckpointSpec.BasePath)
+	if err != nil {
+		writeJSON(w, http.StatusInternalServerError, map[string]string{
+			"error": err.Error(),
+		})
+		return
+	}
+	// Start from a non-nil slice: a nil slice would be encoded as JSON null
+	// when there are no checkpoints.
+	checkpoints := make([]CheckpointInfo, 0, len(checkpointIDs))
+	for _, id := range checkpointIDs {
+		meta, err := checkpoint.ReadCheckpointManifest(filepath.Join(h.cfg.CheckpointSpec.BasePath, id))
+		if err != nil {
+			// Best-effort listing: skip the entry, but leave a trace of why.
+			log.Printf("Skipping checkpoint %q: failed to read manifest: %v", id, err)
+			continue
+		}
+		checkpoints = append(checkpoints, CheckpointInfo{
+			ID:           meta.CheckpointID,
+			CreatedAt:    meta.CreatedAt,
+			SourceNode:   meta.K8s.SourceNode,
+			ContainerID:  meta.K8s.ContainerID,
+			PodName:      meta.K8s.PodName,
+			PodNamespace: meta.K8s.PodNamespace,
+		})
+	}
+	writeJSON(w, http.StatusOK, ListCheckpointsResponse{
+		Checkpoints: checkpoints,
+	})
+}
+// writeJSON sends data as a JSON response with the given HTTP status code.
+// The Content-Type header is set before the status is written. Because the
+// status has already been sent when encoding starts, an encoding or write
+// failure cannot be reported to the client and is logged instead.
+func writeJSON(w http.ResponseWriter, status int, data interface{}) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	if err := json.NewEncoder(w).Encode(data); err != nil {
+		log.Printf("Failed to write JSON response: %v", err)
+	}
+}
--- a/deploy/chrek/pkg/http_api_server/middleware.go
+++ b/deploy/chrek/pkg/http_api_server/middleware.go
+// middleware.go provides HTTP middleware for the server.
+package httpApiServer
+import (
+	"log"
+	"net/http"
+	"time"
+)
+// LoggingMiddleware returns a handler that logs the method and URL path of
+// each request when it arrives, delegates to next, and then logs the same
+// details again together with the time spent serving the request.
+func LoggingMiddleware(next http.Handler) http.Handler {
+	logged := func(w http.ResponseWriter, r *http.Request) {
+		begin := time.Now()
+		log.Printf("Started %s %s", r.Method, r.URL.Path)
+		next.ServeHTTP(w, r)
+		took := time.Since(begin)
+		log.Printf("Completed %s %s in %v", r.Method, r.URL.Path, took)
+	}
+	return http.HandlerFunc(logged)
+}
--- a/deploy/chrek/pkg/http_api_server/server.go
+++ b/deploy/chrek/pkg/http_api_server/server.go
+// server.go provides the HTTP server for the checkpoint agent.
+package httpApiServer
+import (
+	"context"
+	"log"
+	"net/http"
+	"time"
+	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
+)
+// ServerConfig holds the configuration for the HTTP API server.
+type ServerConfig struct {
+	// ListenAddr is the address the HTTP server listens on; it is passed to
+	// http.Server.Addr.
+	ListenAddr     string
+	// NodeName is reported by the health endpoint and passed along with
+	// checkpoint requests.
+	NodeName       string
+	// CheckpointSpec supplies the checkpoint base path and the CRIU timeout
+	// used to size the server's write timeout. It is dereferenced without a
+	// nil check, so it must be non-nil.
+	CheckpointSpec *checkpoint.CheckpointSpec
+}
+// Server is the HTTP API server for checkpoint operations.
+type Server struct {
+	// cfg is the configuration the server was created with.
+	cfg        ServerConfig
+	// handlers holds the request handlers registered on the server's mux.
+	handlers   *Handlers
+	// httpServer is the underlying net/http server driven by Start and Shutdown.
+	httpServer *http.Server
+}
+// NewServer builds a Server that routes /health, /checkpoint and /checkpoints
+// to the checkpoint handlers, wrapped in LoggingMiddleware. The server is not
+// started; call Start to begin serving.
+func NewServer(cfg ServerConfig, checkpointer *checkpoint.Checkpointer) *Server {
+	const (
+		// Extra time on top of the CRIU timeout for pre/post work.
+		writeTimeoutBuffer = 60 * time.Second
+		// Lower bound applied to the write timeout.
+		minWriteTimeout = 300 * time.Second
+	)
+	h := NewHandlers(cfg, checkpointer)
+	// Route table.
+	routes := http.NewServeMux()
+	routes.HandleFunc("/health", h.HandleHealth)
+	routes.HandleFunc("/checkpoint", h.HandleCheckpoint)
+	routes.HandleFunc("/checkpoints", h.HandleListCheckpoints)
+	// /checkpoint blocks until the dump completes, so the write timeout has
+	// to outlast the CRIU checkpoint timeout.
+	writeTimeout := time.Duration(cfg.CheckpointSpec.CRIU.Timeout)*time.Second + writeTimeoutBuffer
+	if writeTimeout < minWriteTimeout {
+		writeTimeout = minWriteTimeout
+	}
+	srv := &Server{cfg: cfg, handlers: h}
+	srv.httpServer = &http.Server{
+		Addr:         cfg.ListenAddr,
+		Handler:      LoggingMiddleware(routes),
+		ReadTimeout:  30 * time.Second,
+		WriteTimeout: writeTimeout,
+		IdleTimeout:  120 * time.Second,
+	}
+	return srv
+}
+// Start logs the listen address and serves HTTP requests on it.
+// The call blocks for as long as the server is running and returns the error
+// reported by ListenAndServe.
+func (s *Server) Start() error {
+	addr := s.cfg.ListenAddr
+	log.Printf("HTTP API server listening on %s", addr)
+	err := s.httpServer.ListenAndServe()
+	return err
+}
+// Shutdown logs that the server is stopping and then asks the underlying
+// http.Server to shut down gracefully, bounded by ctx.
+func (s *Server) Shutdown(ctx context.Context) error {
+	log.Println("Shutting down HTTP server...")
+	err := s.httpServer.Shutdown(ctx)
+	return err
+}
+// Addr reports the configured listen address (ServerConfig.ListenAddr).
+func (s *Server) Addr() string {
+	addr := s.cfg.ListenAddr
+	return addr
+}
--- a/deploy/chrek/pkg/http_api_server/types.go
+++ b/deploy/chrek/pkg/http_api_server/types.go
+// Package httpApiServer provides HTTP server functionality for the checkpoint agent.
+package httpApiServer
+import "time"
+// CheckpointRequest is the request body for checkpoint operations.
+// HandleCheckpoint rejects a request whose container_id is empty and
+// generates a checkpoint_id when none is supplied.
+type CheckpointRequest struct {
+	ContainerID   string `json:"container_id"`
+	ContainerName string `json:"container_name,omitempty"` // K8s container name (for volume type lookup)
+	CheckpointID  string `json:"checkpoint_id"`
+	PodName       string `json:"pod_name,omitempty"`
+	PodNamespace  string `json:"pod_namespace,omitempty"`
+	DisableCUDA   bool   `json:"disable_cuda,omitempty"` // Disable CUDA plugin for non-GPU workloads
+}
+// CheckpointResponse is the response for checkpoint operations.
+// On success HandleCheckpoint sets Success, CheckpointID and Message; on
+// failure it sets Success to false and describes the problem in Error.
+type CheckpointResponse struct {
+	Success      bool   `json:"success"`
+	CheckpointID string `json:"checkpoint_id,omitempty"`
+	Message      string `json:"message,omitempty"`
+	Error        string `json:"error,omitempty"`
+}
+// CheckpointInfo represents information about a checkpoint.
+// HandleListCheckpoints fills it from the checkpoint's manifest: the ID and
+// creation time come from the manifest itself, the remaining fields from its
+// K8s section.
+type CheckpointInfo struct {
+	ID           string    `json:"id"`
+	CreatedAt    time.Time `json:"created_at"`
+	SourceNode   string    `json:"source_node"`
+	ContainerID  string    `json:"container_id"`
+	PodName      string    `json:"pod_name"`
+	PodNamespace string    `json:"pod_namespace"`
+}
+// ListCheckpointsResponse is the response for list checkpoints.
+type ListCheckpointsResponse struct {
+	// Checkpoints holds one entry per checkpoint whose manifest could be read.
+	Checkpoints []CheckpointInfo `json:"checkpoints"`
+}
+// HealthResponse is the response for health check.
+type HealthResponse struct {
+	// Status is set to "healthy" by HandleHealth.
+	Status   string `json:"status"`
+	// NodeName is the node name from the server configuration.
+	NodeName string `json:"node_name"`
+}
--- a/deploy/chrek/pkg/restore/config.go
+++ b/deploy/chrek/pkg/restore/config.go
+// config.go defines the RestoreRequest struct for CRIU restore operations.
+// CRIU options come from the saved CheckpointManifest, not from this request.
+//
+// The restore-entrypoint runs in placeholder containers which do NOT mount the
+// ConfigMap. Static defaults are hardcoded here; per-pod dynamic values come
+// from environment variables injected by the operator.
+package restore
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"strings"
+	"time"
+	"github.com/sirupsen/logrus"
+	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
+)
+const (
+	// RestoreLogFilename is the CRIU restore log filename.
+	RestoreLogFilename = "restore.log"
+	// CRIULogDir is the directory where CRIU restore logs are copied for debugging.
+	CRIULogDir = "/checkpoints/restore-logs"
+	// RestoreTriggerPath is the default path to the trigger file for trigger-based restore.
+	RestoreTriggerPath = "/tmp/restore-trigger"
+)
+// RestoreRequest holds runtime request inputs for the restore entrypoint.
+// CRIU options are NOT stored here - they come from the saved CheckpointManifest.
+type RestoreRequest struct {
+	// === Per-pod dynamic values (from operator-injected env vars) ===
+	// CheckpointPath is the base directory containing checkpoints
+	// (DYN_CHECKPOINT_PATH).
+	CheckpointPath string
+	// CheckpointHash is the ID/hash of the checkpoint to restore
+	// (DYN_CHECKPOINT_HASH).
+	CheckpointHash string
+	// CheckpointLocation is the full resolved path to the checkpoint directory.
+	// It is taken from DYN_CHECKPOINT_LOCATION, or built as
+	// CheckpointPath + "/" + CheckpointHash when that variable is empty and
+	// both parts are set.
+	CheckpointLocation string
+	// SkipWaitForCheckpoint controls the entrypoint behavior.
+	// It is true when SKIP_WAIT_FOR_CHECKPOINT is "1".
+	SkipWaitForCheckpoint bool
+	// ColdStartArgs is the command+args to exec if no checkpoint is available.
+	ColdStartArgs []string
+	// Debug enables debug logging. It is true when DEBUG is "1".
+	Debug bool
+	// === Static defaults (hardcoded) ===
+	// RestoreMarkerFilePath is where restore-entrypoint writes a marker before CRIU restore.
+	// NOTE: despite the section heading this is not hardcoded;
+	// NewRestoreRequest reads it from DYN_RESTORE_MARKER_FILE and fails when
+	// that variable is empty.
+	RestoreMarkerFilePath string
+	// RestoreTrigger is the path to the trigger file that signals restore should start.
+	// NewRestoreRequest sets it to RestoreTriggerPath.
+	RestoreTrigger string
+	// WaitTimeout is the maximum time to wait for a checkpoint.
+	// Zero means wait indefinitely. NewRestoreRequest leaves it at zero.
+	WaitTimeout time.Duration
+}
+// ConfigError reports an invalid or missing configuration value.
+type ConfigError struct {
+	// Field names the offending setting.
+	Field string
+	// Message explains what is wrong with it.
+	Message string
+}
+// Error implements the error interface, formatting the error as
+// "config error: <Field>: <Message>".
+func (e *ConfigError) Error() string {
+	return fmt.Sprint("config error: ", e.Field, ": ", e.Message)
+}
+// NewRestoreRequest assembles a RestoreRequest from the process environment.
+//
+// args becomes ColdStartArgs and RestoreTrigger is set to RestoreTriggerPath.
+// Environment variables read: DYN_CHECKPOINT_PATH, DYN_CHECKPOINT_HASH,
+// DYN_CHECKPOINT_LOCATION, SKIP_WAIT_FOR_CHECKPOINT, DEBUG and
+// DYN_RESTORE_MARKER_FILE. When DYN_CHECKPOINT_LOCATION is empty, the
+// location is derived as "<path>/<hash>" provided both parts are set.
+//
+// It returns a *ConfigError when DYN_RESTORE_MARKER_FILE is empty.
+func NewRestoreRequest(args []string) (*RestoreRequest, error) {
+	req := &RestoreRequest{
+		CheckpointPath:        os.Getenv("DYN_CHECKPOINT_PATH"),
+		CheckpointHash:        os.Getenv("DYN_CHECKPOINT_HASH"),
+		CheckpointLocation:    os.Getenv("DYN_CHECKPOINT_LOCATION"),
+		SkipWaitForCheckpoint: os.Getenv("SKIP_WAIT_FOR_CHECKPOINT") == "1",
+		ColdStartArgs:         args,
+		Debug:                 os.Getenv("DEBUG") == "1",
+		RestoreMarkerFilePath: os.Getenv("DYN_RESTORE_MARKER_FILE"),
+		RestoreTrigger:        RestoreTriggerPath,
+	}
+	// No explicit location: fall back to base path plus hash when both are known.
+	if req.CheckpointLocation == "" && req.CheckpointPath != "" && req.CheckpointHash != "" {
+		req.CheckpointLocation = req.CheckpointPath + "/" + req.CheckpointHash
+	}
+	if req.RestoreMarkerFilePath != "" {
+		return req, nil
+	}
+	return nil, &ConfigError{
+		Field:   "DYN_RESTORE_MARKER_FILE",
+		Message: "must be set",
+	}
+}
+// checkpointDoneMarker is the JSON document expected in the checkpoint.done
+// marker file. checkpointDoneSucceeded treats a checkpoint as usable only
+// when Success is true.
+type checkpointDoneMarker struct {
+	// Success reports whether the checkpoint completed successfully.
+	Success bool   `json:"success"`
+	// Error optionally carries the failure reason; it is logged when Success is false.
+	Error   string `json:"error,omitempty"`
+}
+// checkpointDoneSucceeded reports whether the checkpoint.done marker at
+// donePath can be read, parses as JSON, and declares success. Every other
+// outcome is logged as a warning and yields false.
+func checkpointDoneSucceeded(donePath string, log *logrus.Entry) bool {
+	raw, readErr := os.ReadFile(donePath)
+	if readErr != nil {
+		log.WithError(readErr).WithField("path", donePath).Warn("Failed to read checkpoint.done marker")
+		return false
+	}
+	var done checkpointDoneMarker
+	if parseErr := json.Unmarshal(raw, &done); parseErr != nil {
+		log.WithError(parseErr).WithField("path", donePath).Warn("Failed to parse checkpoint.done marker")
+		return false
+	}
+	if done.Success {
+		return true
+	}
+	// Failed checkpoint: include the recorded reason when there is one.
+	details := logrus.Fields{"path": donePath}
+	if reason := done.Error; reason != "" {
+		details["error"] = reason
+	}
+	log.WithFields(details).Warn("checkpoint.done marker reports failed checkpoint")
+	return false
+}
+// ShouldRestore checks if a restore should be performed.
+// Returns the checkpoint path and true if restore should proceed.
+//
+// Two sources are consulted, in order:
+//  1. cfg.CheckpointLocation, when its checkpoint.done marker exists and
+//     reports success.
+//  2. The path stored in the cfg.RestoreTrigger file, under the same
+//     marker condition.
+//
+// If neither yields a successfully completed checkpoint it returns "", false.
+func ShouldRestore(cfg *RestoreRequest, log *logrus.Entry) (string, bool) {
+	// Method 1: Checkpoint location is set and checkpoint is fully complete
+	if cfg.CheckpointLocation != "" {
+		donePath := cfg.CheckpointLocation + "/" + checkpoint.CheckpointDoneFilename
+		if _, err := os.Stat(donePath); err == nil {
+			if checkpointDoneSucceeded(donePath, log) {
+				log.WithField("path", cfg.CheckpointLocation).Info("Checkpoint found (checkpoint.done success=true)")
+				return cfg.CheckpointLocation, true
+			}
+			// The marker exists but is unreadable, malformed or reports a
+			// failure; checkpointDoneSucceeded has already logged the reason,
+			// so do not additionally claim that the marker is missing.
+		} else {
+			// Marker could not be found: if the manifest is already there,
+			// the checkpoint is probably still being written.
+			manifestPath := cfg.CheckpointLocation + "/" + checkpoint.CheckpointManifestFilename
+			if _, err := os.Stat(manifestPath); err == nil {
+				log.WithFields(logrus.Fields{
+					"path":    cfg.CheckpointLocation,
+					"warning": "checkpoint.done marker not found, checkpoint may be incomplete",
+				}).Warn("Checkpoint manifest found but checkpoint.done missing - checkpoint may still be in progress")
+			}
+		}
+	}
+	// Method 2: Restore trigger file exists with checkpoint path
+	if cfg.RestoreTrigger != "" {
+		data, err := os.ReadFile(cfg.RestoreTrigger)
+		if err == nil {
+			checkpointPath := strings.TrimSpace(string(data))
+			if checkpointPath != "" {
+				donePath := checkpointPath + "/" + checkpoint.CheckpointDoneFilename
+				if _, err := os.Stat(donePath); err == nil {
+					if checkpointDoneSucceeded(donePath, log) {
+						log.WithField("path", checkpointPath).Info("Restore triggered via file (checkpoint.done success=true)")
+						return checkpointPath, true
+					}
+				}
+			}
+		}
+	}
+	return "", false
+}
+// WaitForCheckpoint polls once per second until ShouldRestore reports a
+// usable checkpoint and returns that checkpoint's path.
+//
+// It gives up with ctx.Err() when ctx is cancelled, and with a timeout error
+// once cfg.WaitTimeout has elapsed. A zero cfg.WaitTimeout disables the
+// deadline, so only ctx can end the wait. A progress message is logged
+// every 30 seconds.
+func WaitForCheckpoint(ctx context.Context, cfg *RestoreRequest, log *logrus.Entry) (string, error) {
+	const progressInterval = 30 * time.Second
+	switch {
+	case cfg.WaitTimeout > 0:
+		log.WithField("timeout", cfg.WaitTimeout).Info("Waiting for checkpoint")
+	default:
+		log.Info("Waiting for checkpoint indefinitely")
+	}
+	began := time.Now()
+	poll := time.NewTicker(time.Second)
+	defer poll.Stop()
+	lastProgress := time.Now()
+	for {
+		select {
+		case <-ctx.Done():
+			return "", ctx.Err()
+		case <-poll.C:
+		}
+		// One poll interval has passed: look for a checkpoint.
+		if found, ready := ShouldRestore(cfg, log); ready {
+			return found, nil
+		}
+		// Periodic progress report.
+		if time.Since(lastProgress) >= progressInterval {
+			waited := time.Since(began)
+			log.WithField("elapsed", waited).Info("Still waiting for checkpoint...")
+			lastProgress = time.Now()
+		}
+		// The deadline applies only when a timeout was configured.
+		if cfg.WaitTimeout > 0 && time.Since(began) >= cfg.WaitTimeout {
+			return "", fmt.Errorf("timed out waiting for checkpoint after %s", cfg.WaitTimeout)
+		}
+	}
+}
--- a/deploy/chrek/pkg/restore/criu.go
+++ b/deploy/chrek/pkg/restore/criu.go
@@ -12,28 +12,42 @@ import (
 	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
 )
-// CRIURestoreConfig holds configuration for CRIU restore operations.
+// CRIURestorePlan holds configuration for CRIU restore operations.
-// Most options are always-on with safe defaults for K8s environments.
+// Most fields come from the saved CheckpointManifest.CRIUDump.CRIU settings.
-type CRIURestoreConfig struct {
+type CRIURestorePlan struct {
-	ImageDirFD   int32
+	// File descriptors
-	RootPath     string
+	ImageDirFD int32
-	LogLevel     int32
+	WorkDirFD  int32
-	LogFile      string
+	NetNsFD    int32
-	WorkDirFD    int32
-	NetNsFD      int32
+	// Paths
+	RootPath string
+	LogFile  string
+	// Options from CheckpointManifest.CRIUDump.CRIU.
+	LogLevel          int32
+	Timeout           uint32 // CRIU timeout in seconds (0 = no timeout, required for CUDA)
+	ShellJob          bool   // Allow session leaders (containers are often session leaders)
+	TcpClose          bool   // Close TCP connections (pod IPs change on restore)
+	FileLocks         bool   // Allow file locks
+	ExtUnixSk         bool   // Allow external Unix sockets
+	LinkRemap         bool   // Handle deleted-but-open files via CRIU link remap
+	ManageCgroupsMode string // Cgroup handling mode: "ignore" lets K8s manage cgroups
+	// External mount mappings (from CheckpointManifest.CRIUDump.ExtMnt).
 	ExtMountMaps []*criurpc.ExtMountMap
 }
 // OpenImageDir opens a checkpoint directory and clears CLOEXEC for CRIU.
 // Returns the opened file and its FD. Caller must close the file when done.
 func OpenImageDir(checkpointPath string) (*os.File, int32, error) {
-	return common.OpenDirForCRIU(checkpointPath)
+	return common.OpenPathForCRIU(checkpointPath)
 }
 // OpenNetworkNamespace opens the target network namespace for restore.
 // Returns the opened file and its FD. Caller must close the file when done.
 func OpenNetworkNamespace(nsPath string) (*os.File, int32, error) {
-	return common.OpenDirForCRIU(nsPath)
+	return common.OpenPathForCRIU(nsPath)
 }
 // OpenWorkDir opens a work directory for CRIU and clears CLOEXEC.
@@ -65,62 +79,87 @@ func OpenWorkDir(workDir string, log *logrus.Entry) (*os.File, int32) {
 	return workDirFile, int32(workDirFile.Fd())
 }
-// BuildRestoreCRIUOpts creates CRIU options for restore from a config struct.
+// BuildCRIURestoreOptions creates CRIU options for restore from a runtime plan.
 //
-// Always-on options for K8s:
+// Options from CheckpointManifest.CRIUDump.CRIU (saved at checkpoint time):
-//   - ShellJob: containers are often session leaders
+//   - ShellJob, TcpClose, FileLocks, ExtUnixSk, LinkRemap, ManageCgroupsMode
-//   - TcpClose: pod IPs change on restore/migration
+//
-//   - FileLocks: applications use file locks
+// Hardcoded restore-specific options:
-//   - ExtUnixSk: containers have external Unix sockets
+//   - RstSibling: restore in detached mode
-//   - ManageCgroups (IGNORE): let K8s manage cgroups
+//   - MntnsCompatMode: cross-container restore
-func BuildRestoreCRIUOpts(cfg CRIURestoreConfig) *criurpc.CriuOpts {
+//   - EvasiveDevices, ForceIrmap: device/inode handling
-	cgMode := criurpc.CriuCgMode_IGNORE
+func BuildCRIURestoreOptions(plan CRIURestorePlan) *criurpc.CriuOpts {
+	// Map cgroup management mode from plan.
+	var cgMode criurpc.CriuCgMode
+	switch plan.ManageCgroupsMode {
+	case "soft":
+		cgMode = criurpc.CriuCgMode_SOFT
+	case "full":
+		cgMode = criurpc.CriuCgMode_FULL
+	case "strict":
+		cgMode = criurpc.CriuCgMode_STRICT
+	case "ignore", "":
+		cgMode = criurpc.CriuCgMode_IGNORE
+	default:
+		cgMode = criurpc.CriuCgMode_IGNORE
+	}
 	criuOpts := &criurpc.CriuOpts{
-		ImagesDirFd: proto.Int32(cfg.ImageDirFD),
+		ImagesDirFd: proto.Int32(plan.ImageDirFD),
-		LogLevel:    proto.Int32(cfg.LogLevel),
+		LogLevel:    proto.Int32(plan.LogLevel),
-		LogFile:     proto.String(cfg.LogFile),
+		LogFile:     proto.String(plan.LogFile),
 		// Root filesystem - use current container's root
-		Root: proto.String(cfg.RootPath),
+		Root: proto.String(plan.RootPath),
-		// Restore in detached mode - process runs in background
+		// Restore in detached mode - process runs in background (restore-specific)
 		RstSibling: proto.Bool(true),
-		// Mount namespace compatibility mode for cross-container restore
+		// Mount namespace mode:
-		MntnsCompatMode: proto.Bool(true),
+		// - MntnsCompatMode=false (default): Uses mount-v2 with MOVE_MOUNT_SET_GROUP (kernel 5.15+)
+		//   This is preferred as it doesn't create temp dirs in /tmp
-		// Always-on for K8s environments
+		// - MntnsCompatMode=true: Uses compat mode which creates /tmp/cr-tmpfs.XXX
-		ShellJob:  proto.Bool(true),
+		//   This can cause "Device or resource busy" errors on cleanup
-		TcpClose:  proto.Bool(true),
+		// We explicitly set to false to use mount-v2 (requires kernel 5.15+)
-		FileLocks: proto.Bool(true),
+		MntnsCompatMode: proto.Bool(false),
-		ExtUnixSk: proto.Bool(true),
+		// Options from saved CheckpointManifest.CRIUDump.CRIU.
-		// Cgroup management - ignore to avoid conflicts
+		ShellJob:  proto.Bool(plan.ShellJob),
+		TcpClose:  proto.Bool(plan.TcpClose),
+		FileLocks: proto.Bool(plan.FileLocks),
+		ExtUnixSk: proto.Bool(plan.ExtUnixSk),
+		LinkRemap: proto.Bool(plan.LinkRemap),
+		// Cgroup management from saved settings.
 		ManageCgroups:     proto.Bool(true),
 		ManageCgroupsMode: &cgMode,
-		// Device and inode handling
+		// Device and inode handling (restore-specific)
 		EvasiveDevices: proto.Bool(true),
 		ForceIrmap:     proto.Bool(true),
 		// External mount mappings
-		ExtMnt: cfg.ExtMountMaps,
+		ExtMnt: plan.ExtMountMaps,
 	}
 	// Add network namespace inheritance if provided
-	if cfg.NetNsFD >= 0 {
+	if plan.NetNsFD >= 0 {
 		criuOpts.InheritFd = []*criurpc.InheritFd{
 			{
 				Key: proto.String("extNetNs"),
-				Fd:  proto.Int32(cfg.NetNsFD),
+				Fd:  proto.Int32(plan.NetNsFD),
 			},
 		}
 	}
 	// Add work directory if specified
-	if cfg.WorkDirFD >= 0 {
+	if plan.WorkDirFD >= 0 {
-		criuOpts.WorkDirFd = proto.Int32(cfg.WorkDirFD)
+		criuOpts.WorkDirFd = proto.Int32(plan.WorkDirFD)
+	}
+	// Add timeout if specified (required for CUDA restores)
+	if plan.Timeout > 0 {
+		criuOpts.Timeout = proto.Uint32(plan.Timeout)
 	}
 	return criuOpts

--- a/deploy/chrek/pkg/restore/filesystem.go
+++ b/deploy/chrek/pkg/restore/filesystem.go
@@ -8,19 +8,14 @@ import (
 	"path/filepath"
 	"github.com/sirupsen/logrus"
-)
-const (
+	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
-	// RootfsDiffFilename is the name of the rootfs diff tar file
-	RootfsDiffFilename = "rootfs-diff.tar"
-	// DeletedFilesFilename is the name of the deleted files JSON
-	DeletedFilesFilename = "deleted-files.json"
 )
 // ApplyRootfsDiff extracts the rootfs-diff.tar from the checkpoint to the target root.
 // This restores filesystem changes that were made in the original container.
 func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error {
-	rootfsDiffPath := filepath.Join(checkpointPath, RootfsDiffFilename)
+	rootfsDiffPath := filepath.Join(checkpointPath, checkpoint.RootfsDiffFilename)
 	// Check if rootfs-diff.tar exists
 	if _, err := os.Stat(rootfsDiffPath); os.IsNotExist(err) {
@@ -30,15 +25,10 @@ func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error
 	log.WithField("path", rootfsDiffPath).Info("Applying rootfs diff")
-	// Build tar command with options to handle conflicts:
+	// Exclusions are already applied at checkpoint time (bind mounts, system dirs, etc.)
-	// --keep-old-files: Don't overwrite existing files (may already be mounted)
+	// so we just extract with --keep-old-files to avoid overwriting existing files.
-	// Exclude paths that are typically mounted read-only by the container runtime
 	cmd := exec.Command("tar",
 		"--keep-old-files",
-		"--exclude=./run/secrets",
-		"--exclude=./etc/resolv.conf",
-		"--exclude=./etc/hostname",
-		"--exclude=./etc/hosts",
 		"-C", targetRoot,
 		"-xf", rootfsDiffPath,
 	)
@@ -61,7 +51,7 @@ func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error
 // ApplyDeletedFiles removes files that were deleted in the original container.
 // These are tracked via overlay whiteout markers (.wh.<filename>).
 func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) error {
-	deletedFilesPath := filepath.Join(checkpointPath, DeletedFilesFilename)
+	deletedFilesPath := filepath.Join(checkpointPath, checkpoint.DeletedFilesFilename)
 	// Check if deleted-files.json exists
 	data, err := os.ReadFile(deletedFilesPath)
@@ -109,8 +99,5 @@ func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) err
 func CheckpointFilesExist(checkpointPath string) bool {
 	// Check for CRIU image files (core-*.img is always present)
 	matches, err := filepath.Glob(filepath.Join(checkpointPath, "core-*.img"))
-	if err != nil || len(matches) == 0 {
+	return err == nil && len(matches) > 0
-		return false
-	}
-	return true
 }
--- a/deploy/chrek/pkg/restore/link_remap.go
+++ b/deploy/chrek/pkg/restore/link_remap.go
+// Package restore provides CRIU restore operations.
+package restore
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+	"github.com/checkpoint-restore/go-criu/v7/crit"
+	"github.com/checkpoint-restore/go-criu/v7/crit/images/fdinfo"
+	"github.com/checkpoint-restore/go-criu/v7/crit/images/regfile"
+	remap_file_path "github.com/checkpoint-restore/go-criu/v7/crit/images/remap-file-path"
+	"github.com/sirupsen/logrus"
+	"google.golang.org/protobuf/proto"
+)
+// CreateLinkRemapStubs recreates the link_remap placeholder files that CRIU
+// expects to find on the local filesystem during restore.
+//
+// Background: a file that was unlink()'d while a process still held an open FD
+// is handled by CRIU through "link remapping":
+//
+//   - During dump: CRIU creates a hardlink link_remap.<id> -> original_file
+//   - During restore: CRIU does linkat(link_remap.<id>, original_path) to recreate it
+//
+// The link_remap file only exists on the node where the dump was taken. When
+// restoring elsewhere, stub files have to be created so CRIU has something to
+// hardlink from; otherwise it fails with:
+//
+//	"Can't link <path>/link_remap.X -> <path>/original: No such file or directory"
+//
+// The remap records come from remap-fpath.img in checkpointPath and are
+// resolved to paths and modes through reg-files.img, falling back to files.img.
+// A missing remap-fpath.img or an empty record list is not an error. Failing to
+// create an individual stub is logged and skipped; an error is returned only
+// when remap-fpath.img cannot be parsed or neither file image can be read.
+func CreateLinkRemapStubs(checkpointPath string, log *logrus.Entry) error {
+	// Step 1: collect the remap records written at dump time.
+	remaps, err := parseRemapFpath(filepath.Join(checkpointPath, "remap-fpath.img"))
+	switch {
+	case err != nil && os.IsNotExist(err):
+		log.Debug("No remap-fpath.img found, no link_remap stubs needed")
+		return nil
+	case err != nil:
+		return fmt.Errorf("failed to parse remap-fpath.img: %w", err)
+	case len(remaps) == 0:
+		log.Debug("No file remaps found in checkpoint")
+		return nil
+	}
+	// Step 2: build the ID -> fileInfo table. reg-files.img (older CRIU format)
+	// is tried first, files.img (newer format) second.
+	fileMap, parseErr := parseRegFilesWithMode(filepath.Join(checkpointPath, "reg-files.img"))
+	if parseErr != nil {
+		log.WithError(parseErr).Debug("Could not parse reg-files.img, trying files.img")
+		fileMap, parseErr = parseFilesImgWithMode(filepath.Join(checkpointPath, "files.img"))
+	}
+	if parseErr != nil {
+		log.WithError(parseErr).WithField("remap_count", len(remaps)).Warn(
+			"Found remap entries but could not parse reg-files.img or files.img — link_remap stubs will not be created")
+		return fmt.Errorf("found %d remap entries but could not build file map: %w", len(remaps), parseErr)
+	}
+	// Step 3: create one stub per remap record that can be resolved.
+	var created []string
+	for _, remap := range remaps {
+		origInfo, found := fileMap[remap.origID]
+		if !found {
+			log.WithField("orig_id", remap.origID).Debug("Original file ID not found in file map, skipping")
+			continue
+		}
+		// The stub is the link_remap.XXX file that CRIU will hardlink FROM.
+		var stubPath string
+		var stubMode os.FileMode
+		if remapInfo, known := fileMap[remap.remapID]; known {
+			stubPath, stubMode = remapInfo.name, remapInfo.mode
+		} else {
+			// The remap ID is absent from the file map: derive the path as
+			// link_remap.<remap_id> next to the original file and borrow the
+			// original file's mode.
+			parent := filepath.Dir(origInfo.name)
+			if !strings.HasPrefix(parent, "/") {
+				parent = "/" + parent
+			}
+			stubPath = filepath.Join(parent, fmt.Sprintf("link_remap.%d", remap.remapID))
+			stubMode = origInfo.mode
+			log.WithFields(logrus.Fields{
+				"orig_id":    remap.origID,
+				"remap_id":   remap.remapID,
+				"orig_path":  origInfo.name,
+				"remap_path": stubPath,
+				"mode":       fmt.Sprintf("%04o", stubMode),
+			}).Debug("Constructed link_remap path from remap ID")
+		}
+		// Image paths may be recorded without a leading slash; make them absolute.
+		if !strings.HasPrefix(stubPath, "/") {
+			stubPath = "/" + stubPath
+		}
+		// Leave an already-present file untouched.
+		if _, statErr := os.Stat(stubPath); statErr == nil {
+			log.WithField("remap_file", stubPath).Debug("Link remap file already exists")
+			continue
+		}
+		stubFields := logrus.Fields{
+			"remap_file": stubPath,
+			"target":     origInfo.name,
+			"mode":       fmt.Sprintf("%04o", stubMode),
+		}
+		if err := createLinkRemapStub(stubPath, stubMode); err != nil {
+			log.WithError(err).WithFields(stubFields).Warn("Failed to create link_remap stub")
+			continue
+		}
+		created = append(created, filepath.Base(stubPath))
+		log.WithFields(stubFields).Debug("Created link_remap stub file")
+	}
+	if len(created) == 0 {
+		log.Debug("No link_remap stubs needed")
+		return nil
+	}
+	log.WithFields(logrus.Fields{
+		"count":       len(created),
+		"remap_files": created,
+	}).Info("Created link_remap stub files for CRIU restore")
+	return nil
+}
+// fileInfo holds file metadata from CRIU checkpoint images
+type fileInfo struct {
+	// name is the file path exactly as recorded in the image entry.
+	name string
+	// mode carries permission bits only: the parsers in this file mask the
+	// recorded mode with 0777 and substitute 0600 when the result is zero.
+	mode os.FileMode
+}
+// remapEntry represents a file remap entry from CRIU
+type remapEntry struct {
+	// origID is the file ID of the original (remapped) file.
+	origID uint32
+	// remapID is the ID looked up to find the link_remap file; it is also used
+	// to build a link_remap.<remapID> name when that lookup fails.
+	remapID uint32
+	// remapType is the numeric remap type copied from the image entry.
+	// NOTE(review): nothing in the visible code reads this field — confirm
+	// whether stubs should be limited to a specific remap type.
+	remapType int32
+}
+// parseRemapFpath reads remap-fpath.img and returns one remapEntry per record.
+//
+// After the magic header (validated through crit.ReadMagic) the file is read
+// as a sequence of records, each a 4-byte little-endian length followed by
+// that many bytes of a serialized RemapFilePathEntry.
+//
+// The error from os.Open is returned unwrapped so callers can test it with
+// os.IsNotExist. A file that ends partway through a length prefix or a record
+// body is reported as an error rather than being treated as a clean end of
+// file.
+func parseRemapFpath(path string) ([]remapEntry, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	// Read and validate magic number using go-criu's ReadMagic
+	magic, err := crit.ReadMagic(f)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read magic: %w", err)
+	}
+	if magic != "REMAP_FPATH" {
+		return nil, fmt.Errorf("unexpected magic: %s (expected REMAP_FPATH)", magic)
+	}
+	var entries []remapEntry
+	sizeBuf := make([]byte, 4)
+	for {
+		// io.ReadFull reports io.EOF only when no bytes were read, i.e. the
+		// file ended exactly on a record boundary. io.ErrUnexpectedEOF means
+		// the length prefix itself was cut short, so the image is truncated.
+		if _, err := io.ReadFull(f, sizeBuf); err != nil {
+			if err == io.EOF {
+				break
+			}
+			return nil, fmt.Errorf("failed to read entry size: %w", err)
+		}
+		entrySize := binary.LittleEndian.Uint32(sizeBuf)
+		entryBuf := make([]byte, entrySize)
+		if _, err := io.ReadFull(f, entryBuf); err != nil {
+			return nil, fmt.Errorf("failed to read entry data: %w", err)
+		}
+		// Parse protobuf
+		entry := &remap_file_path.RemapFilePathEntry{}
+		if err := proto.Unmarshal(entryBuf, entry); err != nil {
+			return nil, fmt.Errorf("failed to unmarshal entry: %w", err)
+		}
+		entries = append(entries, remapEntry{
+			origID:    entry.GetOrigId(),
+			remapID:   entry.GetRemapId(),
+			remapType: int32(entry.GetRemapType()),
+		})
+	}
+	return entries, nil
+}
+// parseRegFilesWithMode reads reg-files.img and returns a map from file ID to
+// fileInfo (recorded name plus permission bits).
+//
+// After the magic header (validated through crit.ReadMagic) the file is read
+// as a sequence of records, each a 4-byte little-endian length followed by
+// that many bytes of a serialized RegFileEntry.
+//
+// The error from os.Open is returned unwrapped. A file that ends partway
+// through a length prefix or a record body is reported as an error rather than
+// being treated as a clean end of file, so a truncated image cannot yield a
+// silently incomplete map.
+func parseRegFilesWithMode(path string) (map[uint32]fileInfo, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	// Read and validate magic number using go-criu's ReadMagic
+	magic, err := crit.ReadMagic(f)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read magic: %w", err)
+	}
+	if magic != "REG_FILES" {
+		return nil, fmt.Errorf("unexpected magic: %s (expected REG_FILES)", magic)
+	}
+	fileMap := make(map[uint32]fileInfo)
+	sizeBuf := make([]byte, 4)
+	for {
+		// io.ReadFull reports io.EOF only when no bytes were read, i.e. the
+		// file ended exactly on a record boundary. io.ErrUnexpectedEOF means
+		// the length prefix itself was cut short, so the image is truncated.
+		if _, err := io.ReadFull(f, sizeBuf); err != nil {
+			if err == io.EOF {
+				break
+			}
+			return nil, fmt.Errorf("failed to read entry size: %w", err)
+		}
+		entrySize := binary.LittleEndian.Uint32(sizeBuf)
+		entryBuf := make([]byte, entrySize)
+		if _, err := io.ReadFull(f, entryBuf); err != nil {
+			return nil, fmt.Errorf("failed to read entry data: %w", err)
+		}
+		// Parse protobuf
+		entry := &regfile.RegFileEntry{}
+		if err := proto.Unmarshal(entryBuf, entry); err != nil {
+			return nil, fmt.Errorf("failed to unmarshal entry: %w", err)
+		}
+		// Keep only the permission bits of the recorded mode; the file type
+		// bits are masked off.
+		mode := os.FileMode(entry.GetMode() & 0777)
+		if mode == 0 {
+			mode = 0600 // Default to owner read/write if mode not set
+		}
+		fileMap[entry.GetId()] = fileInfo{
+			name: entry.GetName(),
+			mode: mode,
+		}
+	}
+	return fileMap, nil
+}
+// parseFilesImgWithMode reads files.img and returns a map from file ID to
+// fileInfo. This is the newer CRIU format where file info is embedded in
+// FileEntry messages; entries without an embedded regular-file record are
+// skipped.
+//
+// After the magic header (validated through crit.ReadMagic) the file is read
+// as a sequence of records, each a 4-byte little-endian length followed by
+// that many bytes of a serialized FileEntry.
+//
+// The error from os.Open is returned unwrapped. A file that ends partway
+// through a length prefix or a record body is reported as an error rather than
+// being treated as a clean end of file, so a truncated image cannot yield a
+// silently incomplete map.
+func parseFilesImgWithMode(path string) (map[uint32]fileInfo, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	// Read and validate magic number using go-criu's ReadMagic
+	magic, err := crit.ReadMagic(f)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read magic: %w", err)
+	}
+	if magic != "FILES" {
+		return nil, fmt.Errorf("unexpected magic: %s (expected FILES)", magic)
+	}
+	fileMap := make(map[uint32]fileInfo)
+	sizeBuf := make([]byte, 4)
+	for {
+		// io.ReadFull reports io.EOF only when no bytes were read, i.e. the
+		// file ended exactly on a record boundary. io.ErrUnexpectedEOF means
+		// the length prefix itself was cut short, so the image is truncated.
+		if _, err := io.ReadFull(f, sizeBuf); err != nil {
+			if err == io.EOF {
+				break
+			}
+			return nil, fmt.Errorf("failed to read entry size: %w", err)
+		}
+		entrySize := binary.LittleEndian.Uint32(sizeBuf)
+		entryBuf := make([]byte, entrySize)
+		if _, err := io.ReadFull(f, entryBuf); err != nil {
+			return nil, fmt.Errorf("failed to read entry data: %w", err)
+		}
+		// Parse protobuf as FileEntry
+		entry := &fdinfo.FileEntry{}
+		if err := proto.Unmarshal(entryBuf, entry); err != nil {
+			return nil, fmt.Errorf("failed to unmarshal entry: %w", err)
+		}
+		// Only entries carrying an embedded RegFileEntry contribute to the map.
+		reg := entry.GetReg()
+		if reg == nil {
+			continue
+		}
+		// Keep only the permission bits of the recorded mode.
+		mode := os.FileMode(reg.GetMode() & 0777)
+		if mode == 0 {
+			mode = 0600 // Default to owner read/write if mode not set
+		}
+		fileMap[entry.GetId()] = fileInfo{
+			name: reg.GetName(),
+			mode: mode,
+		}
+	}
+	return fileMap, nil
+}
+// createLinkRemapStub creates a stub file at path for CRIU link_remap.
+//
+// Missing parent directories are created with mode 0755. The file is created
+// (or truncated if present), filled with 32 zero bytes, and then given exactly
+// the permission bits in mode.
+//
+// The permission passed to os.OpenFile is filtered by the process umask, so it
+// cannot be relied on to produce the requested mode; an explicit Chmod on the
+// open file applies it regardless of umask. An error is returned if the
+// directory, file creation, write, chmod or close fails.
+func createLinkRemapStub(path string, mode os.FileMode) error {
+	// Ensure parent directory exists
+	dir := filepath.Dir(path)
+	if err := os.MkdirAll(dir, 0755); err != nil {
+		return fmt.Errorf("failed to create directory %s: %w", dir, err)
+	}
+	// Create file with the specified mode
+	// CRIU validates the file mode matches what was recorded at checkpoint time
+	f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
+	if err != nil {
+		return fmt.Errorf("failed to create file: %w", err)
+	}
+	// Write 32 bytes of zeros as stub content
+	// This provides a minimal valid file for CRIU to hardlink from
+	stub := make([]byte, 32)
+	if _, err := f.Write(stub); err != nil {
+		f.Close()
+		return fmt.Errorf("failed to write stub data: %w", err)
+	}
+	// Apply the exact mode after creation so the umask cannot strip bits.
+	if err := f.Chmod(mode); err != nil {
+		f.Close()
+		return fmt.Errorf("failed to set mode %04o on %s: %w", mode, path, err)
+	}
+	// Surface close errors: a failed close can mean the data never reached disk.
+	if err := f.Close(); err != nil {
+		return fmt.Errorf("failed to close %s: %w", path, err)
+	}
+	return nil
+}
--- a/deploy/chrek/pkg/restore/mounts.go
+++ b/deploy/chrek/pkg/restore/mounts.go
@@ -6,81 +6,44 @@ import (
 	criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
 	"google.golang.org/protobuf/proto"
-	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
+	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
 )
 // GenerateExtMountMaps generates external mount mappings for CRIU restore.
-// It parses /proc/1/mountinfo (the restore container's mounts) and adds
+// It reuses the exact dump-time ext-mount plan persisted in checkpoint manifest.
-// mappings for all mount points plus masked/readonly paths from common.
+func GenerateExtMountMaps(data *checkpoint.CheckpointManifest) ([]*criurpc.ExtMountMap, error) {
-//
+	if data == nil {
-// If meta is nil or doesn't have OCI-derived paths, falls back to defaults.
+		return nil, fmt.Errorf("checkpoint manifest is required")
-func GenerateExtMountMaps(meta *common.CheckpointMetadata) ([]*criurpc.ExtMountMap, error) {
+	}
-	var maps []*criurpc.ExtMountMap
+	if len(data.CRIUDump.ExtMnt) == 0 {
-	addedMounts := make(map[string]bool)
+		return nil, fmt.Errorf("checkpoint manifest is missing criuDump.extMnt")
+	}
-	// Add root filesystem mapping first
+	maps := []*criurpc.ExtMountMap{{
-	maps = append(maps, &criurpc.ExtMountMap{
 		Key: proto.String("/"),
 		Val: proto.String("."),
-	})
+	}}
-	addedMounts["/"] = true
+	addedMounts := map[string]struct{}{"/": {}}
-	// Parse /proc/1/mountinfo for all current mount points
+	// Replay dump-time ext-mount plan exactly, with restore-specific root remap.
-	mountPoints, err := common.GetMountPointPaths("/proc/1/mountinfo")
+	for _, mount := range data.CRIUDump.ExtMnt {
-	if err != nil {
+		key := mount.Key
-		return nil, fmt.Errorf("failed to parse mountinfo: %w", err)
+		if key == "" || key == "/" {
-	}
-	for _, mountPoint := range mountPoints {
-		if addedMounts[mountPoint] || mountPoint == "/" {
 			continue
 		}
-		maps = append(maps, &criurpc.ExtMountMap{
+		if _, exists := addedMounts[key]; exists {
-			Key: proto.String(mountPoint),
-			Val: proto.String(mountPoint),
-		})
-		addedMounts[mountPoint] = true
-	}
-	// Use masked paths from checkpoint metadata (OCI spec derived)
-	// Fall back to defaults for backwards compatibility
-	maskedPaths := common.DefaultMaskedPaths()
-	if meta != nil && len(meta.MaskedPaths) > 0 {
-		maskedPaths = meta.MaskedPaths
-	}
-	for _, path := range maskedPaths {
-		if addedMounts[path] {
 			continue
 		}
+		val := mount.Val
+		if val == "" {
+			val = key
+		}
 		maps = append(maps, &criurpc.ExtMountMap{
-			Key: proto.String(path),
+			Key: proto.String(key),
-			Val: proto.String(path),
+			Val: proto.String(val),
 		})
-		addedMounts[path] = true
+		addedMounts[key] = struct{}{}
-	}
-	// Also add readonly paths from metadata if available
-	if meta != nil {
-		for _, path := range meta.ReadonlyPaths {
-			if addedMounts[path] {
-				continue
-			}
-			maps = append(maps, &criurpc.ExtMountMap{
-				Key: proto.String(path),
-				Val: proto.String(path),
-			})
-			addedMounts[path] = true
-		}
 	}
 	return maps, nil
 }
-// AddExtMountMap is a helper to create a single ExtMountMap entry.
-func AddExtMountMap(key, val string) *criurpc.ExtMountMap {
-	return &criurpc.ExtMountMap{
-		Key: proto.String(key),
-		Val: proto.String(val),
-	}
-}
--- a/deploy/chrek/pkg/restore/options.go
+++ b/deploy/chrek/pkg/restore/options.go
-// Package restore provides CRIU restore operations for self-restoring placeholder containers.
-package restore
-import (
-	"context"
-	"os"
-	"strconv"
-	"time"
-	criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
-	"github.com/sirupsen/logrus"
-	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
-)
-// Config holds the configuration for the restore entrypoint.
-// These values are typically set via environment variables.
-type Config struct {
-	// CheckpointPath is the base directory containing checkpoints (default: /checkpoints)
-	// Env: DYN_CHECKPOINT_PATH
-	CheckpointPath string
-	// CheckpointHash is the ID/hash of the checkpoint to restore
-	// Env: DYN_CHECKPOINT_HASH
-	CheckpointHash string
-	// RestoreTrigger is the path to the trigger file that signals restore should start
-	RestoreTrigger string
-	// WaitForCheckpoint indicates whether to wait for a checkpoint to appear
-	WaitForCheckpoint bool
-	// WaitTimeout is the maximum time to wait for a checkpoint to become available
-	WaitTimeout time.Duration
-	// CRIULogLevel is the CRIU verbosity level (0-4, default: 4)
-	CRIULogLevel int32
-	// DefaultCmd is the command to run if no checkpoint is available
-	DefaultCmd string
-	// Debug enables debug logging
-	Debug bool
-	// EmbeddedCheckpointPath is the path to an embedded checkpoint within the image
-	// When set, the checkpoint data is baked into the container image itself
-	EmbeddedCheckpointPath string
-	// SkipInFlightConnections skips in-flight TCP connections during restore
-	SkipInFlightConnections bool
-	// AutoDedup enables auto-deduplication of memory pages
-	AutoDedup bool
-	// LazyPages enables lazy page migration (experimental)
-	LazyPages bool
-	// CRIUWorkDir is an alternative work directory for CRIU (instead of /tmp)
-	// Useful when /tmp has mount issues
-	CRIUWorkDir string
-	// CUDAPluginDir is the path to CRIU CUDA plugin directory (e.g., /usr/local/lib/criu)
-	// When set, a CRIU config file is created with libdir for CUDA plugin discovery during restore.
-	CUDAPluginDir string
-	// CRIUTimeout is the CRIU timeout in seconds (required for CUDA restores)
-	CRIUTimeout uint32
-	// RestoreMarkerFile is the path to a marker file created before CRIU restore.
-	// The restored process can check for this file to detect it was restored.
-	RestoreMarkerFile string
-}
-// DefaultEmbeddedCheckpointPath is the default path for embedded checkpoints
-const DefaultEmbeddedCheckpointPath = "/embedded-checkpoint"
-// ConfigFromEnv creates a Config from environment variables.
-func ConfigFromEnv() *Config {
-	cfg := &Config{
-		CheckpointPath:          getEnvOrDefault("DYN_CHECKPOINT_PATH", "/checkpoints"),
-		CheckpointHash:          os.Getenv("DYN_CHECKPOINT_HASH"),
-		RestoreTrigger:          getEnvOrDefault("RESTORE_TRIGGER", "/tmp/restore-trigger"),
-		WaitForCheckpoint:       os.Getenv("WAIT_FOR_CHECKPOINT") == "1",
-		WaitTimeout:             parseDurationOrDefault("RESTORE_WAIT_TIMEOUT", 300*time.Second),
-		CRIULogLevel:            parseIntOrDefault("CRIU_LOG_LEVEL", 4),
-		DefaultCmd:              os.Getenv("DEFAULT_CMD"),
-		Debug:                   os.Getenv("DEBUG") == "1",
-		EmbeddedCheckpointPath:  getEnvOrDefault("EMBEDDED_CHECKPOINT_PATH", DefaultEmbeddedCheckpointPath),
-		SkipInFlightConnections: os.Getenv("CRIU_SKIP_IN_FLIGHT") == "1",
-		AutoDedup:               os.Getenv("CRIU_AUTO_DEDUP") == "1",
-		LazyPages:               os.Getenv("CRIU_LAZY_PAGES") == "1",
-		CRIUWorkDir:             getEnvOrDefault("CRIU_WORK_DIR", ""),
-		CUDAPluginDir:           os.Getenv("CUDA_PLUGIN_DIR"), // For CUDA plugin discovery during restore
-		CRIUTimeout:             uint32(parseIntOrDefault("CRIU_TIMEOUT", 0)),
-		RestoreMarkerFile:       getEnvOrDefault("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored"),
-	}
-	return cfg
-}
-// RestoreOptions holds the options for a CRIU restore operation.
-// Most CRIU options are hardcoded with safe K8s defaults.
-type RestoreOptions struct {
-	// CheckpointPath is the path to the checkpoint directory
-	CheckpointPath string
-	// RootPath is the root filesystem path for restore (typically "/")
-	RootPath string
-	// PidFile is the path where CRIU writes the restored process PID
-	PidFile string
-	// LogFile is the name of the CRIU restore log file
-	LogFile string
-	// LogLevel is the CRIU logging verbosity (0-4)
-	LogLevel int32
-	// ExtMountMaps contains external mount mappings for CRIU
-	ExtMountMaps []*criurpc.ExtMountMap
-	// WorkDir is an alternative work directory for CRIU (instead of /tmp)
-	WorkDir string
-	// LibDir is the path to CRIU plugin directory (e.g., /usr/local/lib/criu)
-	// When set, a CRIU config file is created with libdir for CUDA plugin discovery.
-	LibDir string
-	// Timeout is the CRIU timeout in seconds (required for CUDA restores)
-	Timeout uint32
-}
-// DefaultRestoreOptions returns RestoreOptions with sensible defaults.
-func DefaultRestoreOptions(checkpointPath string) *RestoreOptions {
-	return &RestoreOptions{
-		CheckpointPath: checkpointPath,
-		RootPath:       "/",
-		PidFile:        "/tmp/restored.pid",
-		LogFile:        "restore.log",
-		LogLevel:       4,
-	}
-}
-// LoadRestoreOptions creates RestoreOptions from checkpoint metadata.
-// CRIU options are hardcoded with safe K8s defaults; metadata is only used for mount mappings.
-func LoadRestoreOptions(checkpointPath string, logLevel int32) (*RestoreOptions, error) {
-	opts := DefaultRestoreOptions(checkpointPath)
-	opts.LogLevel = logLevel
-	// Load metadata for OCI-derived paths (masked/readonly paths for external mounts)
-	meta, err := common.LoadMetadata(checkpointPath)
-	if err != nil {
-		// Return defaults if metadata is unavailable
-		// GenerateExtMountMaps with nil will use fallback defaults
-		return opts, nil
-	}
-	// Pre-generate external mount maps using OCI-derived paths from metadata
-	// This uses masked/readonly paths from the OCI spec instead of hardcoded defaults
-	extMounts, err := GenerateExtMountMaps(meta)
-	if err != nil {
-		// Fall back to defaults if generation fails
-		return opts, nil
-	}
-	opts.ExtMountMaps = extMounts
-	return opts, nil
-}
-// ShouldRestore checks if a restore should be performed.
-// Returns the checkpoint path and true if restore should proceed.
-// IMPORTANT: We check for checkpoint.done marker (not just metadata.json or inventory.img) because
-// checkpoint.done is written LAST in the checkpoint process, after rootfs-diff.tar completes.
-// Order: metadata.json -> CRIU dump (*.img files) -> rootfs-diff.tar -> checkpoint.done
-func ShouldRestore(cfg *Config, log *logrus.Entry) (string, bool) {
-	// Method 0: Embedded checkpoint in image (highest priority)
-	// This is for self-contained checkpoint images where data is baked in
-	if cfg.EmbeddedCheckpointPath != "" {
-		metadataPath := cfg.EmbeddedCheckpointPath + "/" + common.MetadataFilename
-		if _, err := os.Stat(metadataPath); err == nil {
-			log.WithField("path", cfg.EmbeddedCheckpointPath).Info("Embedded checkpoint found in image")
-			return cfg.EmbeddedCheckpointPath, true
-		}
-	}
-	// Method 1: DYN_CHECKPOINT_HASH is set and checkpoint is fully complete
-	if cfg.CheckpointHash != "" {
-		checkpointPath := cfg.CheckpointPath + "/" + cfg.CheckpointHash
-		// Check for checkpoint.done marker (written LAST after rootfs-diff.tar completes)
-		donePath := checkpointPath + "/checkpoint.done"
-		if _, err := os.Stat(donePath); err == nil {
-			log.WithField("path", checkpointPath).Info("Checkpoint found (checkpoint.done marker present)")
-			return checkpointPath, true
-		}
-		// Fallback: check for metadata.json but warn about potential race condition
-		metadataPath := checkpointPath + "/" + common.MetadataFilename
-		if _, err := os.Stat(metadataPath); err == nil {
-			log.WithFields(logrus.Fields{
-				"path":    checkpointPath,
-				"warning": "checkpoint.done marker not found, checkpoint may be incomplete",
-			}).Warn("Checkpoint metadata found but checkpoint.done missing - checkpoint may still be in progress")
-			// Don't return true here - wait for checkpoint.done
-		}
-	}
-	// Method 2: Restore trigger file exists with checkpoint path
-	if cfg.RestoreTrigger != "" {
-		data, err := os.ReadFile(cfg.RestoreTrigger)
-		if err == nil {
-			checkpointPath := string(data)
-			if checkpointPath != "" {
-				donePath := checkpointPath + "/checkpoint.done"
-				if _, err := os.Stat(donePath); err == nil {
-					log.WithField("path", checkpointPath).Info("Restore triggered via file (checkpoint.done marker present)")
-					return checkpointPath, true
-				}
-			}
-		}
-	}
-	return "", false
-}
-// WaitForCheckpoint waits for a checkpoint to become available.
-func WaitForCheckpoint(ctx context.Context, cfg *Config, log *logrus.Entry) (string, error) {
-	log.WithField("timeout", cfg.WaitTimeout).Info("Waiting for checkpoint")
-	deadline := time.Now().Add(cfg.WaitTimeout)
-	ticker := time.NewTicker(time.Second)
-	defer ticker.Stop()
-	lastLog := time.Now()
-	for {
-		select {
-		case <-ctx.Done():
-			return "", ctx.Err()
-		case <-ticker.C:
-			if path, ok := ShouldRestore(cfg, log); ok {
-				return path, nil
-			}
-			// Log progress every 30 seconds
-			if time.Since(lastLog) >= 30*time.Second {
-				elapsed := time.Since(deadline.Add(-cfg.WaitTimeout))
-				log.WithField("elapsed", elapsed).Info("Still waiting for checkpoint...")
-				lastLog = time.Now()
-			}
-			if time.Now().After(deadline) {
-				return "", context.DeadlineExceeded
-			}
-		}
-	}
-}
-// Helper functions
-func getEnvOrDefault(key, defaultValue string) string {
-	if value := os.Getenv(key); value != "" {
-		return value
-	}
-	return defaultValue
-}
-func parseDurationOrDefault(key string, defaultValue time.Duration) time.Duration {
-	value := os.Getenv(key)
-	if value == "" {
-		return defaultValue
-	}
-	seconds, err := strconv.Atoi(value)
-	if err != nil {
-		return defaultValue
-	}
-	return time.Duration(seconds) * time.Second
-}
-func parseIntOrDefault(key string, defaultValue int32) int32 {
-	value := os.Getenv(key)
-	if value == "" {
-		return defaultValue
-	}
-	i, err := strconv.Atoi(value)
-	if err != nil {
-		return defaultValue
-	}
-	return int32(i)
-}
--- a/deploy/chrek/pkg/restore/process.go
+++ b/deploy/chrek/pkg/restore/process.go
 package restore
 import (
+	"errors"
 	"fmt"
 	"io"
 	"os"
@@ -8,6 +9,7 @@ import (
 	"os/signal"
 	"strconv"
 	"strings"
+	"sync"
 	"syscall"
 	"time"
@@ -54,31 +56,39 @@ func ForwardProcessOutput(pid int, log *logrus.Entry) int {
 	// Try to open the process's stdout and stderr via /proc
 	stdoutPath := fmt.Sprintf("/proc/%d/fd/1", pid)
 	stderrPath := fmt.Sprintf("/proc/%d/fd/2", pid)
+	var wg sync.WaitGroup
-	// Channel to signal when copying goroutines should stop
-	done := make(chan struct{})
 	// Forward stdout
-	go forwardFD(stdoutPath, os.Stdout, "stdout", log, done)
+	wg.Add(1)
+	go forwardFD(stdoutPath, os.Stdout, "stdout", log, &wg)
 	// Forward stderr
-	go forwardFD(stderrPath, os.Stderr, "stderr", log, done)
+	wg.Add(1)
+	go forwardFD(stderrPath, os.Stderr, "stderr", log, &wg)
-	// Wait for process to exit
+	// Wait for process to exit (and reap it if it's our child).
 	exitCode := waitForProcess(pid, log)
-	// Signal goroutines to stop
+	// Give copy goroutines a short window to flush/finish.
-	close(done)
+	done := make(chan struct{})
+	go func() {
-	// Give goroutines a moment to flush any remaining output
+		wg.Wait()
-	time.Sleep(100 * time.Millisecond)
+		close(done)
+	}()
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second):
+		log.WithField("pid", pid).Warn("Timed out waiting for output forwarding goroutines to finish")
+	}
 	return exitCode
 }
 // forwardFD copies data from a file descriptor path to a writer.
 // It handles the case where the FD may not be readable.
-func forwardFD(fdPath string, dst io.Writer, name string, log *logrus.Entry, done <-chan struct{}) {
+func forwardFD(fdPath string, dst io.Writer, name string, log *logrus.Entry, wg *sync.WaitGroup) {
+	defer wg.Done()
 	// Try to open the FD path
 	src, err := os.Open(fdPath)
 	if err != nil {
@@ -100,54 +110,71 @@ func forwardFD(fdPath string, dst io.Writer, name string, log *logrus.Entry, don
 		"path": fdPath,
 	}).Debug("Forwarding process output")
-	// Copy data until done or EOF
+	_, err = io.Copy(dst, src)
-	buf := make([]byte, 4096)
+	if err != nil && !errors.Is(err, io.EOF) {
-	for {
+		log.WithError(err).WithField("name", name).Debug("Error reading from process FD")
-		select {
-		case <-done:
-			return
-		default:
-			// Set a read deadline to allow checking done channel periodically
-			src.SetReadDeadline(time.Now().Add(100 * time.Millisecond))
-			n, err := src.Read(buf)
-			if n > 0 {
-				dst.Write(buf[:n])
-			}
-			if err != nil {
-				if os.IsTimeout(err) {
-					continue
-				}
-				if err != io.EOF {
-					log.WithError(err).WithField("name", name).Debug("Error reading from process FD")
-				}
-				return
-			}
-		}
 	}
 }
 // waitForProcess waits for a process to exit and returns its exit code.
 func waitForProcess(pid int, log *logrus.Entry) int {
+	// Preferred path: restored process is typically our direct child.
+	// Use wait4() so zombies are reaped and exit status is reliable.
+	var status syscall.WaitStatus
 	for {
-		// Check if process still exists by sending signal 0
+		wpid, err := syscall.Wait4(pid, &status, 0, nil)
-		proc, err := os.FindProcess(pid)
+		if errors.Is(err, syscall.EINTR) {
+			continue
+		}
 		if err != nil {
-			log.WithError(err).Error("Failed to find process")
+			if errors.Is(err, syscall.ECHILD) {
+				log.WithField("pid", pid).Warn("Restored process is not a child; falling back to signal-based monitoring")
+				return waitForProcessBySignal(pid, log)
+			}
+			log.WithError(err).WithField("pid", pid).Error("Wait4 failed for restored process")
 			return 1
 		}
+		if wpid != pid {
-		err = proc.Signal(syscall.Signal(0))
+			continue
-		if err != nil {
+		}
-			// Process has exited
+		if status.Exited() {
-			log.WithField("pid", pid).Info("Restored process exited")
+			exitCode := status.ExitStatus()
+			log.WithFields(logrus.Fields{
-			// Try to get exit status
+				"pid":       pid,
-			exitCode := getExitCode(pid)
+				"exit_code": exitCode,
-			log.WithField("exit_code", exitCode).Info("Restored process exit status")
+			}).Info("Restored process exited")
+			return exitCode
+		}
+		if status.Signaled() {
+			exitCode := 128 + int(status.Signal())
+			log.WithFields(logrus.Fields{
+				"pid":       pid,
+				"signal":    status.Signal().String(),
+				"exit_code": exitCode,
+			}).Warn("Restored process terminated by signal")
 			return exitCode
 		}
+		log.WithField("pid", pid).Warn("Restored process exited with unexpected wait status")
+		return 1
+	}
+}
+// waitForProcessBySignal polls a process by PID every 100ms using a signal-0
+// probe and returns a synthetic exit code, since the real exit status is not
+// available on this path.
+//
+// It returns 0 once the probe reports the process is gone, and 1 if the
+// process cannot be looked up or is found in zombie state ("Z" in
+// /proc/<pid>/status). An EPERM result from the probe means the PID still
+// exists but may not be signalled by this process, so it is treated as
+// "still running" rather than as an exit.
+func waitForProcessBySignal(pid int, log *logrus.Entry) int {
+	for {
+		proc, err := os.FindProcess(pid)
+		if err != nil {
+			log.WithError(err).WithField("pid", pid).Error("Failed to find restored process")
+			return 1
+		}
+		// Only a non-EPERM failure indicates the process has gone away.
+		if err := proc.Signal(syscall.Signal(0)); err != nil && !errors.Is(err, syscall.EPERM) {
+			log.WithField("pid", pid).Info("Restored process no longer exists")
+			return 0
+		}
+		// Detect zombie state when wait4 is unavailable.
+		if state, err := readProcState(pid); err == nil && state == "Z" {
+			log.WithField("pid", pid).Warn("Restored process is zombie while not reaped by this process")
+			return 1
+		}
 		time.Sleep(100 * time.Millisecond)
 	}
 }
@@ -182,6 +209,23 @@ func getExitCode(pid int) int {
 	return 0
 }
+// readProcState reports the state code of pid: the second whitespace-separated
+// field of the first "State:" line in /proc/<pid>/status (for example "Z").
+// It returns an error when the status file cannot be read or when that line
+// is missing or has no second field.
+func readProcState(pid int) (string, error) {
+	raw, err := os.ReadFile(fmt.Sprintf("/proc/%d/status", pid))
+	if err != nil {
+		return "", err
+	}
+	for _, entry := range strings.Split(string(raw), "\n") {
+		if !strings.HasPrefix(entry, "State:") {
+			continue
+		}
+		// Only the first "State:" line is examined; a malformed one ends the search.
+		if tokens := strings.Fields(entry); len(tokens) >= 2 {
+			return tokens[1], nil
+		}
+		break
+	}
+	return "", fmt.Errorf("state field not found in /proc/%d/status", pid)
+}
 // SetupSignalForwarding sets up signal forwarding to the restored process.
 // Returns a cleanup function that should be called when done.
 func SetupSignalForwarding(pid int, log *logrus.Entry) func() {
@@ -232,52 +276,36 @@ func WaitForPidFile(pidFile string, timeout time.Duration, log *logrus.Entry) (i
 	return 0, fmt.Errorf("timeout waiting for PID file %s after %v", pidFile, timeout)
 }
-// RunDefault runs the default command when no checkpoint is available.
+// ExecColdStart execs the cold start command (ColdStartArgs), replacing the current process.
-// It attempts to detect and run the appropriate default command for the container.
+// If no args are provided, falls back to sleep infinity.
-func RunDefault(cfg *Config, log *logrus.Entry) error {
+func ExecColdStart(cfg *RestoreRequest, log *logrus.Entry) error {
-	// If DEFAULT_CMD is set, use it
+	if len(cfg.ColdStartArgs) == 0 {
-	if cfg.DefaultCmd != "" {
+		log.Warn("No cold start command provided, sleeping indefinitely")
-		log.WithField("cmd", cfg.DefaultCmd).Info("Running default command")
+		return ExecArgs([]string{"sleep", "infinity"}, log)
-		return execCommand(cfg.DefaultCmd)
 	}
-	// Try common application entrypoints
+	log.WithField("cmd", cfg.ColdStartArgs).Info("Executing cold start command")
-	if _, err := os.Stat("/docker-entrypoint.sh"); err == nil {
+	return ExecArgs(cfg.ColdStartArgs, log)
-		log.Info("Running docker-entrypoint.sh")
-		return execCommand("/docker-entrypoint.sh nginx -g 'daemon off;'")
-	}
-	// Check for nginx
-	if _, err := exec.LookPath("nginx"); err == nil {
-		log.Info("Running nginx")
-		return execCommand("nginx -g 'daemon off;'")
-	}
-	// Fallback to sleep infinity
-	log.Warn("No default command specified and no known entrypoint found, sleeping")
-	return execCommand("sleep infinity")
 }
-// execCommand executes a command by replacing the current process.
+// ExecArgs replaces the current process with the given command and arguments.
-func execCommand(cmdLine string) error {
+// Uses syscall.Exec for proper PID 1 behavior in containers.
-	// Parse command line - simple split by spaces
+func ExecArgs(args []string, log *logrus.Entry) error {
-	// For complex commands, shell wrapper is needed
+	if len(args) == 0 {
-	parts := strings.Fields(cmdLine)
-	if len(parts) == 0 {
 		return fmt.Errorf("empty command")
 	}
-	cmd := parts[0]
-	args := parts
 	// Find the executable path
-	path, err := exec.LookPath(cmd)
+	path, err := exec.LookPath(args[0])
 	if err != nil {
-		// Try running through shell for complex commands
+		return fmt.Errorf("command not found: %s: %w", args[0], err)
-		path = "/bin/sh"
-		args = []string{"sh", "-c", cmdLine}
 	}
+	log.WithFields(logrus.Fields{
+		"path": path,
+		"args": args,
+	}).Debug("Replacing process via syscall.Exec")
 	// Replace current process with the command
 	return syscall.Exec(path, args, os.Environ())
 }
--- a/deploy/chrek/pkg/restore/restore.go
+++ b/deploy/chrek/pkg/restore/restore.go
 package restore
 import (
+	"bufio"
 	"context"
 	"fmt"
 	"os"
+	"os/exec"
 	"path/filepath"
+	"sort"
 	"strings"
+	"syscall"
 	"time"
 	criu "github.com/checkpoint-restore/go-criu/v7"
 	"github.com/sirupsen/logrus"
 	"google.golang.org/protobuf/proto"
+	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
 )
+// LogGPUDiagnostics writes an INFO-level snapshot of GPU visibility, framed by
+// banner lines tagged with label: the output of `nvidia-smi -L`, a per-GPU
+// memory query, the /dev/nvidia* entries, and NVIDIA_VISIBLE_DEVICES.
+// Each nvidia-smi invocation runs under its own 10 second timeout; a failing
+// invocation is logged and does not stop the remaining diagnostics.
+func LogGPUDiagnostics(label string, log *logrus.Entry) {
+	// runSMI executes nvidia-smi with args under a fresh 10 second deadline.
+	runSMI := func(args ...string) ([]byte, error) {
+		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+		defer cancel()
+		return exec.CommandContext(ctx, "nvidia-smi", args...).CombinedOutput()
+	}
+	log.Infof("=== GPU DIAGNOSTICS [%s] ===", label)
+	listing, err := runSMI("-L")
+	if err != nil {
+		log.Infof("nvidia-smi -L: error: %v", err)
+	} else {
+		log.Infof("nvidia-smi -L:\n%s", string(listing))
+	}
+	// Per-GPU memory usage helps detect OOM conditions.
+	memory, err := runSMI("--query-gpu=index,uuid,memory.used,memory.total,memory.free", "--format=csv,noheader")
+	if err != nil {
+		log.Infof("nvidia-smi memory query: error: %v", err)
+	} else {
+		log.Infof("nvidia-smi memory:\n%s", string(memory))
+	}
+	// Glob can only fail on a malformed pattern, so the error is not checked.
+	deviceNodes, _ := filepath.Glob("/dev/nvidia*")
+	log.Infof("/dev/nvidia* devices: %s", strings.Join(deviceNodes, ", "))
+	log.Infof("NVIDIA_VISIBLE_DEVICES=%s", os.Getenv("NVIDIA_VISIBLE_DEVICES"))
+	log.Infof("=== END GPU DIAGNOSTICS [%s] ===", label)
+}
+// processSnapshotPIDs returns the PIDs to snapshot, in ascending order and
+// without duplicates: PID 1, the current process, and restoredPID when it is
+// positive.
+func processSnapshotPIDs(restoredPID int) []int {
+	candidates := []int{1, os.Getpid()}
+	if restoredPID > 0 {
+		candidates = append(candidates, restoredPID)
+	}
+	sort.Ints(candidates)
+	// After sorting, duplicates are adjacent, so comparing with the last kept
+	// value is enough to drop them.
+	pids := make([]int, 0, len(candidates))
+	for _, pid := range candidates {
+		if n := len(pids); n == 0 || pids[n-1] != pid {
+			pids = append(pids, pid)
+		}
+	}
+	return pids
+}
+// logProcessNamespaces logs the target of /proc/<pid>/ns/<name> for the mnt,
+// pid, ipc, net, uts and cgroup namespaces. A link that cannot be read is
+// reported as a warning and the remaining namespaces are still processed.
+func logProcessNamespaces(pid int, log *logrus.Entry) {
+	kinds := []string{"mnt", "pid", "ipc", "net", "uts", "cgroup"}
+	for i := range kinds {
+		linkPath := fmt.Sprintf("/proc/%d/ns/%s", pid, kinds[i])
+		target, err := os.Readlink(linkPath)
+		if err == nil {
+			log.WithFields(logrus.Fields{
+				"pid":       pid,
+				"namespace": kinds[i],
+				"value":     target,
+			}).Info("Namespace snapshot")
+			continue
+		}
+		log.WithError(err).WithFields(logrus.Fields{
+			"pid":  pid,
+			"path": linkPath,
+		}).Warn("Failed to read namespace symlink")
+	}
+}
+// logProcessCgroupPath logs the whitespace-trimmed contents of
+// /proc/<pid>/cgroup at INFO level, or a warning when the file cannot be read.
+func logProcessCgroupPath(pid int, log *logrus.Entry) {
+	cgroupFile := fmt.Sprintf("/proc/%d/cgroup", pid)
+	// Both outcomes carry the same pid/path fields.
+	scoped := log.WithFields(logrus.Fields{
+		"pid":  pid,
+		"path": cgroupFile,
+	})
+	raw, err := os.ReadFile(cgroupFile)
+	if err != nil {
+		scoped.WithError(err).Warn("Failed to read cgroup path")
+		return
+	}
+	scoped.WithField("contents", strings.TrimSpace(string(raw))).Info("Cgroup membership snapshot")
+}
+// logProcessFilteredMountInfo logs, at DEBUG level only, the lines of
+// /proc/<pid>/mountinfo that mention " /dev ", "/dev/", "nvidia" or "cgroup2":
+// first the number of matching lines, then one entry per line.
+// It does nothing when DEBUG logging is disabled, and emits a warning instead
+// of any lines when the file cannot be opened or fully scanned.
+func logProcessFilteredMountInfo(pid int, log *logrus.Entry) {
+	// Mountinfo dumps are very large; skip all work unless DEBUG is enabled.
+	if !log.Logger.IsLevelEnabled(logrus.DebugLevel) {
+		return
+	}
+	mountinfoPath := fmt.Sprintf("/proc/%d/mountinfo", pid)
+	scoped := log.WithFields(logrus.Fields{
+		"pid":  pid,
+		"path": mountinfoPath,
+	})
+	file, err := os.Open(mountinfoPath)
+	if err != nil {
+		scoped.WithError(err).Warn("Failed to open mountinfo")
+		return
+	}
+	defer file.Close()
+	needles := []string{" /dev ", "/dev/", "nvidia", "cgroup2"}
+	interesting := func(text string) bool {
+		for _, needle := range needles {
+			if strings.Contains(text, needle) {
+				return true
+			}
+		}
+		return false
+	}
+	lines := bufio.NewScanner(file)
+	// Start with a 64 KiB buffer and allow lines of up to 1 MiB.
+	lines.Buffer(make([]byte, 0, 64*1024), 1024*1024)
+	var kept []string
+	for lines.Scan() {
+		if text := lines.Text(); interesting(text) {
+			kept = append(kept, text)
+		}
+	}
+	if err := lines.Err(); err != nil {
+		scoped.WithError(err).Warn("Failed while scanning mountinfo")
+		return
+	}
+	total := len(kept)
+	scoped.WithField("count", total).Debug("Filtered mountinfo snapshot count")
+	for idx, text := range kept {
+		log.WithFields(logrus.Fields{
+			"pid":   pid,
+			"index": idx + 1,
+			"total": total,
+		}).Debugf("Filtered mountinfo: %s", text)
+	}
+}
+// logNvidiaDeviceNodeMetadata logs mode, inode and rdev (hex) for every
+// /dev/nvidia* entry, using Lstat so symlinks themselves are described.
+// Entries that cannot be inspected are reported as warnings and skipped.
+func logNvidiaDeviceNodeMetadata(log *logrus.Entry) {
+	nodes, err := filepath.Glob("/dev/nvidia*")
+	switch {
+	case err != nil:
+		log.WithError(err).Warn("Failed to glob /dev/nvidia*")
+		return
+	case len(nodes) == 0:
+		log.Info("No /dev/nvidia* entries found")
+		return
+	}
+	for _, node := range nodes {
+		info, err := os.Lstat(node)
+		if err != nil {
+			log.WithError(err).WithField("path", node).Warn("Failed to stat NVIDIA device entry")
+			continue
+		}
+		modeText := info.Mode().String()
+		// Inode and rdev are only available through the raw stat structure.
+		raw, isStat := info.Sys().(*syscall.Stat_t)
+		if !isStat {
+			log.WithFields(logrus.Fields{
+				"path": node,
+				"mode": modeText,
+			}).Warn("Unexpected stat type for NVIDIA device entry")
+			continue
+		}
+		log.WithFields(logrus.Fields{
+			"path":  node,
+			"mode":  modeText,
+			"inode": raw.Ino,
+			"rdev":  fmt.Sprintf("0x%x", raw.Rdev),
+		}).Info("NVIDIA device entry metadata")
+	}
+}
+// logCgroupV2HostInfo logs the whitespace-trimmed contents of
+// /sys/fs/cgroup/cgroup.controllers, or a warning when it cannot be read.
+func logCgroupV2HostInfo(log *logrus.Entry) {
+	const controllersPath = "/sys/fs/cgroup/cgroup.controllers"
+	raw, err := os.ReadFile(controllersPath)
+	if err == nil {
+		log.WithFields(logrus.Fields{
+			"path":        controllersPath,
+			"controllers": strings.TrimSpace(string(raw)),
+		}).Info("cgroup v2 controllers")
+		return
+	}
+	log.WithError(err).WithField("path", controllersPath).Warn("Failed to read cgroup v2 controllers")
+}
+// LogRestoreBoundaryDiagnostics logs, between banner lines tagged with label,
+// the namespace links, cgroup membership and filtered mountinfo of each PID
+// returned by processSnapshotPIDs(restoredPID), followed by the cgroup v2
+// controllers and the /dev/nvidia* device metadata.
+func LogRestoreBoundaryDiagnostics(label string, restoredPID int, log *logrus.Entry) {
+	log.Infof("=== RESTORE BOUNDARY DIAGNOSTICS [%s] ===", label)
+	targets := processSnapshotPIDs(restoredPID)
+	for i := 0; i < len(targets); i++ {
+		target := targets[i]
+		logProcessNamespaces(target, log)
+		logProcessCgroupPath(target, log)
+		logProcessFilteredMountInfo(target, log)
+	}
+	// These two snapshots are not tied to a particular PID.
+	logCgroupV2HostInfo(log)
+	logNvidiaDeviceNodeMetadata(log)
+	log.Infof("=== END RESTORE BOUNDARY DIAGNOSTICS [%s] ===", label)
+}
 // Restore performs the CRIU restore operation using go-criu.
+// All CRIU options are read from the saved CheckpointManifest - no hardcoding.
 // Returns the PID of the restored process.
-func Restore(ctx context.Context, opts *RestoreOptions, log *logrus.Entry) (int, error) {
+func Restore(ctx context.Context, checkpointPath string, data *checkpoint.CheckpointManifest, log *logrus.Entry) (int, error) {
-	log.WithField("checkpoint", opts.CheckpointPath).Info("Starting CRIU restore")
+	if data == nil {
+		return 0, fmt.Errorf("checkpoint manifest is required")
+	}
+	// Hardcoded restore constants
+	const (
+		rootPath = "/"
+		pidFile  = "/tmp/restored.pid"
+		logFile  = RestoreLogFilename
+	)
+	log.WithField("checkpoint", checkpointPath).Info("Starting CRIU restore")
 	// 1. Open checkpoint directory
-	imageDir, imageDirFD, err := OpenImageDir(opts.CheckpointPath)
+	imageDir, imageDirFD, err := OpenImageDir(checkpointPath)
 	if err != nil {
 		return 0, err
 	}
 	defer imageDir.Close()
-	log.WithField("fd", imageDirFD).Debug("Opened checkpoint directory")
-	// 2. Generate external mount mappings if not already set
+	// 2. Generate external mount mappings from saved CheckpointManifest
-	if opts.ExtMountMaps == nil {
+	extMounts, err := GenerateExtMountMaps(data)
-		extMounts, err := GenerateExtMountMaps(nil)
+	if err != nil {
-		if err != nil {
+		return 0, fmt.Errorf("failed to generate mount maps: %w", err)
-			return 0, fmt.Errorf("failed to generate mount maps: %w", err)
-		}
-		opts.ExtMountMaps = extMounts
 	}
-	log.WithField("mount_count", len(opts.ExtMountMaps)).Debug("External mount maps ready")
 	// 3. Open target network namespace
 	netNsFile, netNsFD, err := OpenNetworkNamespace("/proc/1/ns/net")
@@ -42,53 +244,44 @@ func Restore(ctx context.Context, opts *RestoreOptions, log *logrus.Entry) (int,
 		return 0, err
 	}
 	defer netNsFile.Close()
-	log.WithField("fd", netNsFD).Debug("Opened target network namespace")
-	// 4. Open work directory if specified
+	// 4. Open work directory if specified in checkpoint dump settings.
 	var workDirFile *os.File
 	var workDirFD int32 = -1
-	if opts.WorkDir != "" {
+	if data.CRIUDump.CRIU.WorkDir != "" {
-		workDirFile, workDirFD = OpenWorkDir(opts.WorkDir, log)
+		workDirFile, workDirFD = OpenWorkDir(data.CRIUDump.CRIU.WorkDir, log)
 		if workDirFile != nil {
 			defer workDirFile.Close()
 		}
 	}
-	// 5. Build CRIU options
+	// 5. Build CRIU options from saved checkpoint manifest.
-	cfg := CRIURestoreConfig{
+	plan := CRIURestorePlan{
-		ImageDirFD:   imageDirFD,
+		// File descriptors
-		RootPath:     opts.RootPath,
+		ImageDirFD: imageDirFD,
-		LogLevel:     opts.LogLevel,
+		WorkDirFD:  workDirFD,
-		LogFile:      opts.LogFile,
+		NetNsFD:    netNsFD,
-		WorkDirFD:    workDirFD,
+		// Paths
-		NetNsFD:      netNsFD,
+		RootPath: rootPath,
-		ExtMountMaps: opts.ExtMountMaps,
+		LogFile:  logFile,
+		// Options from CheckpointManifest.CRIUDump.CRIU
+		LogLevel:          data.CRIUDump.CRIU.LogLevel,
+		Timeout:           data.CRIUDump.CRIU.Timeout,
+		ShellJob:          data.CRIUDump.CRIU.ShellJob,
+		TcpClose:          data.CRIUDump.CRIU.TcpClose,
+		FileLocks:         data.CRIUDump.CRIU.FileLocks,
+		ExtUnixSk:         data.CRIUDump.CRIU.ExtUnixSk,
+		LinkRemap:         data.CRIUDump.CRIU.LinkRemap,
+		ManageCgroupsMode: data.CRIUDump.CRIU.ManageCgroupsMode,
+		// External mounts
+		ExtMountMaps: extMounts,
 	}
-	criuOpts := BuildRestoreCRIUOpts(cfg)
+	criuOpts := BuildCRIURestoreOptions(plan)
-	// 6. Create CRIU config file for CUDA plugin if libdir is specified
+	// 6. Reuse criu.conf from checkpoint time if it exists.
-	if opts.LibDir != "" {
+	criuConfPath := filepath.Join(checkpointPath, checkpoint.CheckpointCRIUConfFilename)
-		if opts.Timeout == 0 {
+	if _, err := os.Stat(criuConfPath); err == nil {
-			return 0, fmt.Errorf("CRIU_TIMEOUT environment variable must be set for CUDA restores")
+		criuOpts.ConfigFile = proto.String(criuConfPath)
-		}
-		configPath := filepath.Join(opts.CheckpointPath, "restore-criu.conf")
-		configContent := fmt.Sprintf(`enable-external-masters
-libdir %s
-tcp-close
-link-remap
-timeout %d
-allow-uprobes
-skip-in-flight
-`, opts.LibDir, opts.Timeout)
-		if err := os.WriteFile(configPath, []byte(configContent), 0644); err != nil {
-			log.WithError(err).Warn("Failed to write CRIU config file for restore")
-		} else {
-			criuOpts.ConfigFile = proto.String(configPath)
-			log.WithFields(logrus.Fields{
-				"config_path": configPath,
-				"lib_dir":     opts.LibDir,
-			}).Info("Created CRIU config file with libdir for CUDA plugin")
-		}
 	}
 	// 7. Execute CRIU restore
@@ -99,7 +292,7 @@ skip-in-flight
 	criuExecStart := time.Now()
 	if err := c.Restore(criuOpts, notify); err != nil {
 		log.WithField("duration", time.Since(criuExecStart)).Error("CRIU c.Restore failed")
-		logCRIUErrors(opts.CheckpointPath, opts.LogFile, log)
+		logCRIUErrors(checkpointPath, logFile, log)
 		return 0, fmt.Errorf("CRIU restore failed: %w", err)
 	}
@@ -114,15 +307,11 @@ skip-in-flight
 	}
 	// Fallback: try to read from PID file
-	if opts.PidFile != "" {
+	pid, err := WaitForPidFile(pidFile, 10*time.Second, log)
-		pid, err := WaitForPidFile(opts.PidFile, 10*time.Second, log)
+	if err != nil {
-		if err != nil {
+		return 0, fmt.Errorf("failed to get restored PID: %w", err)
-			return 0, fmt.Errorf("failed to get restored PID: %w", err)
-		}
-		return pid, nil
 	}
+	return pid, nil
-	return 0, fmt.Errorf("could not determine restored process PID")
 }
 // logCRIUErrors reads CRIU log file and logs errors.
@@ -142,62 +331,58 @@ func logCRIUErrors(checkpointPath, logFile string, log *logrus.Entry) {
 	}
 	log.Error("=== CRIU RESTORE LOG END ===")
-	// Copy log to shared directory if CRIU_LOG_DIR is set
+	// Copy log to shared directory for debugging
-	if logDir := os.Getenv("CRIU_LOG_DIR"); logDir != "" {
+	if err := os.MkdirAll(CRIULogDir, 0755); err == nil {
-		if err := os.MkdirAll(logDir, 0755); err == nil {
+		destPath := filepath.Join(CRIULogDir, fmt.Sprintf("restore-%d.log", time.Now().Unix()))
-			destPath := filepath.Join(logDir, fmt.Sprintf("restore-%d.log", time.Now().Unix()))
+		if err := os.WriteFile(destPath, data, 0644); err == nil {
-			if err := os.WriteFile(destPath, data, 0644); err == nil {
+			log.WithField("path", destPath).Info("CRIU log copied to shared directory")
-				log.WithField("path", destPath).Info("CRIU log copied to shared directory")
-			}
 		}
 	}
 }
 // Run is the main entry point for the restore entrypoint.
 // It orchestrates the entire restore process.
-func Run(ctx context.Context, cfg *Config, log *logrus.Entry) error {
+func Run(ctx context.Context, cfg *RestoreRequest, log *logrus.Entry) error {
-	log.Info("=== Self-Restoring Placeholder Entrypoint ===")
+	log.Info("=== Restore Entrypoint ===")
 	log.WithFields(logrus.Fields{
 		"checkpoint_path":          cfg.CheckpointPath,
 		"checkpoint_hash":          cfg.CheckpointHash,
-		"embedded_checkpoint_path": cfg.EmbeddedCheckpointPath,
+		"checkpoint_location":      cfg.CheckpointLocation,
-		"wait_for_checkpoint":      cfg.WaitForCheckpoint,
+		"skip_wait_for_checkpoint": cfg.SkipWaitForCheckpoint,
-		"restore_marker_file":      cfg.RestoreMarkerFile,
+		"cold_start_args":          cfg.ColdStartArgs,
-	}).Info("Configuration")
+	}).Debug("Configuration")
 	// Check CRIU availability
 	c := criu.MakeCriu()
-	version, err := c.GetCriuVersion()
+	if _, err := c.GetCriuVersion(); err != nil {
-	if err != nil {
 		log.WithError(err).Error("CRIU is not available")
-		log.Info("Falling back to default command")
+		return ExecColdStart(cfg, log)
-		return RunDefault(cfg, log)
 	}
-	log.WithField("version", version).Info("CRIU version")
-	// Determine checkpoint path
+	// Determine checkpoint path based on mode
 	var checkpointPath string
-	var shouldRestore bool
-	// Check if we should restore immediately
-	checkpointPath, shouldRestore = ShouldRestore(cfg, log)
-	// If not and we're configured to wait, wait for checkpoint
+	if cfg.SkipWaitForCheckpoint {
-	if !shouldRestore && cfg.WaitForCheckpoint {
+		// Operator path: check once, restore if ready, otherwise cold start
-		log.Info("Waiting for checkpoint...")
+		var ready bool
-		var err error
+		checkpointPath, ready = ShouldRestore(cfg, log)
-		checkpointPath, err = WaitForCheckpoint(ctx, cfg, log)
+		if !ready {
-		if err != nil {
+			log.Info("No checkpoint ready, executing cold start command")
-			log.WithError(err).Info("No checkpoint received, running default command")
+			return ExecColdStart(cfg, log)
-			return RunDefault(cfg, log)
+		}
+	} else {
+		// Standalone/DaemonSet path: check first, then poll if needed
+		var ready bool
+		checkpointPath, ready = ShouldRestore(cfg, log)
+		if !ready {
+			log.Info("Waiting for checkpoint...")
+			var err error
+			checkpointPath, err = WaitForCheckpoint(ctx, cfg, log)
+			if err != nil {
+				log.WithError(err).Info("No checkpoint received")
+				return ExecColdStart(cfg, log)
+			}
 		}
-		shouldRestore = true
-	}
-	// If no checkpoint, run default command
-	if !shouldRestore {
-		log.Info("No checkpoint configured, running default command")
-		return RunDefault(cfg, log)
 	}
 	// Perform restore
@@ -205,68 +390,61 @@ func Run(ctx context.Context, cfg *Config, log *logrus.Entry) error {
 	restoreStart := time.Now()
 	// Apply filesystem changes
-	rootfsDiffStart := time.Now()
 	if err := ApplyRootfsDiff(checkpointPath, "/", log); err != nil {
 		log.WithError(err).Error("Failed to apply rootfs diff")
 	}
-	log.WithField("duration", time.Since(rootfsDiffStart)).Info("ApplyRootfsDiff completed")
-	deletedFilesStart := time.Now()
 	if err := ApplyDeletedFiles(checkpointPath, "/", log); err != nil {
 		log.WithError(err).Error("Failed to apply deleted files")
 	}
-	log.WithField("duration", time.Since(deletedFilesStart)).Info("ApplyDeletedFiles completed")
-	// Load restore options from metadata
+	// Load checkpoint manifest (contains CRIU settings + mounts + namespaces).
-	loadOptsStart := time.Now()
+	data, err := checkpoint.ReadCheckpointManifest(checkpointPath)
-	opts, err := LoadRestoreOptions(checkpointPath, cfg.CRIULogLevel)
 	if err != nil {
-		log.WithError(err).Warn("Could not load restore options from metadata, using defaults")
+		log.WithError(err).Error("Failed to load checkpoint manifest")
+		return ExecColdStart(cfg, log)
 	}
-	log.WithField("duration", time.Since(loadOptsStart)).Info("LoadRestoreOptions completed")
-	// Apply additional config options
+	// Write restore marker file before CRIU restore
-	if cfg.CRIUWorkDir != "" {
+	restoreMarkerFile := cfg.RestoreMarkerFilePath
-		opts.WorkDir = cfg.CRIUWorkDir
+	if err := os.MkdirAll(filepath.Dir(restoreMarkerFile), 0755); err != nil {
+		log.WithError(err).Warn("Failed to create restore marker directory")
+	}
+	if err := os.WriteFile(restoreMarkerFile, []byte("restored"), 0644); err != nil {
+		log.WithError(err).Warn("Failed to write restore marker file")
 	}
-	// Set CUDA plugin directory and timeout for restore config file
+	// Restore /dev/shm contents before CRIU restore
-	if cfg.CUDAPluginDir != "" {
+	if err := RestoreDevShm(checkpointPath, log); err != nil {
-		if cfg.CRIUTimeout == 0 {
+		log.WithError(err).Error("Failed to restore /dev/shm contents - CRIU restore may fail with missing FD errors")
-			return fmt.Errorf("CRIU_TIMEOUT environment variable must be set for CUDA restores")
-		}
-		opts.LibDir = cfg.CUDAPluginDir
-		opts.Timeout = cfg.CRIUTimeout
-		log.WithFields(logrus.Fields{
-			"lib_dir": cfg.CUDAPluginDir,
-			"timeout": cfg.CRIUTimeout,
-		}).Info("CUDA plugin directory and timeout configured for restore")
 	}
-	// Write restore marker file before CRIU restore
+	// Create link_remap stub files for unlinked files referenced in CRIU images
-	// This allows the restored process to detect it's been restored
+	if err := CreateLinkRemapStubs(checkpointPath, log); err != nil {
-	if cfg.RestoreMarkerFile != "" {
+		log.WithError(err).Warn("Failed to create link_remap stubs")
-		if err := os.WriteFile(cfg.RestoreMarkerFile, []byte("restored"), 0644); err != nil {
-			log.WithError(err).Warn("Failed to write restore marker file")
-		} else {
-			log.WithField("path", cfg.RestoreMarkerFile).Info("Wrote restore marker file")
-		}
 	}
+	// Log GPU diagnostics right before CRIU restore to track device visibility changes
+	LogGPUDiagnostics("PRE-CRIU-RESTORE", log)
+	LogRestoreBoundaryDiagnostics("PRE-CRIU-RESTORE", 0, log)
 	// Perform CRIU restore (CUDA plugin handles CUDA state automatically)
 	criuRestoreStart := time.Now()
-	pid, err := Restore(ctx, opts, log)
+	pid, err := Restore(ctx, checkpointPath, data, log)
 	if err != nil {
 		log.WithField("duration", time.Since(criuRestoreStart)).WithError(err).Error("Restore failed, falling back to default command")
 		if cfg.Debug {
 			log.Info("DEBUG mode: sleeping 300s to allow log collection...")
 			time.Sleep(300 * time.Second)
 		}
-		return RunDefault(cfg, log)
+		return ExecColdStart(cfg, log)
 	}
 	criuRestoreDuration := time.Since(criuRestoreStart)
 	log.WithField("duration", criuRestoreDuration).Info("CRIU Restore completed (CUDA state restored by plugin)")
+	// Log GPU diagnostics AFTER restore to compare with pre-restore
+	LogGPUDiagnostics("POST-RESTORE", log)
+	LogRestoreBoundaryDiagnostics("POST-RESTORE", pid, log)
 	totalDuration := time.Since(restoreStart)
 	log.WithFields(logrus.Fields{
 		"total_duration":        totalDuration,

--- a/deploy/chrek/pkg/restore/shm.go
+++ b/deploy/chrek/pkg/restore/shm.go
+// Package restore provides CRIU restore operations.
+package restore
+import (
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"github.com/sirupsen/logrus"
+	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
+)
+// RestoreDevShm copies every non-directory entry of the checkpoint's dev-shm
+// directory into /dev/shm, keeping each file's name and mode.
+// It must run BEFORE CRIU restore so the shared memory files exist when CRIU
+// restores file descriptors that point at them.
+//
+// A missing or empty dev-shm directory is not an error. Files that cannot be
+// inspected or copied are logged as warnings and skipped; an error is returned
+// only when the dev-shm directory cannot be listed or /dev/shm cannot be
+// created.
+func RestoreDevShm(checkpointPath string, log *logrus.Entry) error {
+	const shmDir = "/dev/shm"
+	snapshotDir := filepath.Join(checkpointPath, checkpoint.DevShmDirName)
+	items, err := os.ReadDir(snapshotDir)
+	switch {
+	case err != nil && os.IsNotExist(err):
+		log.Debug("No dev-shm directory in checkpoint, skipping restore")
+		return nil
+	case err != nil:
+		return fmt.Errorf("failed to read checkpoint dev-shm directory: %w", err)
+	case len(items) == 0:
+		log.Debug("Checkpoint dev-shm directory is empty")
+		return nil
+	}
+	// Make sure the destination directory is present before copying.
+	if err := os.MkdirAll(shmDir, 0777); err != nil {
+		return fmt.Errorf("failed to ensure /dev/shm exists: %w", err)
+	}
+	var (
+		copied      []string
+		copiedBytes int64
+	)
+	for _, item := range items {
+		if item.IsDir() {
+			continue
+		}
+		fileName := item.Name()
+		meta, err := item.Info()
+		if err != nil {
+			log.WithError(err).WithField("file", fileName).Warn("Failed to get file info, skipping")
+			continue
+		}
+		from := filepath.Join(snapshotDir, fileName)
+		to := filepath.Join(shmDir, fileName)
+		if err := copyFileToShm(from, to, meta.Mode()); err != nil {
+			log.WithError(err).WithField("file", fileName).Warn("Failed to restore file, skipping")
+			continue
+		}
+		byteCount := meta.Size()
+		copied = append(copied, fileName)
+		copiedBytes += byteCount
+		log.WithFields(logrus.Fields{
+			"file": fileName,
+			"size": byteCount,
+		}).Debug("Restored /dev/shm file")
+	}
+	if len(copied) == 0 {
+		return nil
+	}
+	log.WithFields(logrus.Fields{
+		"count":      len(copied),
+		"total_size": copiedBytes,
+		"files":      copied,
+	}).Info("Restored /dev/shm files from checkpoint")
+	return nil
+}
+// copyFileToShm copies the contents of src into dest (a path in /dev/shm),
+// creating dest when needed and truncating any existing content.
+//
+// mode is the file mode dest ends up with; a zero mode means 0666. The mode is
+// applied with an explicit chmod after the copy, so it is neither reduced by
+// the process umask nor left stale when dest already existed.
+// Errors from opening, copying, chmod or closing dest are returned wrapped.
+func copyFileToShm(src, dest string, mode os.FileMode) error {
+	srcFile, err := os.Open(src)
+	if err != nil {
+		return fmt.Errorf("failed to open source: %w", err)
+	}
+	defer srcFile.Close()
+	// Default to 0666 when mode is not set (mode == 0)
+	if mode == 0 {
+		mode = 0666
+	}
+	destFile, err := os.OpenFile(dest, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
+	if err != nil {
+		return fmt.Errorf("failed to create destination: %w", err)
+	}
+	if _, err := io.Copy(destFile, srcFile); err != nil {
+		destFile.Close()
+		return fmt.Errorf("failed to copy contents: %w", err)
+	}
+	// The mode given to OpenFile is filtered by the umask and ignored for a
+	// pre-existing file, so set it explicitly on the open descriptor.
+	if err := destFile.Chmod(mode); err != nil {
+		destFile.Close()
+		return fmt.Errorf("failed to set destination mode: %w", err)
+	}
+	// Close is checked because it can surface a deferred write error.
+	if err := destFile.Close(); err != nil {
+		return fmt.Errorf("failed to close destination: %w", err)
+	}
+	return nil
+}
--- a/deploy/chrek/pkg/watcher/watcher.go
+++ b/deploy/chrek/pkg/watcher/watcher.go
@@ -21,18 +21,6 @@ import (
 	"k8s.io/client-go/tools/cache"
 	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
-	checkpointk8s "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint/k8s"
-)
-const (
-	// LabelCheckpointSource is the label that triggers automatic checkpointing
-	LabelCheckpointSource = "nvidia.com/checkpoint-source"
-	// LabelCheckpointHash is the label specifying the checkpoint identity hash
-	LabelCheckpointHash = "nvidia.com/checkpoint-hash"
-	// EnvCheckpointSignalFile is the env var in the pod specifying the signal file path
-	EnvCheckpointSignalFile = "DYN_CHECKPOINT_SIGNAL_FILE"
 )
 // SignalFile represents the content of a checkpoint completion signal file
@@ -44,26 +32,21 @@ type SignalFile struct {
 	Error          string    `json:"error,omitempty"`
 }
-// Config holds watcher configuration
+// WatcherConfig holds watcher configuration.
-type Config struct {
+type WatcherConfig struct {
 	NodeName            string
-	CheckpointDir       string
-	HostProc            string
 	ListenAddr          string // HTTP server address for health checks (e.g., ":8080")
 	RestrictedNamespace string // Optional: restrict watching to this namespace (empty = cluster-wide)
-	// GPU/CUDA checkpoint options (passed to checkpoint.Options)
+	// Checkpoint configuration (from ConfigMap)
-	CUDAPluginDir  string   // Path to CRIU CUDA plugin directory
+	CheckpointSpec *checkpoint.CheckpointSpec
-	GhostLimit     uint32   // Ghost file size limit in bytes (default: 512MB for GPU)
-	Timeout        uint32   // CRIU timeout in seconds
-	ExternalMounts []string // Additional external mount mappings
 }
 // Watcher watches for pods with checkpoint labels and triggers checkpoints
 type Watcher struct {
-	config          Config
+	config          WatcherConfig
 	clientset       kubernetes.Interface
-	discoveryClient *checkpointk8s.DiscoveryClient
+	discoveryClient *checkpoint.DiscoveryClient
 	checkpointer    *checkpoint.Checkpointer
 	log             *logrus.Entry
@@ -75,7 +58,7 @@ type Watcher struct {
 }
 // NewWatcher creates a new pod watcher
-func NewWatcher(cfg Config, discoveryClient *checkpointk8s.DiscoveryClient, checkpointer *checkpoint.Checkpointer) (*Watcher, error) {
+func NewWatcher(cfg WatcherConfig, discoveryClient *checkpoint.DiscoveryClient, checkpointer *checkpoint.Checkpointer) (*Watcher, error) {
 	// Create in-cluster Kubernetes client
 	restConfig, err := rest.InClusterConfig()
 	if err != nil {
@@ -100,10 +83,13 @@ func NewWatcher(cfg Config, discoveryClient *checkpointk8s.DiscoveryClient, chec
 // Start begins watching for pods and starts the health check server
 func (w *Watcher) Start(ctx context.Context) error {
+	if w.config.CheckpointSpec == nil {
+		return fmt.Errorf("checkpoint spec is required")
+	}
 	w.log.WithFields(logrus.Fields{
-		"node":            w.config.NodeName,
+		"node":  w.config.NodeName,
-		"label":           LabelCheckpointSource,
+		"label": checkpoint.KubeLabelCheckpointSource,
-		"signal_file_env": EnvCheckpointSignalFile,
 	}).Info("Starting pod watcher")
 	// Start health check HTTP server if address is configured
@@ -118,7 +104,7 @@ func (w *Watcher) Start(ctx context.Context) error {
 	// Create informer factory with label selector and optional namespace restriction
 	labelSelector := labels.SelectorFromSet(labels.Set{
-		LabelCheckpointSource: "true",
+		checkpoint.KubeLabelCheckpointSource: "true",
 	}).String()
 	factoryOptions := []informers.SharedInformerOption{
@@ -232,7 +218,7 @@ func (w *Watcher) handlePodEvent(ctx context.Context, pod *corev1.Pod) {
 	podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)
 	// Get checkpoint ID from label (uses the checkpoint hash)
-	checkpointID, ok := pod.Labels[LabelCheckpointHash]
+	checkpointID, ok := pod.Labels[checkpoint.KubeLabelCheckpointHash]
 	if !ok || checkpointID == "" {
 		w.log.WithField("pod", podKey).Warn("Pod has checkpoint label but no checkpoint-hash label")
 		return
@@ -282,12 +268,14 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI
 	// Find the main container and get signal file path from env
 	var containerID string
+	var containerName string
 	var signalFilePath string
 	for _, container := range pod.Spec.Containers {
 		if container.Name == "main" || len(pod.Spec.Containers) == 1 {
+			containerName = container.Name
 			// Get signal file path from environment
 			for _, env := range container.Env {
-				if env.Name == EnvCheckpointSignalFile {
+				if env.Name == "DYN_CHECKPOINT_SIGNAL_FILE" {
 					signalFilePath = env.Value
 					break
 				}
@@ -325,8 +313,8 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI
 		"signal_file_path": signalFilePath,
 	}).Info("Found container, starting checkpoint")
-	// Resolve container to get PID for signal file writing
+	// Resolve container to get PID for signal file writing.
-	containerInfo, err := w.discoveryClient.ResolveContainer(ctx, containerID)
+	containerPID, _, err := w.discoveryClient.ResolveContainer(ctx, containerID)
 	if err != nil {
 		log.WithError(err).Error("Failed to resolve container")
 		w.checkpointedMu.Lock()
@@ -335,28 +323,34 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI
 		return
 	}
+	// Validate CheckpointSpec is set
+	if w.config.CheckpointSpec == nil {
+		log.Error("CheckpointSpec is nil - cannot perform checkpoint")
+		w.checkpointedMu.Lock()
+		delete(w.checkpointed, podKey)
+		w.checkpointedMu.Unlock()
+		return
+	}
 	// Perform checkpoint
-	opts := checkpoint.Options{
+	params := checkpoint.CheckpointRequest{
-		ContainerID:    containerID,
+		ContainerID:   containerID,
-		CheckpointID:   checkpointID,
+		ContainerName: containerName,
-		CheckpointDir:  w.config.CheckpointDir,
+		CheckpointID:  checkpointID,
-		NodeName:       w.config.NodeName,
+		CheckpointDir: w.config.CheckpointSpec.BasePath,
-		PodName:        pod.Name,
+		NodeName:      w.config.NodeName,
-		PodNamespace:   pod.Namespace,
+		PodName:       pod.Name,
-		CUDAPluginDir:  w.config.CUDAPluginDir,
+		PodNamespace:  pod.Namespace,
-		GhostLimit:     w.config.GhostLimit,
-		Timeout:        w.config.Timeout,
-		ExternalMounts: w.config.ExternalMounts,
 	}
-	result, err := w.checkpointer.Checkpoint(ctx, opts)
+	result, err := w.checkpointer.Checkpoint(ctx, params, w.config.CheckpointSpec)
 	if err != nil {
 		log.WithError(err).Error("Checkpoint failed")
 		// Write failure marker to PVC so restore pods know checkpoint failed
-		checkpointDir := filepath.Join(w.config.CheckpointDir, checkpointID)
+		checkpointDir := filepath.Join(w.config.CheckpointSpec.BasePath, checkpointID)
 		w.writeCheckpointDoneMarker(checkpointDir, checkpointID, false, err.Error(), log)
 		if signalFilePath != "" {
-			w.writeSignalFileToPod(int(containerInfo.PID), signalFilePath, checkpointID, "", false, err.Error())
+			w.writeSignalFileToPod(containerPID, signalFilePath, checkpointID, "", false, err.Error())
 		}
 		// Clear the in_progress status so checkpoint can be retried
 		w.checkpointedMu.Lock()
@@ -368,12 +362,11 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI
 	log.WithField("checkpoint_dir", result.CheckpointDir).Info("Checkpoint completed successfully")
 	// Write checkpoint.done marker to PVC for cross-node restore detection
-	// This is written AFTER rootfs-diff.tar is complete, so it's safe to use as a completion marker
 	w.writeCheckpointDoneMarker(result.CheckpointDir, checkpointID, true, "", log)
 	// Write signal file into the pod's filesystem so the checkpoint job pod can exit
 	if signalFilePath != "" {
-		w.writeSignalFileToPod(int(containerInfo.PID), signalFilePath, checkpointID, result.CheckpointDir, true, "")
+		w.writeSignalFileToPod(containerPID, signalFilePath, checkpointID, result.CheckpointDir, true, "")
 	}
 	// Mark as completed so we don't checkpoint again
@@ -400,8 +393,7 @@ func (w *Watcher) writeSignalFileToPod(pid int, signalFilePath, checkpointID, ch
 	}
 	// Write to the pod's filesystem via /proc/<pid>/root
-	// signalFilePath is the path inside the pod (e.g., /var/lib/dynamo-checkpoint/signal.done)
+	hostSignalPath := fmt.Sprintf("%s/%d/root%s", checkpoint.HostProcPath, pid, signalFilePath)
-	hostSignalPath := fmt.Sprintf("%s/%d/root%s", w.config.HostProc, pid, signalFilePath)
 	// Ensure signal directory exists in pod's filesystem
 	signalDir := filepath.Dir(hostSignalPath)
@@ -424,11 +416,8 @@ func (w *Watcher) writeSignalFileToPod(pid int, signalFilePath, checkpointID, ch
 }
 // writeCheckpointDoneMarker writes a checkpoint.done marker file to the checkpoint directory on shared PVC.
-// This file is written AFTER all checkpoint steps complete (including rootfs-diff.tar).
-// Restore pods on ANY node check for this file to know the checkpoint is complete and safe to restore.
-// This is separate from writeSignalFileToPod which signals the checkpoint job pod to exit.
 func (w *Watcher) writeCheckpointDoneMarker(checkpointDir, checkpointID string, success bool, errMsg string, log *logrus.Entry) {
-	markerPath := filepath.Join(checkpointDir, "checkpoint.done")
+	markerPath := filepath.Join(checkpointDir, checkpoint.CheckpointDoneFilename)
 	marker := SignalFile{
 		CheckpointID:   checkpointID,

--- a/deploy/chrek/scripts/smart-entrypoint.sh
+++ b/deploy/chrek/scripts/smart-entrypoint.sh
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Smart entrypoint wrapper for CRIU checkpoint/restore
-# Automatically detects checkpoints and falls back to cold start if not found
-#
-# Behavior:
-# 1. If DYN_CHECKPOINT_HASH is set and checkpoint exists -> restore
-# 2. If WAIT_FOR_CHECKPOINT=1 -> wait for checkpoint (restore-entrypoint handles this)
-# 3. Otherwise -> execute provided command (cold start)
-set -e
-# Enable debug output if DEBUG=1
-if [ "${DEBUG:-0}" = "1" ]; then
-  set -x
-fi
-# Configuration from environment
-CHECKPOINT_PATH="${DYN_CHECKPOINT_PATH:-/checkpoints}"
-CHECKPOINT_HASH="${DYN_CHECKPOINT_HASH:-}"
-WAIT_FOR_CHECKPOINT="${WAIT_FOR_CHECKPOINT:-0}"
-# Log function for consistent output
-log() {
-  echo "[smart-entrypoint] $*" >&2
-}
-# Check if a checkpoint exists and should be restored
-should_restore_checkpoint() {
-  # If WAIT_FOR_CHECKPOINT is set, always use restore-entrypoint
-  # (it will wait for the checkpoint to appear)
-  if [ "$WAIT_FOR_CHECKPOINT" = "1" ]; then
-    log "WAIT_FOR_CHECKPOINT=1, will wait for checkpoint via restore-entrypoint"
-    return 0
-  fi
-  # If checkpoint hash is not set, no restore
-  if [ -z "$CHECKPOINT_HASH" ]; then
-    log "DYN_CHECKPOINT_HASH not set, no checkpoint to restore"
-    return 1
-  fi
-  # Check if checkpoint directory exists
-  CHECKPOINT_DIR="$CHECKPOINT_PATH/$CHECKPOINT_HASH"
-  if [ ! -d "$CHECKPOINT_DIR" ]; then
-    log "Checkpoint directory not found: $CHECKPOINT_DIR"
-    return 1
-  fi
-  # Check for checkpoint.done marker which is written LAST in the checkpoint process
-  # This is more reliable than inventory.img (created by CRIU) or rootfs-diff.tar (may be mid-write)
-  # Order: metadata.json -> CRIU dump (*.img) -> rootfs-diff.tar -> checkpoint.done
-  DONE_MARKER="$CHECKPOINT_DIR/checkpoint.done"
-  if [ ! -f "$DONE_MARKER" ]; then
-    log "Checkpoint incomplete - checkpoint.done not found in: $CHECKPOINT_DIR"
-    log "Checkpoint may still be in progress..."
-    return 1
-  fi
-  log "Checkpoint found: $CHECKPOINT_HASH (checkpoint.done marker present)"
-  return 0
-}
-# Main logic
-if should_restore_checkpoint; then
-  log "=========================================="
-  log "CHECKPOINT RESTORE MODE"
-  log "=========================================="
-  log "Checkpoint: $CHECKPOINT_HASH"
-  log "Location: $CHECKPOINT_PATH/$CHECKPOINT_HASH"
-  log "Invoking restore-entrypoint..."
-  log "=========================================="
-  # Execute restore-entrypoint
-  # Any args passed to this script are forwarded (though restore-entrypoint ignores them)
-  exec /restore-entrypoint "$@"
-else
-  log "=========================================="
-  log "COLD START MODE"
-  log "=========================================="
-  # No checkpoint found or not requested - fall back to cold start
-  if [ $# -eq 0 ]; then
-    # No args provided - this is likely an error
-    log "ERROR: No checkpoint to restore and no command provided"
-    log "Set DYN_CHECKPOINT_HASH to restore a checkpoint, or provide a command to run"
-    exit 1
-  fi
-  log "No checkpoint to restore"
-  log "Executing command: $*"
-  log "=========================================="
-  # Execute the provided command
-  exec "$@"
-fi
--- a/deploy/helm/charts/chrek/README.md
+++ b/deploy/helm/charts/chrek/README.md
@@ -63,7 +63,6 @@ See `values.yaml` for all configuration options.
 | `storage.pvc.name` | PVC name (must match operator config) | `chrek-pvc` |
 | `storage.pvc.size` | PVC size | `100Gi` |
 | `storage.pvc.storageClass` | Storage class name | `""` (default) |
-| `storage.signalHostPath` | Host path for signal files | `/var/lib/chrek/signals` |
 | `daemonset.image.repository` | DaemonSet image repository | `nvidia/chrek-agent` |
 | `daemonset.nodeSelector` | Node selector for GPU nodes | `nvidia.com/gpu.present: "true"` |
 | `daemonset.runtimeClassName` | Runtime class for GPU access | `nvidia` |
@@ -175,4 +174,3 @@ Ensure your storage class supports `ReadWriteMany` access mode for multi-node de
 ## License
 Apache License 2.0
--- a/deploy/helm/charts/chrek/templates/configmap.yaml
+++ b/deploy/helm/charts/chrek/templates/configmap.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "chrek.fullname" . }}-config
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "chrek.labels" . | nindent 4 }}
+data:
+  config.yaml: |
+    # Chrek Configuration
+    # This ConfigMap provides static configuration for the checkpoint agent.
+    # Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) come from environment variables.
+    agent:
+      # How checkpoints are triggered: "http" for REST API, "watcher" for auto-checkpoint
+      signalSource: {{ .Values.config.agent.signalSource | quote }}
+      # Watcher/HTTP server address
+      listenAddr: {{ .Values.config.agent.listenAddr | quote }}
+    checkpoint:
+      # Base path for checkpoint directories (shared PVC mount path)
+      basePath: {{ .Values.storage.pvc.basePath | quote }}
+      criu:
+        # RPC options
+        ghostLimit: {{ .Values.config.checkpoint.criu.ghostLimit }}
+        timeout: {{ .Values.config.checkpoint.criu.timeout }}
+        logLevel: {{ .Values.config.checkpoint.criu.logLevel }}
+        workDir: {{ .Values.config.checkpoint.criu.workDir | quote }}
+        # K8s-specific options
+        leaveRunning: {{ .Values.config.checkpoint.criu.leaveRunning }}
+        shellJob: {{ .Values.config.checkpoint.criu.shellJob }}
+        tcpClose: {{ .Values.config.checkpoint.criu.tcpClose }}
+        fileLocks: {{ .Values.config.checkpoint.criu.fileLocks }}
+        orphanPtsMaster: {{ .Values.config.checkpoint.criu.orphanPtsMaster }}
+        extUnixSk: {{ .Values.config.checkpoint.criu.extUnixSk }}
+        linkRemap: {{ .Values.config.checkpoint.criu.linkRemap }}
+        extMasters: {{ .Values.config.checkpoint.criu.extMasters }}
+        manageCgroupsMode: {{ .Values.config.checkpoint.criu.manageCgroupsMode | quote }}
+        # Advanced options
+        autoDedup: {{ .Values.config.checkpoint.criu.autoDedup }}
+        lazyPages: {{ .Values.config.checkpoint.criu.lazyPages }}
+        # Config file options (NOT available via RPC)
+        libDir: {{ .Values.config.checkpoint.criu.libDir | quote }}
+        allowUprobes: {{ .Values.config.checkpoint.criu.allowUprobes }}
+        skipInFlight: {{ .Values.config.checkpoint.criu.skipInFlight }}
+      rootfsExclusions:
+        # System directories excluded from rootfs diff (NVIDIA GPU Operator injected)
+        systemDirs: {{ toYaml .Values.config.checkpoint.rootfsExclusions.systemDirs | nindent 10 }}
+        # Cache directories to exclude (reduces checkpoint size)
+        cacheDirs: {{ toYaml .Values.config.checkpoint.rootfsExclusions.cacheDirs | nindent 10 }}
+        # Additional custom exclusions
+        additionalExclusions: {{ toYaml .Values.config.checkpoint.rootfsExclusions.additionalExclusions | nindent 10 }}
+    # NOTE: Restore runtime configuration is NOT in this ConfigMap.
+    # Placeholder containers do not mount it. Restore uses hardcoded defaults
+    # plus operator-injected env vars. CRIU options come from the saved checkpoint manifest.
--- a/deploy/helm/charts/chrek/templates/daemonset.yaml
+++ b/deploy/helm/charts/chrek/templates/daemonset.yaml
@@ -76,13 +76,11 @@ spec:
          securityContext:
            privileged: true
          env:
+            # Dynamic values from Kubernetes downward API
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
-            # Agent mode: use "watcher" to watch for labeled pods
-            - name: CHECKPOINT_SIGNAL_FROM
-              value: "watcher"
            {{- if .Values.rbac.namespaceRestricted }}
            # Restrict pod watching to this namespace (namespace-scoped RBAC)
            - name: RESTRICTED_NAMESPACE
@@ -90,34 +88,11 @@ spec:
                fieldRef:
                  fieldPath: metadata.namespace
            {{- end }}
-            # Checkpoint storage directory
-            - name: CHECKPOINT_DIR
-              value: {{ .Values.storage.pvc.basePath | quote }}
-            # Host proc mount point for CRIU operations
-            - name: HOST_PROC
-              value: "/host/proc"
-            # Containerd socket path
-            - name: CONTAINERD_SOCKET
-              value: {{ .Values.daemonset.containerRuntimeSocket }}
-            {{- if .Values.daemonset.criu.cudaPluginDir }}
-            # CUDA plugin directory for GPU checkpoint support
-            - name: CUDA_PLUGIN_DIR
-              value: {{ .Values.daemonset.criu.cudaPluginDir | quote }}
-            {{- end }}
-            {{- if .Values.daemonset.criu.ghostLimit }}
-            # CRIU ghost file size limit in bytes
-            - name: CRIU_GHOST_LIMIT
-              value: {{ .Values.daemonset.criu.ghostLimit | quote }}
-            {{- end }}
-            {{- if .Values.daemonset.criu.timeout }}
-            # CRIU timeout in seconds
-            - name: CRIU_TIMEOUT
-              value: {{ .Values.daemonset.criu.timeout | quote }}
-            {{- end }}
-            # Storage type (for future S3/OCI support)
-            - name: DYN_CHECKPOINT_STORAGE_TYPE
-              value: {{ .Values.storage.type | quote }}
          volumeMounts:
+            # Mount configuration ConfigMap
+            - name: config
+              mountPath: /etc/chrek
+              readOnly: true
            {{- if eq .Values.storage.type "pvc" }}
            # Mount the checkpoint PVC (only for PVC storage type)
            - name: checkpoints
@@ -155,6 +130,10 @@ spec:
          resources:
            {{- toYaml .Values.daemonset.resources | nindent 12 }}
      volumes:
+        # Configuration ConfigMap
+        - name: config
+          configMap:
+            name: {{ include "chrek.fullname" . }}-config
        {{- if .Values.seccomp.deploy }}
        # Seccomp profile ConfigMap (used by initContainer)
        - name: seccomp-profiles