Unverified Commit d381e6ff authored by Schwinn Saereesitthipitak's avatar Schwinn Saereesitthipitak Committed by GitHub
Browse files

feat(chrek): config refactor, /dev/shm support, and mount-policy rewrite (#5946)

parent b6824ae0
...@@ -10,10 +10,10 @@ import ( ...@@ -10,10 +10,10 @@ import (
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
// OpenDirForCRIU opens a directory and clears the CLOEXEC flag so the FD // OpenPathForCRIU opens a path (directory or file) and clears the CLOEXEC flag
// can be inherited by CRIU child processes. // so the FD can be inherited by CRIU child processes.
// Returns the opened file and its FD. Caller must close the file when done. // Returns the opened file and its FD. Caller must close the file when done.
func OpenDirForCRIU(path string) (*os.File, int32, error) { func OpenPathForCRIU(path string) (*os.File, int32, error) {
dir, err := os.Open(path) dir, err := os.Open(path)
if err != nil { if err != nil {
return nil, 0, fmt.Errorf("failed to open %s: %w", path, err) return nil, 0, fmt.Errorf("failed to open %s: %w", path, err)
...@@ -30,41 +30,6 @@ func OpenDirForCRIU(path string) (*os.File, int32, error) { ...@@ -30,41 +30,6 @@ func OpenDirForCRIU(path string) (*os.File, int32, error) {
return dir, int32(dir.Fd()), nil return dir, int32(dir.Fd()), nil
} }
// DefaultMaskedPaths returns the fallback list of container paths to mask
// (make inaccessible). Per the original note it is used when checkpoint
// metadata carries no OCI-derived masked paths.
//
// A freshly allocated slice is returned on every call, so callers may modify
// the result freely.
func DefaultMaskedPaths() []string {
	procPaths := []string{
		"/proc/bus",
		"/proc/fs",
		"/proc/irq",
		"/proc/sys",
		"/proc/sysrq-trigger",
		"/proc/acpi",
		"/proc/kcore",
		"/proc/keys",
		"/proc/latency_stats",
		"/proc/timer_list",
		"/proc/scsi",
		"/proc/interrupts",
		"/proc/asound",
	}
	sysPaths := []string{
		"/sys/firmware",
		"/sys/devices/virtual/powercap",
	}

	masked := make([]string, 0, len(procPaths)+len(sysPaths))
	masked = append(masked, procPaths...)
	return append(masked, sysPaths...)
}
// DefaultReadonlyPaths returns the list of container paths that are treated
// as read-only by default. Each call yields a new slice.
func DefaultReadonlyPaths() []string {
	readonly := make([]string, 0, 5)
	readonly = append(readonly,
		"/proc/bus",
		"/proc/fs",
		"/proc/irq",
		"/proc/sys",
		"/proc/sysrq-trigger",
	)
	return readonly
}
// CRIUMountPoint represents a parsed mount point from /proc/pid/mountinfo. // CRIUMountPoint represents a parsed mount point from /proc/pid/mountinfo.
type CRIUMountPoint struct { type CRIUMountPoint struct {
MountID string // Mount ID MountID string // Mount ID
......
// metadata.go handles checkpoint metadata for cross-node restore operations.
package common
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"time"
)
// File names used inside each checkpoint directory.
const (
// MetadataFilename is the JSON file written by SaveMetadata and read by
// LoadMetadata. ListCheckpoints treats its presence in a subdirectory as
// the sign that the subdirectory is a checkpoint.
MetadataFilename = "metadata.json"
// DescriptorsFilename is the JSON file written by SaveDescriptors and read
// by LoadDescriptors; it holds a list of file descriptor strings.
DescriptorsFilename = "descriptors.json"
)
// CheckpointMetadata stores information needed for cross-node restore.
// It is serialized as JSON to MetadataFilename by SaveMetadata and read back
// by LoadMetadata. Use NewCheckpointMetadata to obtain an instance whose
// Mounts and Namespaces are empty (non-nil) slices.
type CheckpointMetadata struct {
// Checkpoint identification
CheckpointID string `json:"checkpoint_id"`
CreatedAt time.Time `json:"created_at"` // Set to the current UTC time by NewCheckpointMetadata
// Source information
SourceNode string `json:"source_node"`
SourcePodIP string `json:"source_pod_ip,omitempty"` // For cross-node TCP detection
ContainerID string `json:"container_id"`
PodName string `json:"pod_name"`
PodNamespace string `json:"pod_namespace"`
Image string `json:"image"`
// Process information
PID int `json:"pid"`
// Filesystem information
RootfsDiffPath string `json:"rootfs_diff_path,omitempty"` // Path to rootfs-diff.tar
UpperDir string `json:"upper_dir,omitempty"` // Original overlay upperdir
HasRootfsDiff bool `json:"has_rootfs_diff"` // Whether rootfs diff was captured
HasDeletedFiles bool `json:"has_deleted_files"` // Whether deleted files were tracked
// Mount mappings from original container
Mounts []MountMetadata `json:"mounts"`
// OCI spec derived paths (populated from containerd, used at restore)
// These replace hardcoded values with runtime-discovered configuration
MaskedPaths []string `json:"masked_paths,omitempty"` // From OCI spec Linux.MaskedPaths
ReadonlyPaths []string `json:"readonly_paths,omitempty"` // From OCI spec Linux.ReadonlyPaths
BindMountDests []string `json:"bind_mount_dests,omitempty"` // Destinations of bind mounts (for tar exclusions)
// Namespace information
Namespaces []NamespaceMetadata `json:"namespaces"`
// CRIU options used during checkpoint (for restore compatibility)
CRIUOptions CRIUOptionsMetadata `json:"criu_options"`
}
// CRIUOptionsMetadata stores CRIU options used during checkpoint.
// This allows restore to use compatible options.
// Note: In our implementation, most options are hardcoded as always-on for K8s,
// but we store them for compatibility and debugging purposes.
//
// It is embedded in CheckpointMetadata under the "criu_options" JSON key.
// Every field is always emitted (no omitempty), so false values are recorded
// explicitly.
type CRIUOptionsMetadata struct {
TcpEstablished bool `json:"tcp_established"`
TcpClose bool `json:"tcp_close"`
ShellJob bool `json:"shell_job"`
FileLocks bool `json:"file_locks"`
LeaveRunning bool `json:"leave_running"`
LinkRemap bool `json:"link_remap"`
ExtMasters bool `json:"ext_masters"`
}
// MountMetadata stores information about a mount for remapping during restore.
// One entry per mount is kept in CheckpointMetadata.Mounts. The OCI* fields are
// optional and omitted from the JSON when empty; the remaining fields are
// always written.
type MountMetadata struct {
ContainerPath string `json:"container_path"` // Path inside container (e.g., /usr/share/nginx/html)
HostPath string `json:"host_path"` // Original host path from mountinfo
OCISource string `json:"oci_source,omitempty"` // Source path from OCI spec (may differ from HostPath)
OCIType string `json:"oci_type,omitempty"` // Mount type from OCI spec (bind, tmpfs, etc.)
OCIOptions []string `json:"oci_options,omitempty"` // Mount options from OCI spec
VolumeType string `json:"volume_type"` // emptyDir, pvc, configMap, secret, hostPath (best-effort)
VolumeName string `json:"volume_name"` // Kubernetes volume name (best-effort from path parsing)
FSType string `json:"fs_type"` // Filesystem type from mountinfo
ReadOnly bool `json:"read_only"` // Whether mount is read-only
}
// NamespaceMetadata stores information about one namespace of the
// checkpointed container. Entries are kept in CheckpointMetadata.Namespaces.
type NamespaceMetadata struct {
Type string `json:"type"` // net, pid, mnt, etc.
Inode uint64 `json:"inode"` // Namespace inode
IsExternal bool `json:"is_external"` // Whether namespace is external (shared)
}
// NewCheckpointMetadata returns metadata for the given checkpoint ID, stamped
// with the current UTC time. Mounts and Namespaces start as empty, non-nil
// slices so that they encode as JSON arrays rather than null.
func NewCheckpointMetadata(checkpointID string) *CheckpointMetadata {
	meta := new(CheckpointMetadata)
	meta.CheckpointID = checkpointID
	meta.CreatedAt = time.Now().UTC()
	meta.Mounts = []MountMetadata{}
	meta.Namespaces = []NamespaceMetadata{}
	return meta
}
// SaveMetadata serializes meta as indented JSON and writes it to
// MetadataFilename inside checkpointDir with mode 0644.
//
// It returns a wrapped error if marshalling or writing fails.
func SaveMetadata(checkpointDir string, meta *CheckpointMetadata) error {
	encoded, marshalErr := json.MarshalIndent(meta, "", " ")
	if marshalErr != nil {
		return fmt.Errorf("failed to marshal metadata: %w", marshalErr)
	}

	target := filepath.Join(checkpointDir, MetadataFilename)
	writeErr := os.WriteFile(target, encoded, 0644)
	if writeErr != nil {
		return fmt.Errorf("failed to write metadata file: %w", writeErr)
	}
	return nil
}
// LoadMetadata reads MetadataFilename from checkpointDir and decodes it.
//
// It returns a wrapped error if the file cannot be read or is not valid JSON
// for CheckpointMetadata.
func LoadMetadata(checkpointDir string) (*CheckpointMetadata, error) {
	raw, err := os.ReadFile(filepath.Join(checkpointDir, MetadataFilename))
	if err != nil {
		return nil, fmt.Errorf("failed to read metadata file: %w", err)
	}

	meta := new(CheckpointMetadata)
	if err = json.Unmarshal(raw, meta); err != nil {
		return nil, fmt.Errorf("failed to unmarshal metadata: %w", err)
	}
	return meta, nil
}
// SaveDescriptors encodes descriptors as a compact JSON array and writes it
// to DescriptorsFilename inside checkpointDir with mode 0600.
//
// It returns a wrapped error if marshalling or writing fails.
func SaveDescriptors(checkpointDir string, descriptors []string) error {
	encoded, marshalErr := json.Marshal(descriptors)
	if marshalErr != nil {
		return fmt.Errorf("failed to marshal descriptors: %w", marshalErr)
	}

	target := filepath.Join(checkpointDir, DescriptorsFilename)
	writeErr := os.WriteFile(target, encoded, 0600)
	if writeErr != nil {
		return fmt.Errorf("failed to write descriptors file: %w", writeErr)
	}
	return nil
}
// LoadDescriptors reads DescriptorsFilename from checkpointDir and decodes it
// into a slice of strings.
//
// It returns a wrapped error if the file cannot be read or does not hold a
// JSON array of strings.
func LoadDescriptors(checkpointDir string) ([]string, error) {
	raw, err := os.ReadFile(filepath.Join(checkpointDir, DescriptorsFilename))
	if err != nil {
		return nil, fmt.Errorf("failed to read descriptors file: %w", err)
	}

	var list []string
	if err = json.Unmarshal(raw, &list); err != nil {
		return nil, fmt.Errorf("failed to unmarshal descriptors: %w", err)
	}
	return list, nil
}
// GetCheckpointDir returns the directory of checkpointID under baseDir.
// The two components are joined and cleaned with filepath.Join; no check is
// made that the directory exists.
func GetCheckpointDir(baseDir, checkpointID string) string {
	dir := filepath.Join(baseDir, checkpointID)
	return dir
}
// ListCheckpoints returns the names of the immediate subdirectories of
// baseDir that contain a MetadataFilename entry.
//
// Non-directory entries are ignored, as are subdirectories whose metadata
// file cannot be stat'ed. The result is nil when nothing qualifies. An error
// is returned only if baseDir itself cannot be read.
func ListCheckpoints(baseDir string) ([]string, error) {
	dirEntries, readErr := os.ReadDir(baseDir)
	if readErr != nil {
		return nil, fmt.Errorf("failed to read checkpoint directory: %w", readErr)
	}

	var ids []string
	for i := range dirEntries {
		if !dirEntries[i].IsDir() {
			continue
		}
		name := dirEntries[i].Name()
		// A subdirectory counts as a checkpoint only if its metadata file is present.
		_, statErr := os.Stat(filepath.Join(baseDir, name, MetadataFilename))
		if statErr == nil {
			ids = append(ids, name)
		}
	}
	return ids, nil
}
// GetCheckpointInfo loads the metadata of checkpointID stored under baseDir.
// It resolves the checkpoint directory with GetCheckpointDir and returns
// whatever LoadMetadata returns for it.
func GetCheckpointInfo(baseDir, checkpointID string) (*CheckpointMetadata, error) {
	return LoadMetadata(GetCheckpointDir(baseDir, checkpointID))
}
// DeleteCheckpoint removes the directory of checkpointID under baseDir,
// including everything inside it.
//
// checkpointID must be a single, non-empty path component. An empty ID, ".",
// "..", or an ID containing a path separator is rejected with an error:
// joined onto baseDir those values would resolve to baseDir itself, to its
// parent, or to an unrelated directory, and os.RemoveAll would delete it.
//
// Removing a checkpoint that does not exist is not an error (os.RemoveAll
// semantics).
func DeleteCheckpoint(baseDir, checkpointID string) error {
	if checkpointID == "" || checkpointID == "." || checkpointID == ".." ||
		filepath.Base(checkpointID) != checkpointID {
		return fmt.Errorf("invalid checkpoint ID %q: must be a single path component", checkpointID)
	}
	// Same path that GetCheckpointDir produces for a valid ID.
	checkpointDir := filepath.Join(baseDir, checkpointID)
	return os.RemoveAll(checkpointDir)
}
// handlers.go provides HTTP handlers for the checkpoint agent server.
package httpApiServer
import (
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"path/filepath"
"time"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
)
// Handlers holds dependencies for HTTP handlers.
// Create it with NewHandlers.
type Handlers struct {
// cfg supplies the node name reported by /health and the checkpoint spec
// (base path, CRIU settings) used by the checkpoint endpoints.
cfg ServerConfig
// checkpointer performs the actual checkpoint in HandleCheckpoint.
checkpointer *checkpoint.Checkpointer
}
// NewHandlers builds a Handlers value that serves requests using the given
// server configuration and checkpointer.
func NewHandlers(cfg ServerConfig, checkpointer *checkpoint.Checkpointer) *Handlers {
	h := new(Handlers)
	h.cfg = cfg
	h.checkpointer = checkpointer
	return h
}
// HandleHealth handles GET /health requests.
//
// It replies 200 with a HealthResponse whose status is "healthy" and whose
// node name comes from the server configuration. Any other HTTP method gets
// 405 Method Not Allowed.
func (h *Handlers) HandleHealth(w http.ResponseWriter, r *http.Request) {
	switch r.Method {
	case http.MethodGet:
		writeJSON(w, http.StatusOK, HealthResponse{
			Status:   "healthy",
			NodeName: h.cfg.NodeName,
		})
	default:
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
	}
}
// HandleCheckpoint handles POST /checkpoint requests.
//
// The JSON request body is decoded into a CheckpointRequest:
//   - container_id is required; an empty value yields 400.
//   - checkpoint_id is optional and defaults to "ckpt-<unix-nanoseconds>".
//     A supplied ID must be a single path component (no separators, not "."
//     or ".."); anything else yields 400.
//   - disable_cuda clears CRIU.LibDir on a per-request copy of the spec.
//
// The handler blocks until the checkpointer finishes. On success it writes
// the checkpoint.done marker into the checkpoint directory and replies 200;
// a checkpoint or marker failure yields 500. Non-POST methods get 405.
func (h *Handlers) HandleCheckpoint(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	var req CheckpointRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		writeJSON(w, http.StatusBadRequest, CheckpointResponse{
			Success: false,
			Error:   fmt.Sprintf("Invalid request body: %v", err),
		})
		return
	}

	if req.ContainerID == "" {
		writeJSON(w, http.StatusBadRequest, CheckpointResponse{
			Success: false,
			Error:   "container_id is required",
		})
		return
	}

	switch {
	case req.CheckpointID == "":
		req.CheckpointID = fmt.Sprintf("ckpt-%d", time.Now().UnixNano())
	case req.CheckpointID == "." || req.CheckpointID == ".." ||
		filepath.Base(req.CheckpointID) != req.CheckpointID:
		// The ID comes straight from the request body and names a directory
		// under the checkpoint base path (HandleListCheckpoints joins it onto
		// BasePath), so it must not be able to point anywhere else.
		writeJSON(w, http.StatusBadRequest, CheckpointResponse{
			Success: false,
			Error:   "checkpoint_id must be a single path component",
		})
		return
	}

	// Build checkpoint params
	params := checkpoint.CheckpointRequest{
		ContainerID:   req.ContainerID,
		ContainerName: req.ContainerName,
		CheckpointID:  req.CheckpointID,
		CheckpointDir: h.cfg.CheckpointSpec.BasePath,
		NodeName:      h.cfg.NodeName,
		PodName:       req.PodName,
		PodNamespace:  req.PodNamespace,
	}

	// Copy checkpoint spec and disable CUDA if requested.
	// NOTE(review): this is a shallow copy; it only isolates the change if
	// CRIU is held by value inside CheckpointSpec — confirm.
	checkpointSpec := *h.cfg.CheckpointSpec
	if req.DisableCUDA {
		checkpointSpec.CRIU.LibDir = ""
	}

	ctx := r.Context()
	result, err := h.checkpointer.Checkpoint(ctx, params, &checkpointSpec)
	if err != nil {
		log.Printf("Checkpoint failed: %v", err)
		writeJSON(w, http.StatusInternalServerError, CheckpointResponse{
			Success: false,
			Error:   err.Error(),
		})
		return
	}

	// Write checkpoint.done marker so restore-entrypoint can detect this checkpoint.
	// The restore side parses this file as JSON and only proceeds when the
	// "success" field is true, so the marker must be a JSON object; a bare
	// timestamp would never be accepted.
	checkpointDonePath := filepath.Join(result.CheckpointDir, checkpoint.CheckpointDoneFilename)
	marker, err := json.Marshal(struct {
		Success     bool   `json:"success"`
		CompletedAt string `json:"completed_at"`
	}{
		Success:     true,
		CompletedAt: time.Now().Format(time.RFC3339),
	})
	if err == nil {
		err = os.WriteFile(checkpointDonePath, marker, 0644)
	}
	if err != nil {
		log.Printf("Failed to write checkpoint.done marker: %v", err)
		writeJSON(w, http.StatusInternalServerError, CheckpointResponse{
			Success: false,
			Error:   fmt.Sprintf("Checkpoint succeeded but failed to write done marker: %v", err),
		})
		return
	}
	log.Printf("Wrote checkpoint.done marker: %s", checkpointDonePath)

	log.Printf("Checkpoint successful: %s", result.CheckpointID)
	writeJSON(w, http.StatusOK, CheckpointResponse{
		Success:      true,
		CheckpointID: result.CheckpointID,
		Message:      fmt.Sprintf("Checkpoint created successfully at %s", result.CheckpointDir),
	})
}
// HandleListCheckpoints handles GET /checkpoints requests.
//
// It lists the checkpoint IDs under the configured base path, reads each
// checkpoint's manifest and replies 200 with a ListCheckpointsResponse.
// Checkpoints whose manifest cannot be read are logged and left out of the
// result. If the base path cannot be listed the reply is 500 with an
// {"error": ...} object. Non-GET methods get 405.
func (h *Handlers) HandleListCheckpoints(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	basePath := h.cfg.CheckpointSpec.BasePath
	checkpointIDs, err := checkpoint.ListCheckpoints(basePath)
	if err != nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{
			"error": err.Error(),
		})
		return
	}

	// Start from an empty, non-nil slice: a nil slice would be encoded as
	// "checkpoints": null instead of an empty JSON array.
	checkpoints := make([]CheckpointInfo, 0, len(checkpointIDs))
	for _, id := range checkpointIDs {
		meta, err := checkpoint.ReadCheckpointManifest(filepath.Join(basePath, id))
		if err != nil {
			// Best effort: one unreadable checkpoint must not hide the others.
			log.Printf("Skipping checkpoint %s: failed to read manifest: %v", id, err)
			continue
		}
		checkpoints = append(checkpoints, CheckpointInfo{
			ID:           meta.CheckpointID,
			CreatedAt:    meta.CreatedAt,
			SourceNode:   meta.K8s.SourceNode,
			ContainerID:  meta.K8s.ContainerID,
			PodName:      meta.K8s.PodName,
			PodNamespace: meta.K8s.PodNamespace,
		})
	}

	writeJSON(w, http.StatusOK, ListCheckpointsResponse{
		Checkpoints: checkpoints,
	})
}
// writeJSON sends data as a JSON response with the given HTTP status code.
//
// The Content-Type header is set before the status line is written. Because
// the status has already been sent by the time encoding starts, an encoding
// or write failure cannot be turned into an error response; it is logged
// instead of being silently dropped.
func writeJSON(w http.ResponseWriter, status int, data interface{}) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	if err := json.NewEncoder(w).Encode(data); err != nil {
		log.Printf("Failed to write JSON response: %v", err)
	}
}
// middleware.go provides HTTP middleware for the server.
package httpApiServer
import (
"log"
"net/http"
"time"
)
// LoggingMiddleware returns a handler that logs the method and path of each
// request before passing it to next, and logs them again together with the
// elapsed time once next has returned.
func LoggingMiddleware(next http.Handler) http.Handler {
	logged := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()
		log.Printf("Started %s %s", r.Method, r.URL.Path)

		next.ServeHTTP(w, r)

		took := time.Since(began)
		log.Printf("Completed %s %s in %v", r.Method, r.URL.Path, took)
	}
	return http.HandlerFunc(logged)
}
// server.go provides the HTTP server for the checkpoint agent.
package httpApiServer
import (
"context"
"log"
"net/http"
"time"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
)
// ServerConfig holds the configuration for the HTTP API server.
type ServerConfig struct {
// ListenAddr is the address the HTTP server binds to; it is also what
// Server.Addr reports.
ListenAddr string
// NodeName is reported by /health and passed along with checkpoint requests.
NodeName string
// CheckpointSpec provides the checkpoint base path and CRIU settings.
// NewServer and the handlers dereference it without a nil check, so it
// must be set.
CheckpointSpec *checkpoint.CheckpointSpec
}
// Server is the HTTP API server for checkpoint operations.
// Create it with NewServer, run it with Start and stop it with Shutdown.
type Server struct {
// cfg is the configuration the server was created with.
cfg ServerConfig
// handlers holds the endpoint implementations registered on the mux.
handlers *Handlers
// httpServer is the underlying net/http server.
httpServer *http.Server
}
// NewServer builds a Server that exposes /health, /checkpoint and
// /checkpoints, with every request passing through LoggingMiddleware.
//
// The write timeout is the CRIU timeout from cfg.CheckpointSpec (in seconds)
// plus 60 seconds, but never less than 300 seconds, because /checkpoint does
// not respond until the dump has finished. cfg.CheckpointSpec must be non-nil.
func NewServer(cfg ServerConfig, checkpointer *checkpoint.Checkpointer) *Server {
	const (
		// Extra time for the work around the CRIU dump itself.
		writeTimeoutBuffer = 60 * time.Second
		// Lower bound applied regardless of the configured CRIU timeout.
		minWriteTimeout = 300 * time.Second
	)

	handlers := NewHandlers(cfg, checkpointer)

	// Setup routes
	router := http.NewServeMux()
	router.HandleFunc("/health", handlers.HandleHealth)
	router.HandleFunc("/checkpoint", handlers.HandleCheckpoint)
	router.HandleFunc("/checkpoints", handlers.HandleListCheckpoints)

	writeTimeout := time.Duration(cfg.CheckpointSpec.CRIU.Timeout)*time.Second + writeTimeoutBuffer
	if writeTimeout < minWriteTimeout {
		writeTimeout = minWriteTimeout
	}

	srv := &Server{cfg: cfg, handlers: handlers}
	srv.httpServer = &http.Server{
		Addr:         cfg.ListenAddr,
		Handler:      LoggingMiddleware(router),
		ReadTimeout:  30 * time.Second,
		WriteTimeout: writeTimeout,
		IdleTimeout:  120 * time.Second,
	}
	return srv
}
// Start logs the listen address and then serves HTTP on it.
// It blocks until the server stops and returns the error from
// ListenAndServe, which is always non-nil (http.ErrServerClosed after a
// Shutdown).
func (s *Server) Start() error {
	log.Printf("HTTP API server listening on %s", s.cfg.ListenAddr)
	serveErr := s.httpServer.ListenAndServe()
	return serveErr
}
// Shutdown logs that the server is stopping and delegates to the underlying
// http.Server's graceful Shutdown, returning its error. ctx bounds how long
// the shutdown may take.
func (s *Server) Shutdown(ctx context.Context) error {
	log.Println("Shutting down HTTP server...")
	shutdownErr := s.httpServer.Shutdown(ctx)
	return shutdownErr
}
// Addr reports the listen address the server was configured with. This is
// the configured string, not the address of a bound listener.
func (s *Server) Addr() string {
	addr := s.cfg.ListenAddr
	return addr
}
// Package httpApiServer provides HTTP server functionality for the checkpoint agent.
package httpApiServer
import "time"
// CheckpointRequest is the request body for checkpoint operations
// (POST /checkpoint).
type CheckpointRequest struct {
// ContainerID is required; HandleCheckpoint rejects an empty value with 400.
ContainerID string `json:"container_id"`
ContainerName string `json:"container_name,omitempty"` // K8s container name (for volume type lookup)
// CheckpointID is optional; HandleCheckpoint generates "ckpt-<unix-nanos>"
// when it is empty.
CheckpointID string `json:"checkpoint_id"`
PodName string `json:"pod_name,omitempty"`
PodNamespace string `json:"pod_namespace,omitempty"`
DisableCUDA bool `json:"disable_cuda,omitempty"` // Disable CUDA plugin for non-GPU workloads (clears CRIU.LibDir for this request)
}
// CheckpointResponse is the response for checkpoint operations.
// On success HandleCheckpoint fills CheckpointID and Message; on failure it
// sets Success to false and fills Error. Empty optional fields are omitted
// from the JSON.
type CheckpointResponse struct {
Success bool `json:"success"`
CheckpointID string `json:"checkpoint_id,omitempty"`
Message string `json:"message,omitempty"`
Error string `json:"error,omitempty"`
}
// CheckpointInfo represents information about a checkpoint as returned by
// GET /checkpoints. HandleListCheckpoints fills it from the checkpoint's
// manifest.
type CheckpointInfo struct {
ID string `json:"id"`
CreatedAt time.Time `json:"created_at"`
SourceNode string `json:"source_node"`
ContainerID string `json:"container_id"`
PodName string `json:"pod_name"`
PodNamespace string `json:"pod_namespace"`
}
// ListCheckpointsResponse is the response for list checkpoints
// (GET /checkpoints).
type ListCheckpointsResponse struct {
// Checkpoints has no omitempty, so a nil slice is encoded as JSON null
// rather than an empty array.
Checkpoints []CheckpointInfo `json:"checkpoints"`
}
// HealthResponse is the response for health check (GET /health).
type HealthResponse struct {
// Status is set to "healthy" by HandleHealth.
Status string `json:"status"`
// NodeName is the node name from the server configuration.
NodeName string `json:"node_name"`
}
// config.go defines the RestoreRequest struct for CRIU restore operations.
// CRIU options come from the saved CheckpointManifest, not from this request.
//
// The restore-entrypoint runs in placeholder containers which do NOT mount the
// ConfigMap. Static defaults are hardcoded here; per-pod dynamic values come
// from environment variables injected by the operator.
package restore
import (
"context"
"encoding/json"
"fmt"
"os"
"strings"
"time"
"github.com/sirupsen/logrus"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
)
// Fixed file names and paths used by the restore entrypoint.
const (
// RestoreLogFilename is the CRIU restore log filename.
RestoreLogFilename = "restore.log"
// CRIULogDir is the directory where CRIU restore logs are copied for debugging.
CRIULogDir = "/checkpoints/restore-logs"
// RestoreTriggerPath is the default path to the trigger file for trigger-based restore.
// NewRestoreRequest uses it as RestoreRequest.RestoreTrigger; ShouldRestore
// reads a checkpoint path from that file.
RestoreTriggerPath = "/tmp/restore-trigger"
)
// RestoreRequest holds runtime request inputs for the restore entrypoint.
// CRIU options are NOT stored here - they come from the saved CheckpointManifest.
// NewRestoreRequest populates it from environment variables and defaults.
type RestoreRequest struct {
// === Per-pod dynamic values (from operator-injected env vars) ===
// CheckpointPath is the base directory containing checkpoints
// (env DYN_CHECKPOINT_PATH).
CheckpointPath string
// CheckpointHash is the ID/hash of the checkpoint to restore
// (env DYN_CHECKPOINT_HASH).
CheckpointHash string
// CheckpointLocation is the full resolved path to the checkpoint directory.
// Taken from env DYN_CHECKPOINT_LOCATION when set; otherwise built as
// CheckpointPath + "/" + CheckpointHash when both are non-empty.
CheckpointLocation string
// SkipWaitForCheckpoint controls the entrypoint behavior.
// True when env SKIP_WAIT_FOR_CHECKPOINT is exactly "1".
SkipWaitForCheckpoint bool
// ColdStartArgs is the command+args to exec if no checkpoint is available.
// NewRestoreRequest stores its args parameter here.
ColdStartArgs []string
// Debug enables debug logging. True when env DEBUG is exactly "1".
Debug bool
// === Remaining settings ===
// RestoreMarkerFilePath is where restore-entrypoint writes a marker before CRIU restore.
// It is read from env DYN_RESTORE_MARKER_FILE, which NewRestoreRequest requires.
RestoreMarkerFilePath string
// RestoreTrigger is the path to the trigger file that signals restore should start.
// NewRestoreRequest sets it to RestoreTriggerPath.
RestoreTrigger string
// WaitTimeout is the maximum time to wait for a checkpoint.
// Zero means wait indefinitely. NewRestoreRequest leaves it at zero.
WaitTimeout time.Duration
}
// ConfigError reports that a configuration field failed validation.
type ConfigError struct {
	// Field names the offending setting.
	Field string
	// Message explains what is wrong with it.
	Message string
}

// Error implements the error interface, formatting the error as
// "config error: <Field>: <Message>".
func (e *ConfigError) Error() string {
	const layout = "config error: %s: %s"
	return fmt.Sprintf(layout, e.Field, e.Message)
}
// NewRestoreRequest builds a RestoreRequest from the process environment.
//
// args becomes ColdStartArgs and RestoreTrigger is set to RestoreTriggerPath.
// Environment variables read:
//   - DYN_RESTORE_MARKER_FILE (required): RestoreMarkerFilePath
//   - DYN_CHECKPOINT_PATH, DYN_CHECKPOINT_HASH: base directory and checkpoint ID
//   - DYN_CHECKPOINT_LOCATION: full checkpoint path; when unset it is derived
//     as path + "/" + hash if both are present
//   - SKIP_WAIT_FOR_CHECKPOINT, DEBUG: enabled only by the exact value "1"
//
// It returns a *ConfigError when DYN_RESTORE_MARKER_FILE is empty or unset.
func NewRestoreRequest(args []string) (*RestoreRequest, error) {
	markerPath := os.Getenv("DYN_RESTORE_MARKER_FILE")
	if markerPath == "" {
		return nil, &ConfigError{
			Field:   "DYN_RESTORE_MARKER_FILE",
			Message: "must be set",
		}
	}

	req := &RestoreRequest{
		CheckpointPath:        os.Getenv("DYN_CHECKPOINT_PATH"),
		CheckpointHash:        os.Getenv("DYN_CHECKPOINT_HASH"),
		CheckpointLocation:    os.Getenv("DYN_CHECKPOINT_LOCATION"),
		SkipWaitForCheckpoint: os.Getenv("SKIP_WAIT_FOR_CHECKPOINT") == "1",
		ColdStartArgs:         args,
		Debug:                 os.Getenv("DEBUG") == "1",
		RestoreMarkerFilePath: markerPath,
		RestoreTrigger:        RestoreTriggerPath,
	}

	// No explicit location: fall back to <path>/<hash> when both are known.
	if req.CheckpointLocation == "" && req.CheckpointPath != "" && req.CheckpointHash != "" {
		req.CheckpointLocation = req.CheckpointPath + "/" + req.CheckpointHash
	}

	return req, nil
}
// checkpointDoneMarker is the JSON content of the checkpoint.done marker file
// as read by checkpointDoneSucceeded.
type checkpointDoneMarker struct {
// Success must be true for the checkpoint to be considered restorable.
Success bool `json:"success"`
// Error optionally carries the failure reason; it is logged when Success is false.
Error string `json:"error,omitempty"`
}
// checkpointDoneSucceeded reports whether the checkpoint.done marker at
// donePath records a successful checkpoint.
//
// It returns false, after logging a warning, when the file cannot be read,
// is not valid JSON for checkpointDoneMarker, or has success=false.
func checkpointDoneSucceeded(donePath string, log *logrus.Entry) bool {
	raw, readErr := os.ReadFile(donePath)
	if readErr != nil {
		log.WithError(readErr).WithField("path", donePath).Warn("Failed to read checkpoint.done marker")
		return false
	}

	marker := checkpointDoneMarker{}
	if parseErr := json.Unmarshal(raw, &marker); parseErr != nil {
		log.WithError(parseErr).WithField("path", donePath).Warn("Failed to parse checkpoint.done marker")
		return false
	}

	if marker.Success {
		return true
	}

	// Failed checkpoint: include the recorded reason, if any, in the warning.
	fields := logrus.Fields{"path": donePath}
	if marker.Error != "" {
		fields["error"] = marker.Error
	}
	log.WithFields(fields).Warn("checkpoint.done marker reports failed checkpoint")
	return false
}
// ShouldRestore checks if a restore should be performed.
// Returns the checkpoint path and true if restore should proceed.
//
// Two sources are tried in order:
//  1. cfg.CheckpointLocation, when set, if it contains a checkpoint.done
//     marker that reports success.
//  2. The path stored in the cfg.RestoreTrigger file, under the same marker
//     condition.
//
// If neither yields a successful marker it returns "" and false.
func ShouldRestore(cfg *RestoreRequest, log *logrus.Entry) (string, bool) {
	// Method 1: Checkpoint location is set and checkpoint is fully complete
	if cfg.CheckpointLocation != "" {
		donePath := cfg.CheckpointLocation + "/" + checkpoint.CheckpointDoneFilename
		if _, err := os.Stat(donePath); err == nil {
			if checkpointDoneSucceeded(donePath, log) {
				log.WithField("path", cfg.CheckpointLocation).Info("Checkpoint found (checkpoint.done success=true)")
				return cfg.CheckpointLocation, true
			}
			// The marker exists but is unreadable, unparsable or reports a
			// failure; checkpointDoneSucceeded has already logged the reason.
			// Do not also claim below that the marker is missing.
		} else {
			// Marker could not be found: check for the manifest but warn
			// about a potential race with a checkpoint still in progress.
			manifestPath := cfg.CheckpointLocation + "/" + checkpoint.CheckpointManifestFilename
			if _, err := os.Stat(manifestPath); err == nil {
				log.WithFields(logrus.Fields{
					"path":    cfg.CheckpointLocation,
					"warning": "checkpoint.done marker not found, checkpoint may be incomplete",
				}).Warn("Checkpoint manifest found but checkpoint.done missing - checkpoint may still be in progress")
			}
		}
	}

	// Method 2: Restore trigger file exists with checkpoint path
	if cfg.RestoreTrigger != "" {
		data, err := os.ReadFile(cfg.RestoreTrigger)
		if err == nil {
			checkpointPath := strings.TrimSpace(string(data))
			if checkpointPath != "" {
				donePath := checkpointPath + "/" + checkpoint.CheckpointDoneFilename
				if _, err := os.Stat(donePath); err == nil {
					if checkpointDoneSucceeded(donePath, log) {
						log.WithField("path", checkpointPath).Info("Restore triggered via file (checkpoint.done success=true)")
						return checkpointPath, true
					}
				}
			}
		}
	}

	return "", false
}
// WaitForCheckpoint polls ShouldRestore once per second until a checkpoint is
// ready and returns its path.
//
// The first poll happens after the first one-second tick. A progress message
// is logged roughly every 30 seconds. If cfg.WaitTimeout is positive, waiting
// stops with an error once that much time has elapsed; if it is zero, waiting
// continues until ctx is cancelled, in which case ctx.Err() is returned.
func WaitForCheckpoint(ctx context.Context, cfg *RestoreRequest, log *logrus.Entry) (string, error) {
	if cfg.WaitTimeout <= 0 {
		log.Info("Waiting for checkpoint indefinitely")
	} else {
		log.WithField("timeout", cfg.WaitTimeout).Info("Waiting for checkpoint")
	}

	const progressInterval = 30 * time.Second

	began := time.Now()
	poll := time.NewTicker(time.Second)
	defer poll.Stop()
	lastProgress := time.Now()

	for {
		select {
		case <-ctx.Done():
			return "", ctx.Err()
		case <-poll.C:
		}

		path, ready := ShouldRestore(cfg, log)
		if ready {
			return path, nil
		}

		// Log progress every 30 seconds
		if time.Since(lastProgress) >= progressInterval {
			log.WithField("elapsed", time.Since(began)).Info("Still waiting for checkpoint...")
			lastProgress = time.Now()
		}

		// Only enforce deadline if WaitTimeout is set (non-zero)
		if cfg.WaitTimeout > 0 && time.Since(began) >= cfg.WaitTimeout {
			return "", fmt.Errorf("timed out waiting for checkpoint after %s", cfg.WaitTimeout)
		}
	}
}
...@@ -12,28 +12,42 @@ import ( ...@@ -12,28 +12,42 @@ import (
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common" "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
) )
// CRIURestoreConfig holds configuration for CRIU restore operations. // CRIURestorePlan holds configuration for CRIU restore operations.
// Most options are always-on with safe defaults for K8s environments. // Most fields come from the saved CheckpointManifest.CRIUDump.CRIU settings.
type CRIURestoreConfig struct { type CRIURestorePlan struct {
ImageDirFD int32 // File descriptors
RootPath string ImageDirFD int32
LogLevel int32 WorkDirFD int32
LogFile string NetNsFD int32
WorkDirFD int32
NetNsFD int32 // Paths
RootPath string
LogFile string
// Options from CheckpointManifest.CRIUDump.CRIU.
LogLevel int32
Timeout uint32 // CRIU timeout in seconds (0 = no timeout, required for CUDA)
ShellJob bool // Allow session leaders (containers are often session leaders)
TcpClose bool // Close TCP connections (pod IPs change on restore)
FileLocks bool // Allow file locks
ExtUnixSk bool // Allow external Unix sockets
LinkRemap bool // Handle deleted-but-open files via CRIU link remap
ManageCgroupsMode string // Cgroup handling mode: "ignore" lets K8s manage cgroups
// External mount mappings (from CheckpointManifest.CRIUDump.ExtMnt).
ExtMountMaps []*criurpc.ExtMountMap ExtMountMaps []*criurpc.ExtMountMap
} }
// OpenImageDir opens a checkpoint directory and clears CLOEXEC for CRIU. // OpenImageDir opens a checkpoint directory and clears CLOEXEC for CRIU.
// Returns the opened file and its FD. Caller must close the file when done. // Returns the opened file and its FD. Caller must close the file when done.
func OpenImageDir(checkpointPath string) (*os.File, int32, error) { func OpenImageDir(checkpointPath string) (*os.File, int32, error) {
return common.OpenDirForCRIU(checkpointPath) return common.OpenPathForCRIU(checkpointPath)
} }
// OpenNetworkNamespace opens the target network namespace for restore. // OpenNetworkNamespace opens the target network namespace for restore.
// Returns the opened file and its FD. Caller must close the file when done. // Returns the opened file and its FD. Caller must close the file when done.
func OpenNetworkNamespace(nsPath string) (*os.File, int32, error) { func OpenNetworkNamespace(nsPath string) (*os.File, int32, error) {
return common.OpenDirForCRIU(nsPath) return common.OpenPathForCRIU(nsPath)
} }
// OpenWorkDir opens a work directory for CRIU and clears CLOEXEC. // OpenWorkDir opens a work directory for CRIU and clears CLOEXEC.
...@@ -65,62 +79,87 @@ func OpenWorkDir(workDir string, log *logrus.Entry) (*os.File, int32) { ...@@ -65,62 +79,87 @@ func OpenWorkDir(workDir string, log *logrus.Entry) (*os.File, int32) {
return workDirFile, int32(workDirFile.Fd()) return workDirFile, int32(workDirFile.Fd())
} }
// BuildRestoreCRIUOpts creates CRIU options for restore from a config struct. // BuildCRIURestoreOptions creates CRIU options for restore from a runtime plan.
// //
// Always-on options for K8s: // Options from CheckpointManifest.CRIUDump.CRIU (saved at checkpoint time):
// - ShellJob: containers are often session leaders // - ShellJob, TcpClose, FileLocks, ExtUnixSk, LinkRemap, ManageCgroupsMode
// - TcpClose: pod IPs change on restore/migration //
// - FileLocks: applications use file locks // Hardcoded restore-specific options:
// - ExtUnixSk: containers have external Unix sockets // - RstSibling: restore in detached mode
// - ManageCgroups (IGNORE): let K8s manage cgroups // - MntnsCompatMode: cross-container restore
func BuildRestoreCRIUOpts(cfg CRIURestoreConfig) *criurpc.CriuOpts { // - EvasiveDevices, ForceIrmap: device/inode handling
cgMode := criurpc.CriuCgMode_IGNORE func BuildCRIURestoreOptions(plan CRIURestorePlan) *criurpc.CriuOpts {
// Map cgroup management mode from plan.
var cgMode criurpc.CriuCgMode
switch plan.ManageCgroupsMode {
case "soft":
cgMode = criurpc.CriuCgMode_SOFT
case "full":
cgMode = criurpc.CriuCgMode_FULL
case "strict":
cgMode = criurpc.CriuCgMode_STRICT
case "ignore", "":
cgMode = criurpc.CriuCgMode_IGNORE
default:
cgMode = criurpc.CriuCgMode_IGNORE
}
criuOpts := &criurpc.CriuOpts{ criuOpts := &criurpc.CriuOpts{
ImagesDirFd: proto.Int32(cfg.ImageDirFD), ImagesDirFd: proto.Int32(plan.ImageDirFD),
LogLevel: proto.Int32(cfg.LogLevel), LogLevel: proto.Int32(plan.LogLevel),
LogFile: proto.String(cfg.LogFile), LogFile: proto.String(plan.LogFile),
// Root filesystem - use current container's root // Root filesystem - use current container's root
Root: proto.String(cfg.RootPath), Root: proto.String(plan.RootPath),
// Restore in detached mode - process runs in background // Restore in detached mode - process runs in background (restore-specific)
RstSibling: proto.Bool(true), RstSibling: proto.Bool(true),
// Mount namespace compatibility mode for cross-container restore // Mount namespace mode:
MntnsCompatMode: proto.Bool(true), // - MntnsCompatMode=false (default): Uses mount-v2 with MOVE_MOUNT_SET_GROUP (kernel 5.15+)
// This is preferred as it doesn't create temp dirs in /tmp
// Always-on for K8s environments // - MntnsCompatMode=true: Uses compat mode which creates /tmp/cr-tmpfs.XXX
ShellJob: proto.Bool(true), // This can cause "Device or resource busy" errors on cleanup
TcpClose: proto.Bool(true), // We explicitly set to false to use mount-v2 (requires kernel 5.15+)
FileLocks: proto.Bool(true), MntnsCompatMode: proto.Bool(false),
ExtUnixSk: proto.Bool(true),
// Options from saved CheckpointManifest.CRIUDump.CRIU.
// Cgroup management - ignore to avoid conflicts ShellJob: proto.Bool(plan.ShellJob),
TcpClose: proto.Bool(plan.TcpClose),
FileLocks: proto.Bool(plan.FileLocks),
ExtUnixSk: proto.Bool(plan.ExtUnixSk),
LinkRemap: proto.Bool(plan.LinkRemap),
// Cgroup management from saved settings.
ManageCgroups: proto.Bool(true), ManageCgroups: proto.Bool(true),
ManageCgroupsMode: &cgMode, ManageCgroupsMode: &cgMode,
// Device and inode handling // Device and inode handling (restore-specific)
EvasiveDevices: proto.Bool(true), EvasiveDevices: proto.Bool(true),
ForceIrmap: proto.Bool(true), ForceIrmap: proto.Bool(true),
// External mount mappings // External mount mappings
ExtMnt: cfg.ExtMountMaps, ExtMnt: plan.ExtMountMaps,
} }
// Add network namespace inheritance if provided // Add network namespace inheritance if provided
if cfg.NetNsFD >= 0 { if plan.NetNsFD >= 0 {
criuOpts.InheritFd = []*criurpc.InheritFd{ criuOpts.InheritFd = []*criurpc.InheritFd{
{ {
Key: proto.String("extNetNs"), Key: proto.String("extNetNs"),
Fd: proto.Int32(cfg.NetNsFD), Fd: proto.Int32(plan.NetNsFD),
}, },
} }
} }
// Add work directory if specified // Add work directory if specified
if cfg.WorkDirFD >= 0 { if plan.WorkDirFD >= 0 {
criuOpts.WorkDirFd = proto.Int32(cfg.WorkDirFD) criuOpts.WorkDirFd = proto.Int32(plan.WorkDirFD)
}
// Add timeout if specified (required for CUDA restores)
if plan.Timeout > 0 {
criuOpts.Timeout = proto.Uint32(plan.Timeout)
} }
return criuOpts return criuOpts
......
...@@ -8,19 +8,14 @@ import ( ...@@ -8,19 +8,14 @@ import (
"path/filepath" "path/filepath"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
)
const ( "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
// RootfsDiffFilename is the name of the rootfs diff tar file
RootfsDiffFilename = "rootfs-diff.tar"
// DeletedFilesFilename is the name of the deleted files JSON
DeletedFilesFilename = "deleted-files.json"
) )
// ApplyRootfsDiff extracts the rootfs-diff.tar from the checkpoint to the target root. // ApplyRootfsDiff extracts the rootfs-diff.tar from the checkpoint to the target root.
// This restores filesystem changes that were made in the original container. // This restores filesystem changes that were made in the original container.
func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error { func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error {
rootfsDiffPath := filepath.Join(checkpointPath, RootfsDiffFilename) rootfsDiffPath := filepath.Join(checkpointPath, checkpoint.RootfsDiffFilename)
// Check if rootfs-diff.tar exists // Check if rootfs-diff.tar exists
if _, err := os.Stat(rootfsDiffPath); os.IsNotExist(err) { if _, err := os.Stat(rootfsDiffPath); os.IsNotExist(err) {
...@@ -30,15 +25,10 @@ func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error ...@@ -30,15 +25,10 @@ func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error
log.WithField("path", rootfsDiffPath).Info("Applying rootfs diff") log.WithField("path", rootfsDiffPath).Info("Applying rootfs diff")
// Build tar command with options to handle conflicts: // Exclusions are already applied at checkpoint time (bind mounts, system dirs, etc.)
// --keep-old-files: Don't overwrite existing files (may already be mounted) // so we just extract with --keep-old-files to avoid overwriting existing files.
// Exclude paths that are typically mounted read-only by the container runtime
cmd := exec.Command("tar", cmd := exec.Command("tar",
"--keep-old-files", "--keep-old-files",
"--exclude=./run/secrets",
"--exclude=./etc/resolv.conf",
"--exclude=./etc/hostname",
"--exclude=./etc/hosts",
"-C", targetRoot, "-C", targetRoot,
"-xf", rootfsDiffPath, "-xf", rootfsDiffPath,
) )
...@@ -61,7 +51,7 @@ func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error ...@@ -61,7 +51,7 @@ func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error
// ApplyDeletedFiles removes files that were deleted in the original container. // ApplyDeletedFiles removes files that were deleted in the original container.
// These are tracked via overlay whiteout markers (.wh.<filename>). // These are tracked via overlay whiteout markers (.wh.<filename>).
func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) error { func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) error {
deletedFilesPath := filepath.Join(checkpointPath, DeletedFilesFilename) deletedFilesPath := filepath.Join(checkpointPath, checkpoint.DeletedFilesFilename)
// Check if deleted-files.json exists // Check if deleted-files.json exists
data, err := os.ReadFile(deletedFilesPath) data, err := os.ReadFile(deletedFilesPath)
...@@ -109,8 +99,5 @@ func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) err ...@@ -109,8 +99,5 @@ func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) err
func CheckpointFilesExist(checkpointPath string) bool { func CheckpointFilesExist(checkpointPath string) bool {
// Check for CRIU image files (core-*.img is always present) // Check for CRIU image files (core-*.img is always present)
matches, err := filepath.Glob(filepath.Join(checkpointPath, "core-*.img")) matches, err := filepath.Glob(filepath.Join(checkpointPath, "core-*.img"))
if err != nil || len(matches) == 0 { return err == nil && len(matches) > 0
return false
}
return true
} }
// Package restore provides CRIU restore operations.
package restore
import (
"encoding/binary"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/checkpoint-restore/go-criu/v7/crit"
"github.com/checkpoint-restore/go-criu/v7/crit/images/fdinfo"
"github.com/checkpoint-restore/go-criu/v7/crit/images/regfile"
remap_file_path "github.com/checkpoint-restore/go-criu/v7/crit/images/remap-file-path"
"github.com/sirupsen/logrus"
"google.golang.org/protobuf/proto"
)
// CreateLinkRemapStubs reads the CRIU images under checkpointPath, finds every
// remapped file, and creates the link_remap stub files CRIU needs at restore.
//
// Why stubs are needed: when a file has been unlink()'d while a process still
// holds an open FD to it, CRIU uses "link remapping":
//
//   - at dump time it creates a hardlink link_remap.<id> -> original_file
//   - at restore time it calls linkat(link_remap.<id>, original_path)
//
// The link_remap file lives only on the filesystem of the node that took the
// dump, so a restore on another node has nothing to link from and fails with:
//
//	"Can't link <path>/link_remap.X -> <path>/original: No such file or directory"
//
// A missing remap-fpath.img, or one without entries, is not an error. Failing
// to parse both reg-files.img and files.img while remap entries exist is.
// Failures on individual stubs are logged as warnings and skipped.
func CreateLinkRemapStubs(checkpointPath string, log *logrus.Entry) error {
	// Step 1: collect the remap records; no image or no records means no work.
	remaps, err := parseRemapFpath(filepath.Join(checkpointPath, "remap-fpath.img"))
	switch {
	case err == nil:
	case os.IsNotExist(err):
		log.Debug("No remap-fpath.img found, no link_remap stubs needed")
		return nil
	default:
		return fmt.Errorf("failed to parse remap-fpath.img: %w", err)
	}
	if len(remaps) == 0 {
		log.Debug("No file remaps found in checkpoint")
		return nil
	}

	// Step 2: build the ID -> fileInfo table. reg-files.img (older CRIU format)
	// is tried first, files.img (newer format) is the fallback.
	files, err := parseRegFilesWithMode(filepath.Join(checkpointPath, "reg-files.img"))
	if err != nil {
		log.WithError(err).Debug("Could not parse reg-files.img, trying files.img")
		files, err = parseFilesImgWithMode(filepath.Join(checkpointPath, "files.img"))
		if err != nil {
			log.WithError(err).WithField("remap_count", len(remaps)).Warn(
				"Found remap entries but could not parse reg-files.img or files.img — link_remap stubs will not be created")
			return fmt.Errorf("found %d remap entries but could not build file map: %w", len(remaps), err)
		}
	}

	// Step 3: create one stub per remap record.
	var stubNames []string
	for _, r := range remaps {
		orig, found := files[r.origID]
		if !found {
			log.WithField("orig_id", r.origID).Debug("Original file ID not found in file map, skipping")
			continue
		}

		// The stub is the link_remap.XXX file that CRIU will hardlink FROM.
		var (
			stubPath string
			stubMode os.FileMode
		)
		if known, found := files[r.remapID]; found {
			stubPath, stubMode = known.name, known.mode
		} else {
			// The remap ID is absent from the table: derive the path instead.
			// CRIU places link_remap.<remap_id> next to the original file.
			parent := filepath.Dir(orig.name)
			if !strings.HasPrefix(parent, "/") {
				parent = "/" + parent
			}
			stubPath = filepath.Join(parent, fmt.Sprintf("link_remap.%d", r.remapID))
			// No recorded mode for the remap file, so borrow the original's.
			stubMode = orig.mode
			log.WithFields(logrus.Fields{
				"orig_id":    r.origID,
				"remap_id":   r.remapID,
				"orig_path":  orig.name,
				"remap_path": stubPath,
				"mode":       fmt.Sprintf("%04o", stubMode),
			}).Debug("Constructed link_remap path from remap ID")
		}

		// Image paths may lack the leading slash.
		if !strings.HasPrefix(stubPath, "/") {
			stubPath = "/" + stubPath
		}

		if _, statErr := os.Stat(stubPath); statErr == nil {
			log.WithField("remap_file", stubPath).Debug("Link remap file already exists")
			continue
		}

		stubFields := logrus.Fields{
			"remap_file": stubPath,
			"target":     orig.name,
			"mode":       fmt.Sprintf("%04o", stubMode),
		}
		if createErr := createLinkRemapStub(stubPath, stubMode); createErr != nil {
			log.WithError(createErr).WithFields(stubFields).Warn("Failed to create link_remap stub")
			continue
		}
		stubNames = append(stubNames, filepath.Base(stubPath))
		log.WithFields(stubFields).Debug("Created link_remap stub file")
	}

	if len(stubNames) == 0 {
		log.Debug("No link_remap stubs needed")
		return nil
	}
	log.WithFields(logrus.Fields{
		"count":       len(stubNames),
		"remap_files": stubNames,
	}).Info("Created link_remap stub files for CRIU restore")
	return nil
}
// fileInfo holds the per-file metadata this package extracts from CRIU
// checkpoint images (reg-files.img or files.img).
type fileInfo struct {
	// name is the file path as recorded in the image. Callers prepend "/"
	// when it is missing before touching the filesystem.
	name string

	// mode carries permission bits only: the parsers mask the recorded mode
	// with 0777 and substitute 0600 when the result is zero.
	mode os.FileMode
}
// remapEntry represents a file remap entry from CRIU, as decoded from
// remap-fpath.img.
type remapEntry struct {
	// origID is the file ID of the original file; it is looked up in the
	// ID -> fileInfo map to find the original path.
	origID uint32

	// remapID is the file ID of the link_remap file; it is also used to build
	// the "link_remap.<id>" name when the ID is absent from the file map.
	remapID uint32

	// remapType is the entry's remap type converted to int32.
	// NOTE(review): stored but not read anywhere in the visible code.
	remapType int32
}
// readImageEntries opens the CRIU image at path, checks that its magic equals
// wantMagic, and calls handle once for every length-prefixed payload in it.
//
// Each record is a 4-byte little-endian size followed by that many payload
// bytes. Reading stops cleanly at end of file; a short read of the size prefix
// is also treated as end of file. An error from os.Open is returned unwrapped
// so callers can test it with os.IsNotExist. An error from handle aborts the
// scan and is returned as is.
func readImageEntries(path, wantMagic string, handle func(payload []byte) error) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()

	magic, err := crit.ReadMagic(f)
	if err != nil {
		return fmt.Errorf("failed to read magic: %w", err)
	}
	if magic != wantMagic {
		return fmt.Errorf("unexpected magic: %s (expected %s)", magic, wantMagic)
	}

	// A record can never be larger than the file that holds it. Checking this
	// before allocating keeps a corrupt size field from requesting up to 4 GiB.
	var sizeLimit int64 = -1
	if st, statErr := f.Stat(); statErr == nil && st.Mode().IsRegular() {
		sizeLimit = st.Size()
	}

	var sizeBuf [4]byte
	for {
		_, err := io.ReadFull(f, sizeBuf[:])
		if err == io.EOF || err == io.ErrUnexpectedEOF {
			return nil
		}
		if err != nil {
			return fmt.Errorf("failed to read entry size: %w", err)
		}

		entrySize := binary.LittleEndian.Uint32(sizeBuf[:])
		if sizeLimit >= 0 && int64(entrySize) > sizeLimit {
			return fmt.Errorf("failed to read entry data: entry size %d exceeds image size %d", entrySize, sizeLimit)
		}

		payload := make([]byte, entrySize)
		if _, err := io.ReadFull(f, payload); err != nil {
			return fmt.Errorf("failed to read entry data: %w", err)
		}
		if err := handle(payload); err != nil {
			return err
		}
	}
}

// stubPermissions reduces a mode value recorded by CRIU to its permission bits,
// falling back to 0600 (owner read/write) when no permission bit is set.
func stubPermissions(rawMode uint32) os.FileMode {
	perm := os.FileMode(rawMode & 0777)
	if perm == 0 {
		return 0600
	}
	return perm
}

// parseRemapFpath parses the remap-fpath.img file and returns its entries in
// file order. The result is nil when the image holds no entries.
func parseRemapFpath(path string) ([]remapEntry, error) {
	var entries []remapEntry
	err := readImageEntries(path, "REMAP_FPATH", func(payload []byte) error {
		entry := &remap_file_path.RemapFilePathEntry{}
		if err := proto.Unmarshal(payload, entry); err != nil {
			return fmt.Errorf("failed to unmarshal entry: %w", err)
		}
		entries = append(entries, remapEntry{
			origID:    entry.GetOrigId(),
			remapID:   entry.GetRemapId(),
			remapType: int32(entry.GetRemapType()),
		})
		return nil
	})
	if err != nil {
		return nil, err
	}
	return entries, nil
}

// parseRegFilesWithMode parses the reg-files.img file and returns a map of
// ID -> fileInfo. Recorded modes are reduced to permission bits.
func parseRegFilesWithMode(path string) (map[uint32]fileInfo, error) {
	fileMap := make(map[uint32]fileInfo)
	err := readImageEntries(path, "REG_FILES", func(payload []byte) error {
		entry := &regfile.RegFileEntry{}
		if err := proto.Unmarshal(payload, entry); err != nil {
			return fmt.Errorf("failed to unmarshal entry: %w", err)
		}
		fileMap[entry.GetId()] = fileInfo{
			name: entry.GetName(),
			mode: stubPermissions(entry.GetMode()),
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	return fileMap, nil
}

// parseFilesImgWithMode parses the files.img file and returns a map of
// ID -> fileInfo. This is the newer CRIU format where file info is embedded in
// FileEntry messages; entries without an embedded regular-file record are
// skipped.
func parseFilesImgWithMode(path string) (map[uint32]fileInfo, error) {
	fileMap := make(map[uint32]fileInfo)
	err := readImageEntries(path, "FILES", func(payload []byte) error {
		entry := &fdinfo.FileEntry{}
		if err := proto.Unmarshal(payload, entry); err != nil {
			return fmt.Errorf("failed to unmarshal entry: %w", err)
		}
		reg := entry.GetReg()
		if reg == nil {
			return nil
		}
		fileMap[entry.GetId()] = fileInfo{
			name: reg.GetName(),
			mode: stubPermissions(reg.GetMode()),
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	return fileMap, nil
}
// createLinkRemapStub creates the stub file that CRIU hardlinks from when it
// replays a link_remap entry.
//
// Missing parent directories are created with mode 0755. The stub holds 32
// zero bytes and ends up with exactly the permission bits in mode: the mode
// given to open(2) is filtered through the process umask, so it is applied
// again with an explicit chmod once the file exists.
//
// It returns an error if the directory or file cannot be created, written,
// chmod'ed, or closed.
func createLinkRemapStub(path string, mode os.FileMode) error {
	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("failed to create directory %s: %w", dir, err)
	}

	f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
	if err != nil {
		return fmt.Errorf("failed to create file: %w", err)
	}

	// 32 zero bytes give CRIU a minimal, non-empty file to hardlink from.
	stub := make([]byte, 32)
	if _, err := f.Write(stub); err != nil {
		f.Close()
		return fmt.Errorf("failed to write stub data: %w", err)
	}

	// The recorded mode must survive the umask (and a pre-existing file, whose
	// mode O_CREATE leaves untouched).
	if err := f.Chmod(mode); err != nil {
		f.Close()
		return fmt.Errorf("failed to set mode %04o on %s: %w", mode, path, err)
	}

	// A failed close on a written file can mean lost data, so report it.
	if err := f.Close(); err != nil {
		return fmt.Errorf("failed to close %s: %w", path, err)
	}
	return nil
}
...@@ -6,81 +6,44 @@ import ( ...@@ -6,81 +6,44 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v7/rpc" criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
"google.golang.org/protobuf/proto" "google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common" "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
) )
// GenerateExtMountMaps generates external mount mappings for CRIU restore. // GenerateExtMountMaps generates external mount mappings for CRIU restore.
// It parses /proc/1/mountinfo (the restore container's mounts) and adds // It reuses the exact dump-time ext-mount plan persisted in checkpoint manifest.
// mappings for all mount points plus masked/readonly paths from common. func GenerateExtMountMaps(data *checkpoint.CheckpointManifest) ([]*criurpc.ExtMountMap, error) {
// if data == nil {
// If meta is nil or doesn't have OCI-derived paths, falls back to defaults. return nil, fmt.Errorf("checkpoint manifest is required")
func GenerateExtMountMaps(meta *common.CheckpointMetadata) ([]*criurpc.ExtMountMap, error) { }
var maps []*criurpc.ExtMountMap if len(data.CRIUDump.ExtMnt) == 0 {
addedMounts := make(map[string]bool) return nil, fmt.Errorf("checkpoint manifest is missing criuDump.extMnt")
}
// Add root filesystem mapping first maps := []*criurpc.ExtMountMap{{
maps = append(maps, &criurpc.ExtMountMap{
Key: proto.String("/"), Key: proto.String("/"),
Val: proto.String("."), Val: proto.String("."),
}) }}
addedMounts["/"] = true addedMounts := map[string]struct{}{"/": {}}
// Parse /proc/1/mountinfo for all current mount points // Replay dump-time ext-mount plan exactly, with restore-specific root remap.
mountPoints, err := common.GetMountPointPaths("/proc/1/mountinfo") for _, mount := range data.CRIUDump.ExtMnt {
if err != nil { key := mount.Key
return nil, fmt.Errorf("failed to parse mountinfo: %w", err) if key == "" || key == "/" {
}
for _, mountPoint := range mountPoints {
if addedMounts[mountPoint] || mountPoint == "/" {
continue continue
} }
maps = append(maps, &criurpc.ExtMountMap{ if _, exists := addedMounts[key]; exists {
Key: proto.String(mountPoint),
Val: proto.String(mountPoint),
})
addedMounts[mountPoint] = true
}
// Use masked paths from checkpoint metadata (OCI spec derived)
// Fall back to defaults for backwards compatibility
maskedPaths := common.DefaultMaskedPaths()
if meta != nil && len(meta.MaskedPaths) > 0 {
maskedPaths = meta.MaskedPaths
}
for _, path := range maskedPaths {
if addedMounts[path] {
continue continue
} }
val := mount.Val
if val == "" {
val = key
}
maps = append(maps, &criurpc.ExtMountMap{ maps = append(maps, &criurpc.ExtMountMap{
Key: proto.String(path), Key: proto.String(key),
Val: proto.String(path), Val: proto.String(val),
}) })
addedMounts[path] = true addedMounts[key] = struct{}{}
}
// Also add readonly paths from metadata if available
if meta != nil {
for _, path := range meta.ReadonlyPaths {
if addedMounts[path] {
continue
}
maps = append(maps, &criurpc.ExtMountMap{
Key: proto.String(path),
Val: proto.String(path),
})
addedMounts[path] = true
}
} }
return maps, nil return maps, nil
} }
// AddExtMountMap is a helper to create a single ExtMountMap entry.
func AddExtMountMap(key, val string) *criurpc.ExtMountMap {
return &criurpc.ExtMountMap{
Key: proto.String(key),
Val: proto.String(val),
}
}
// Package restore provides CRIU restore operations for self-restoring placeholder containers.
package restore
import (
	"context"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
	"github.com/sirupsen/logrus"

	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
// Config holds the configuration for the restore entrypoint.
// These values are typically set via environment variables; ConfigFromEnv
// reads each field from the variable named in that field's "Env:" note.
type Config struct {
	// CheckpointPath is the base directory containing checkpoints (default: /checkpoints)
	// Env: DYN_CHECKPOINT_PATH
	CheckpointPath string

	// CheckpointHash is the ID/hash of the checkpoint to restore
	// Env: DYN_CHECKPOINT_HASH
	CheckpointHash string

	// RestoreTrigger is the path to the trigger file that signals restore should start.
	// The file's content is read as the checkpoint path to restore.
	// Env: RESTORE_TRIGGER (default: /tmp/restore-trigger)
	RestoreTrigger string

	// WaitForCheckpoint indicates whether to wait for a checkpoint to appear
	// Env: WAIT_FOR_CHECKPOINT (enabled when set to "1")
	WaitForCheckpoint bool

	// WaitTimeout is the maximum time to wait for a checkpoint to become available
	// Env: RESTORE_WAIT_TIMEOUT (default: 300s)
	WaitTimeout time.Duration

	// CRIULogLevel is the CRIU verbosity level (0-4, default: 4)
	// Env: CRIU_LOG_LEVEL
	CRIULogLevel int32

	// DefaultCmd is the command to run if no checkpoint is available
	// Env: DEFAULT_CMD
	DefaultCmd string

	// Debug enables debug logging
	// Env: DEBUG (enabled when set to "1")
	Debug bool

	// EmbeddedCheckpointPath is the path to an embedded checkpoint within the image
	// When set, the checkpoint data is baked into the container image itself
	// Env: EMBEDDED_CHECKPOINT_PATH (default: DefaultEmbeddedCheckpointPath)
	EmbeddedCheckpointPath string

	// SkipInFlightConnections skips in-flight TCP connections during restore
	// Env: CRIU_SKIP_IN_FLIGHT (enabled when set to "1")
	SkipInFlightConnections bool

	// AutoDedup enables auto-deduplication of memory pages
	// Env: CRIU_AUTO_DEDUP (enabled when set to "1")
	AutoDedup bool

	// LazyPages enables lazy page migration (experimental)
	// Env: CRIU_LAZY_PAGES (enabled when set to "1")
	LazyPages bool

	// CRIUWorkDir is an alternative work directory for CRIU (instead of /tmp)
	// Useful when /tmp has mount issues
	// Env: CRIU_WORK_DIR (default: empty)
	CRIUWorkDir string

	// CUDAPluginDir is the path to CRIU CUDA plugin directory (e.g., /usr/local/lib/criu)
	// When set, a CRIU config file is created with libdir for CUDA plugin discovery during restore.
	// Env: CUDA_PLUGIN_DIR
	CUDAPluginDir string

	// CRIUTimeout is the CRIU timeout in seconds (required for CUDA restores)
	// Env: CRIU_TIMEOUT (default: 0)
	CRIUTimeout uint32

	// RestoreMarkerFile is the path to a marker file created before CRIU restore.
	// The restored process can check for this file to detect it was restored.
	// Env: DYN_RESTORE_MARKER_FILE (default: /tmp/dynamo-restored)
	RestoreMarkerFile string
}
// DefaultEmbeddedCheckpointPath is the default path for embedded checkpoints.
// ConfigFromEnv uses it for Config.EmbeddedCheckpointPath when
// EMBEDDED_CHECKPOINT_PATH is unset or empty.
const DefaultEmbeddedCheckpointPath = "/embedded-checkpoint"
// ConfigFromEnv creates a Config from environment variables.
//
// Unset or empty variables fall back to the defaults shown below. Boolean
// options are enabled only by the exact value "1". A negative CRIU_TIMEOUT is
// treated as 0 rather than being converted to uint32, where it would wrap
// around to a value close to 2^32 seconds.
func ConfigFromEnv() *Config {
	criuTimeout := parseIntOrDefault("CRIU_TIMEOUT", 0)
	if criuTimeout < 0 {
		criuTimeout = 0
	}

	return &Config{
		CheckpointPath:          getEnvOrDefault("DYN_CHECKPOINT_PATH", "/checkpoints"),
		CheckpointHash:          os.Getenv("DYN_CHECKPOINT_HASH"),
		RestoreTrigger:          getEnvOrDefault("RESTORE_TRIGGER", "/tmp/restore-trigger"),
		WaitForCheckpoint:       os.Getenv("WAIT_FOR_CHECKPOINT") == "1",
		WaitTimeout:             parseDurationOrDefault("RESTORE_WAIT_TIMEOUT", 300*time.Second),
		CRIULogLevel:            parseIntOrDefault("CRIU_LOG_LEVEL", 4),
		DefaultCmd:              os.Getenv("DEFAULT_CMD"),
		Debug:                   os.Getenv("DEBUG") == "1",
		EmbeddedCheckpointPath:  getEnvOrDefault("EMBEDDED_CHECKPOINT_PATH", DefaultEmbeddedCheckpointPath),
		SkipInFlightConnections: os.Getenv("CRIU_SKIP_IN_FLIGHT") == "1",
		AutoDedup:               os.Getenv("CRIU_AUTO_DEDUP") == "1",
		LazyPages:               os.Getenv("CRIU_LAZY_PAGES") == "1",
		CRIUWorkDir:             getEnvOrDefault("CRIU_WORK_DIR", ""),
		CUDAPluginDir:           os.Getenv("CUDA_PLUGIN_DIR"), // For CUDA plugin discovery during restore
		CRIUTimeout:             uint32(criuTimeout),
		RestoreMarkerFile:       getEnvOrDefault("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored"),
	}
}
// RestoreOptions holds the options for a CRIU restore operation.
// Most CRIU options are hardcoded with safe K8s defaults.
// DefaultRestoreOptions fills in the defaults noted on the fields below.
type RestoreOptions struct {
	// CheckpointPath is the path to the checkpoint directory
	CheckpointPath string

	// RootPath is the root filesystem path for restore (typically "/", which is the default)
	RootPath string

	// PidFile is the path where CRIU writes the restored process PID
	// (default: /tmp/restored.pid)
	PidFile string

	// LogFile is the name of the CRIU restore log file (default: restore.log)
	LogFile string

	// LogLevel is the CRIU logging verbosity (0-4, default: 4)
	LogLevel int32

	// ExtMountMaps contains external mount mappings for CRIU.
	// Left nil by DefaultRestoreOptions.
	ExtMountMaps []*criurpc.ExtMountMap

	// WorkDir is an alternative work directory for CRIU (instead of /tmp)
	WorkDir string

	// LibDir is the path to CRIU plugin directory (e.g., /usr/local/lib/criu)
	// When set, a CRIU config file is created with libdir for CUDA plugin discovery.
	LibDir string

	// Timeout is the CRIU timeout in seconds (required for CUDA restores)
	Timeout uint32
}
// DefaultRestoreOptions builds a RestoreOptions for checkpointPath with the
// standard defaults: root "/", PID file /tmp/restored.pid, log file
// restore.log and log level 4. All remaining fields keep their zero value.
func DefaultRestoreOptions(checkpointPath string) *RestoreOptions {
	opts := new(RestoreOptions)
	opts.CheckpointPath = checkpointPath
	opts.RootPath = "/"
	opts.PidFile = "/tmp/restored.pid"
	opts.LogFile = "restore.log"
	opts.LogLevel = 4
	return opts
}
// LoadRestoreOptions returns the default RestoreOptions for checkpointPath with
// LogLevel set to logLevel and, when possible, ExtMountMaps generated from the
// checkpoint metadata.
//
// Metadata handling is best-effort: if the metadata cannot be loaded, or the
// mount maps cannot be generated from it, ExtMountMaps stays nil and the
// defaults are returned. The returned error is always nil.
func LoadRestoreOptions(checkpointPath string, logLevel int32) (*RestoreOptions, error) {
	opts := DefaultRestoreOptions(checkpointPath)
	opts.LogLevel = logLevel

	// Metadata supplies the OCI-derived masked/readonly paths used for the
	// external mount mappings.
	if meta, loadErr := common.LoadMetadata(checkpointPath); loadErr == nil {
		if extMounts, genErr := GenerateExtMountMaps(meta); genErr == nil {
			opts.ExtMountMaps = extMounts
		}
	}
	return opts, nil
}
// ShouldRestore checks if a restore should be performed.
// Returns the checkpoint path and true if restore should proceed, or "" and
// false otherwise.
//
// Sources are tried in this order:
//
//  0. Embedded checkpoint: cfg.EmbeddedCheckpointPath contains the metadata
//     file. Only the metadata file is checked here, not checkpoint.done.
//  1. cfg.CheckpointHash: <CheckpointPath>/<CheckpointHash>/checkpoint.done exists.
//  2. cfg.RestoreTrigger: the trigger file names a checkpoint directory that
//     contains checkpoint.done. Surrounding whitespace in the file (such as
//     the trailing newline written by "echo") is ignored.
//
// IMPORTANT: methods 1 and 2 check for the checkpoint.done marker (not just
// metadata.json or inventory.img) because checkpoint.done is written LAST in
// the checkpoint process, after rootfs-diff.tar completes.
// Order: metadata.json -> CRIU dump (*.img files) -> rootfs-diff.tar -> checkpoint.done
func ShouldRestore(cfg *Config, log *logrus.Entry) (string, bool) {
	// Method 0: Embedded checkpoint in image (highest priority)
	// This is for self-contained checkpoint images where data is baked in
	if cfg.EmbeddedCheckpointPath != "" {
		metadataPath := filepath.Join(cfg.EmbeddedCheckpointPath, common.MetadataFilename)
		if _, err := os.Stat(metadataPath); err == nil {
			log.WithField("path", cfg.EmbeddedCheckpointPath).Info("Embedded checkpoint found in image")
			return cfg.EmbeddedCheckpointPath, true
		}
	}

	// Method 1: DYN_CHECKPOINT_HASH is set and checkpoint is fully complete
	if cfg.CheckpointHash != "" {
		checkpointPath := filepath.Join(cfg.CheckpointPath, cfg.CheckpointHash)

		// Check for checkpoint.done marker (written LAST after rootfs-diff.tar completes)
		donePath := filepath.Join(checkpointPath, "checkpoint.done")
		if _, err := os.Stat(donePath); err == nil {
			log.WithField("path", checkpointPath).Info("Checkpoint found (checkpoint.done marker present)")
			return checkpointPath, true
		}

		// Metadata without the marker means the checkpoint may still be in
		// progress: warn, but keep waiting for checkpoint.done.
		metadataPath := filepath.Join(checkpointPath, common.MetadataFilename)
		if _, err := os.Stat(metadataPath); err == nil {
			log.WithFields(logrus.Fields{
				"path":    checkpointPath,
				"warning": "checkpoint.done marker not found, checkpoint may be incomplete",
			}).Warn("Checkpoint metadata found but checkpoint.done missing - checkpoint may still be in progress")
		}
	}

	// Method 2: Restore trigger file exists with checkpoint path
	if cfg.RestoreTrigger != "" {
		if data, err := os.ReadFile(cfg.RestoreTrigger); err == nil {
			// Trim so a trailing newline does not become part of the path.
			checkpointPath := strings.TrimSpace(string(data))
			if checkpointPath != "" {
				donePath := filepath.Join(checkpointPath, "checkpoint.done")
				if _, err := os.Stat(donePath); err == nil {
					log.WithField("path", checkpointPath).Info("Restore triggered via file (checkpoint.done marker present)")
					return checkpointPath, true
				}
			}
		}
	}

	return "", false
}
// WaitForCheckpoint polls ShouldRestore once per second until a checkpoint is
// ready, ctx is cancelled, or cfg.WaitTimeout has elapsed.
//
// It returns the checkpoint path on success, ctx.Err() on cancellation, and
// context.DeadlineExceeded on timeout. A progress line is logged about every
// 30 seconds while waiting.
func WaitForCheckpoint(ctx context.Context, cfg *Config, log *logrus.Entry) (string, error) {
	log.WithField("timeout", cfg.WaitTimeout).Info("Waiting for checkpoint")

	started := time.Now()
	deadline := started.Add(cfg.WaitTimeout)

	poll := time.NewTicker(time.Second)
	defer poll.Stop()

	lastProgress := time.Now()
	for {
		select {
		case <-ctx.Done():
			return "", ctx.Err()
		case <-poll.C:
		}

		if checkpointPath, ready := ShouldRestore(cfg, log); ready {
			return checkpointPath, nil
		}

		// Log progress every 30 seconds
		if time.Since(lastProgress) >= 30*time.Second {
			log.WithField("elapsed", time.Since(started)).Info("Still waiting for checkpoint...")
			lastProgress = time.Now()
		}

		if time.Now().After(deadline) {
			return "", context.DeadlineExceeded
		}
	}
}
// Helper functions
// getEnvOrDefault returns the value of the environment variable key, or
// defaultValue when the variable is unset or set to the empty string.
func getEnvOrDefault(key, defaultValue string) string {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}
	return value
}
// parseDurationOrDefault reads a duration from the environment variable key.
//
// The value may be a non-negative whole number of seconds ("300") or a Go
// duration string as accepted by time.ParseDuration ("5m", "1m30s").
// defaultValue is returned when the variable is unset or empty, cannot be
// parsed, is negative, or is too large to be represented as a time.Duration.
func parseDurationOrDefault(key string, defaultValue time.Duration) time.Duration {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}

	if seconds, err := strconv.ParseInt(value, 10, 64); err == nil {
		// Largest second count that still fits in a Duration (int64 nanoseconds).
		const maxSeconds = int64(1<<63-1) / int64(time.Second)
		if seconds < 0 || seconds > maxSeconds {
			return defaultValue
		}
		return time.Duration(seconds) * time.Second
	}

	if d, err := time.ParseDuration(value); err == nil && d >= 0 {
		return d
	}
	return defaultValue
}
// parseIntOrDefault reads a base-10 int32 from the environment variable key.
//
// defaultValue is returned when the variable is unset or empty, is not a valid
// integer, or does not fit in an int32. Out-of-range values are rejected
// rather than being truncated to their low 32 bits.
func parseIntOrDefault(key string, defaultValue int32) int32 {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}
	// bitSize 32 makes ParseInt report a range error for anything outside int32.
	parsed, err := strconv.ParseInt(value, 10, 32)
	if err != nil {
		return defaultValue
	}
	return int32(parsed)
}
package restore package restore
import ( import (
"errors"
"fmt" "fmt"
"io" "io"
"os" "os"
...@@ -8,6 +9,7 @@ import ( ...@@ -8,6 +9,7 @@ import (
"os/signal" "os/signal"
"strconv" "strconv"
"strings" "strings"
"sync"
"syscall" "syscall"
"time" "time"
...@@ -54,31 +56,39 @@ func ForwardProcessOutput(pid int, log *logrus.Entry) int { ...@@ -54,31 +56,39 @@ func ForwardProcessOutput(pid int, log *logrus.Entry) int {
// Try to open the process's stdout and stderr via /proc // Try to open the process's stdout and stderr via /proc
stdoutPath := fmt.Sprintf("/proc/%d/fd/1", pid) stdoutPath := fmt.Sprintf("/proc/%d/fd/1", pid)
stderrPath := fmt.Sprintf("/proc/%d/fd/2", pid) stderrPath := fmt.Sprintf("/proc/%d/fd/2", pid)
var wg sync.WaitGroup
// Channel to signal when copying goroutines should stop
done := make(chan struct{})
// Forward stdout // Forward stdout
go forwardFD(stdoutPath, os.Stdout, "stdout", log, done) wg.Add(1)
go forwardFD(stdoutPath, os.Stdout, "stdout", log, &wg)
// Forward stderr // Forward stderr
go forwardFD(stderrPath, os.Stderr, "stderr", log, done) wg.Add(1)
go forwardFD(stderrPath, os.Stderr, "stderr", log, &wg)
// Wait for process to exit // Wait for process to exit (and reap it if it's our child).
exitCode := waitForProcess(pid, log) exitCode := waitForProcess(pid, log)
// Signal goroutines to stop // Give copy goroutines a short window to flush/finish.
close(done) done := make(chan struct{})
go func() {
// Give goroutines a moment to flush any remaining output wg.Wait()
time.Sleep(100 * time.Millisecond) close(done)
}()
select {
case <-done:
case <-time.After(2 * time.Second):
log.WithField("pid", pid).Warn("Timed out waiting for output forwarding goroutines to finish")
}
return exitCode return exitCode
} }
// forwardFD copies data from a file descriptor path to a writer. // forwardFD copies data from a file descriptor path to a writer.
// It handles the case where the FD may not be readable. // It handles the case where the FD may not be readable.
func forwardFD(fdPath string, dst io.Writer, name string, log *logrus.Entry, done <-chan struct{}) { func forwardFD(fdPath string, dst io.Writer, name string, log *logrus.Entry, wg *sync.WaitGroup) {
defer wg.Done()
// Try to open the FD path // Try to open the FD path
src, err := os.Open(fdPath) src, err := os.Open(fdPath)
if err != nil { if err != nil {
...@@ -100,54 +110,71 @@ func forwardFD(fdPath string, dst io.Writer, name string, log *logrus.Entry, don ...@@ -100,54 +110,71 @@ func forwardFD(fdPath string, dst io.Writer, name string, log *logrus.Entry, don
"path": fdPath, "path": fdPath,
}).Debug("Forwarding process output") }).Debug("Forwarding process output")
// Copy data until done or EOF _, err = io.Copy(dst, src)
buf := make([]byte, 4096) if err != nil && !errors.Is(err, io.EOF) {
for { log.WithError(err).WithField("name", name).Debug("Error reading from process FD")
select {
case <-done:
return
default:
// Set a read deadline to allow checking done channel periodically
src.SetReadDeadline(time.Now().Add(100 * time.Millisecond))
n, err := src.Read(buf)
if n > 0 {
dst.Write(buf[:n])
}
if err != nil {
if os.IsTimeout(err) {
continue
}
if err != io.EOF {
log.WithError(err).WithField("name", name).Debug("Error reading from process FD")
}
return
}
}
} }
} }
// waitForProcess waits for a process to exit and returns its exit code. // waitForProcess waits for a process to exit and returns its exit code.
func waitForProcess(pid int, log *logrus.Entry) int { func waitForProcess(pid int, log *logrus.Entry) int {
// Preferred path: restored process is typically our direct child.
// Use wait4() so zombies are reaped and exit status is reliable.
var status syscall.WaitStatus
for { for {
// Check if process still exists by sending signal 0 wpid, err := syscall.Wait4(pid, &status, 0, nil)
proc, err := os.FindProcess(pid) if errors.Is(err, syscall.EINTR) {
continue
}
if err != nil { if err != nil {
log.WithError(err).Error("Failed to find process") if errors.Is(err, syscall.ECHILD) {
log.WithField("pid", pid).Warn("Restored process is not a child; falling back to signal-based monitoring")
return waitForProcessBySignal(pid, log)
}
log.WithError(err).WithField("pid", pid).Error("Wait4 failed for restored process")
return 1 return 1
} }
if wpid != pid {
err = proc.Signal(syscall.Signal(0)) continue
if err != nil { }
// Process has exited if status.Exited() {
log.WithField("pid", pid).Info("Restored process exited") exitCode := status.ExitStatus()
log.WithFields(logrus.Fields{
// Try to get exit status "pid": pid,
exitCode := getExitCode(pid) "exit_code": exitCode,
log.WithField("exit_code", exitCode).Info("Restored process exit status") }).Info("Restored process exited")
return exitCode
}
if status.Signaled() {
exitCode := 128 + int(status.Signal())
log.WithFields(logrus.Fields{
"pid": pid,
"signal": status.Signal().String(),
"exit_code": exitCode,
}).Warn("Restored process terminated by signal")
return exitCode return exitCode
} }
log.WithField("pid", pid).Warn("Restored process exited with unexpected wait status")
return 1
}
}
func waitForProcessBySignal(pid int, log *logrus.Entry) int {
for {
proc, err := os.FindProcess(pid)
if err != nil {
log.WithError(err).WithField("pid", pid).Error("Failed to find restored process")
return 1
}
if err := proc.Signal(syscall.Signal(0)); err != nil {
log.WithField("pid", pid).Info("Restored process no longer exists")
return 0
}
// Detect zombie state when wait4 is unavailable.
if state, err := readProcState(pid); err == nil && state == "Z" {
log.WithField("pid", pid).Warn("Restored process is zombie while not reaped by this process")
return 1
}
time.Sleep(100 * time.Millisecond) time.Sleep(100 * time.Millisecond)
} }
} }
...@@ -182,6 +209,23 @@ func getExitCode(pid int) int { ...@@ -182,6 +209,23 @@ func getExitCode(pid int) int {
return 0 return 0
} }
// readProcState returns the single-letter process state (the second
// whitespace-separated token of the "State:" line) from /proc/<pid>/status.
// It returns the read error unchanged when the status file cannot be read,
// and a descriptive error when no usable "State:" line is present.
func readProcState(pid int) (string, error) {
	raw, err := os.ReadFile(fmt.Sprintf("/proc/%d/status", pid))
	if err != nil {
		return "", err
	}

	for _, row := range strings.Split(string(raw), "\n") {
		if !strings.HasPrefix(row, "State:") {
			continue
		}
		// Only the first "State:" line is considered; a malformed one ends the search.
		if tokens := strings.Fields(row); len(tokens) > 1 {
			return tokens[1], nil
		}
		break
	}

	return "", fmt.Errorf("state field not found in /proc/%d/status", pid)
}
// SetupSignalForwarding sets up signal forwarding to the restored process. // SetupSignalForwarding sets up signal forwarding to the restored process.
// Returns a cleanup function that should be called when done. // Returns a cleanup function that should be called when done.
func SetupSignalForwarding(pid int, log *logrus.Entry) func() { func SetupSignalForwarding(pid int, log *logrus.Entry) func() {
...@@ -232,52 +276,36 @@ func WaitForPidFile(pidFile string, timeout time.Duration, log *logrus.Entry) (i ...@@ -232,52 +276,36 @@ func WaitForPidFile(pidFile string, timeout time.Duration, log *logrus.Entry) (i
return 0, fmt.Errorf("timeout waiting for PID file %s after %v", pidFile, timeout) return 0, fmt.Errorf("timeout waiting for PID file %s after %v", pidFile, timeout)
} }
// RunDefault runs the default command when no checkpoint is available. // ExecColdStart execs the cold start command (ColdStartArgs), replacing the current process.
// It attempts to detect and run the appropriate default command for the container. // If no args are provided, falls back to sleep infinity.
func RunDefault(cfg *Config, log *logrus.Entry) error { func ExecColdStart(cfg *RestoreRequest, log *logrus.Entry) error {
// If DEFAULT_CMD is set, use it if len(cfg.ColdStartArgs) == 0 {
if cfg.DefaultCmd != "" { log.Warn("No cold start command provided, sleeping indefinitely")
log.WithField("cmd", cfg.DefaultCmd).Info("Running default command") return ExecArgs([]string{"sleep", "infinity"}, log)
return execCommand(cfg.DefaultCmd)
} }
// Try common application entrypoints log.WithField("cmd", cfg.ColdStartArgs).Info("Executing cold start command")
if _, err := os.Stat("/docker-entrypoint.sh"); err == nil { return ExecArgs(cfg.ColdStartArgs, log)
log.Info("Running docker-entrypoint.sh")
return execCommand("/docker-entrypoint.sh nginx -g 'daemon off;'")
}
// Check for nginx
if _, err := exec.LookPath("nginx"); err == nil {
log.Info("Running nginx")
return execCommand("nginx -g 'daemon off;'")
}
// Fallback to sleep infinity
log.Warn("No default command specified and no known entrypoint found, sleeping")
return execCommand("sleep infinity")
} }
// execCommand executes a command by replacing the current process. // ExecArgs replaces the current process with the given command and arguments.
func execCommand(cmdLine string) error { // Uses syscall.Exec for proper PID 1 behavior in containers.
// Parse command line - simple split by spaces func ExecArgs(args []string, log *logrus.Entry) error {
// For complex commands, shell wrapper is needed if len(args) == 0 {
parts := strings.Fields(cmdLine)
if len(parts) == 0 {
return fmt.Errorf("empty command") return fmt.Errorf("empty command")
} }
cmd := parts[0]
args := parts
// Find the executable path // Find the executable path
path, err := exec.LookPath(cmd) path, err := exec.LookPath(args[0])
if err != nil { if err != nil {
// Try running through shell for complex commands return fmt.Errorf("command not found: %s: %w", args[0], err)
path = "/bin/sh"
args = []string{"sh", "-c", cmdLine}
} }
log.WithFields(logrus.Fields{
"path": path,
"args": args,
}).Debug("Replacing process via syscall.Exec")
// Replace current process with the command // Replace current process with the command
return syscall.Exec(path, args, os.Environ()) return syscall.Exec(path, args, os.Environ())
} }
package restore package restore
import ( import (
"bufio"
"context" "context"
"fmt" "fmt"
"os" "os"
"os/exec"
"path/filepath" "path/filepath"
"sort"
"strings" "strings"
"syscall"
"time" "time"
criu "github.com/checkpoint-restore/go-criu/v7" criu "github.com/checkpoint-restore/go-criu/v7"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"google.golang.org/protobuf/proto" "google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
) )
// LogGPUDiagnostics writes a labelled GPU visibility snapshot to the log: the
// `nvidia-smi -L` listing, a per-GPU memory query, the /dev/nvidia* entries
// and the NVIDIA_VISIBLE_DEVICES environment variable. nvidia-smi failures are
// logged at info level and never returned.
func LogGPUDiagnostics(label string, log *logrus.Entry) {
	log.Infof("=== GPU DIAGNOSTICS [%s] ===", label)

	// Device listing, bounded by its own 10s deadline.
	listCtx, stopList := context.WithTimeout(context.Background(), 10*time.Second)
	defer stopList()
	listing, listErr := exec.CommandContext(listCtx, "nvidia-smi", "-L").CombinedOutput()
	if listErr == nil {
		log.Infof("nvidia-smi -L:\n%s", string(listing))
	} else {
		log.Infof("nvidia-smi -L: error: %v", listErr)
	}

	// Per-GPU memory usage, to help detect OOM conditions; separate 10s deadline.
	memCtx, stopMem := context.WithTimeout(context.Background(), 10*time.Second)
	defer stopMem()
	memArgs := []string{
		"--query-gpu=index,uuid,memory.used,memory.total,memory.free",
		"--format=csv,noheader",
	}
	usage, memErr := exec.CommandContext(memCtx, "nvidia-smi", memArgs...).CombinedOutput()
	if memErr == nil {
		log.Infof("nvidia-smi memory:\n%s", string(usage))
	} else {
		log.Infof("nvidia-smi memory query: error: %v", memErr)
	}

	// The glob error is deliberately ignored; an empty list is logged in that case.
	devNodes, _ := filepath.Glob("/dev/nvidia*")
	log.Infof("/dev/nvidia* devices: %s", strings.Join(devNodes, ", "))
	log.Infof("NVIDIA_VISIBLE_DEVICES=%s", os.Getenv("NVIDIA_VISIBLE_DEVICES"))

	log.Infof("=== END GPU DIAGNOSTICS [%s] ===", label)
}
// processSnapshotPIDs returns the PIDs to include in a diagnostics snapshot:
// PID 1, the current process, and restoredPID when it is positive. The result
// is sorted ascending and contains no duplicates.
func processSnapshotPIDs(restoredPID int) []int {
	candidates := []int{1, os.Getpid()}
	if restoredPID > 0 {
		candidates = append(candidates, restoredPID)
	}
	sort.Ints(candidates)

	// After sorting, duplicates are adjacent; keep only the first of each run.
	distinct := make([]int, 0, len(candidates))
	for _, pid := range candidates {
		if n := len(distinct); n > 0 && distinct[n-1] == pid {
			continue
		}
		distinct = append(distinct, pid)
	}
	return distinct
}
// logProcessNamespaces logs, for each of the mnt, pid, ipc, net, uts and
// cgroup namespaces, the target of /proc/<pid>/ns/<name>. A namespace whose
// symlink cannot be read is reported with a warning and the rest are still logged.
func logProcessNamespaces(pid int, log *logrus.Entry) {
	kinds := [...]string{"mnt", "pid", "ipc", "net", "uts", "cgroup"}
	for i := range kinds {
		kind := kinds[i]
		linkPath := fmt.Sprintf("/proc/%d/ns/%s", pid, kind)

		target, err := os.Readlink(linkPath)
		if err == nil {
			log.WithFields(logrus.Fields{
				"pid":       pid,
				"namespace": kind,
				"value":     target,
			}).Info("Namespace snapshot")
			continue
		}

		log.WithError(err).WithFields(logrus.Fields{
			"pid":  pid,
			"path": linkPath,
		}).Warn("Failed to read namespace symlink")
	}
}
// logProcessCgroupPath logs the whitespace-trimmed contents of
// /proc/<pid>/cgroup, or a warning when the file cannot be read.
func logProcessCgroupPath(pid int, log *logrus.Entry) {
	cgroupFile := fmt.Sprintf("/proc/%d/cgroup", pid)
	// Both outcomes carry the same pid/path fields, so attach them once.
	scoped := log.WithFields(logrus.Fields{
		"pid":  pid,
		"path": cgroupFile,
	})

	raw, readErr := os.ReadFile(cgroupFile)
	if readErr != nil {
		scoped.WithError(readErr).Warn("Failed to read cgroup path")
		return
	}

	scoped.WithField("contents", strings.TrimSpace(string(raw))).Info("Cgroup membership snapshot")
}
// logProcessFilteredMountInfo logs, at debug level, the lines of
// /proc/<pid>/mountinfo that mention " /dev ", "/dev/", "nvidia" or "cgroup2",
// preceded by a count of the matching lines. It does nothing unless debug
// logging is enabled, because mountinfo dumps are very large.
func logProcessFilteredMountInfo(pid int, log *logrus.Entry) {
	if !log.Logger.IsLevelEnabled(logrus.DebugLevel) {
		return
	}

	mountinfoPath := fmt.Sprintf("/proc/%d/mountinfo", pid)
	scoped := log.WithFields(logrus.Fields{
		"pid":  pid,
		"path": mountinfoPath,
	})

	file, err := os.Open(mountinfoPath)
	if err != nil {
		scoped.WithError(err).Warn("Failed to open mountinfo")
		return
	}
	defer file.Close()

	// A line is kept when it contains any of these substrings.
	needles := []string{" /dev ", "/dev/", "nvidia", "cgroup2"}
	wanted := func(entry string) bool {
		for _, needle := range needles {
			if strings.Contains(entry, needle) {
				return true
			}
		}
		return false
	}

	var kept []string
	lines := bufio.NewScanner(file)
	// Start with a 64 KiB buffer and allow lines of up to 1 MiB.
	lines.Buffer(make([]byte, 0, 64*1024), 1024*1024)
	for lines.Scan() {
		if text := lines.Text(); wanted(text) {
			kept = append(kept, text)
		}
	}
	if scanErr := lines.Err(); scanErr != nil {
		scoped.WithError(scanErr).Warn("Failed while scanning mountinfo")
		return
	}

	total := len(kept)
	scoped.WithField("count", total).Debug("Filtered mountinfo snapshot count")
	for idx := range kept {
		log.WithFields(logrus.Fields{
			"pid":   pid,
			"index": idx + 1,
			"total": total,
		}).Debugf("Filtered mountinfo: %s", kept[idx])
	}
}
// logNvidiaDeviceNodeMetadata logs the mode, inode and rdev (hex) of every
// /dev/nvidia* entry, using Lstat so symlinks are described rather than
// followed. Entries that cannot be inspected are reported with a warning and
// skipped.
func logNvidiaDeviceNodeMetadata(log *logrus.Entry) {
	nodes, err := filepath.Glob("/dev/nvidia*")
	switch {
	case err != nil:
		log.WithError(err).Warn("Failed to glob /dev/nvidia*")
		return
	case len(nodes) == 0:
		log.Info("No /dev/nvidia* entries found")
		return
	}

	for _, node := range nodes {
		info, statErr := os.Lstat(node)
		if statErr != nil {
			log.WithError(statErr).WithField("path", node).Warn("Failed to stat NVIDIA device entry")
			continue
		}

		// Inode and rdev are only available through the platform stat structure.
		switch raw := info.Sys().(type) {
		case *syscall.Stat_t:
			log.WithFields(logrus.Fields{
				"path":  node,
				"mode":  info.Mode().String(),
				"inode": raw.Ino,
				"rdev":  fmt.Sprintf("0x%x", raw.Rdev),
			}).Info("NVIDIA device entry metadata")
		default:
			log.WithFields(logrus.Fields{
				"path": node,
				"mode": info.Mode().String(),
			}).Warn("Unexpected stat type for NVIDIA device entry")
		}
	}
}
// logCgroupV2HostInfo logs the whitespace-trimmed contents of
// /sys/fs/cgroup/cgroup.controllers, or a warning when the file cannot be read.
func logCgroupV2HostInfo(log *logrus.Entry) {
	const controllersPath = "/sys/fs/cgroup/cgroup.controllers"

	raw, readErr := os.ReadFile(controllersPath)
	if readErr == nil {
		log.WithFields(logrus.Fields{
			"path":        controllersPath,
			"controllers": strings.TrimSpace(string(raw)),
		}).Info("cgroup v2 controllers")
		return
	}

	log.WithError(readErr).WithField("path", controllersPath).Warn("Failed to read cgroup v2 controllers")
}
// LogRestoreBoundaryDiagnostics logs a labelled snapshot of namespace, cgroup
// and (debug only) filtered mountinfo state for every PID returned by
// processSnapshotPIDs(restoredPID), followed by the cgroup v2 controller list
// and the /dev/nvidia* entry metadata.
func LogRestoreBoundaryDiagnostics(label string, restoredPID int, log *logrus.Entry) {
	log.Infof("=== RESTORE BOUNDARY DIAGNOSTICS [%s] ===", label)

	// Per-process probes, run in this order for each PID.
	perProcess := []func(int, *logrus.Entry){
		logProcessNamespaces,
		logProcessCgroupPath,
		logProcessFilteredMountInfo,
	}
	for _, target := range processSnapshotPIDs(restoredPID) {
		for _, probe := range perProcess {
			probe(target, log)
		}
	}

	// Probes that do not depend on a PID.
	logCgroupV2HostInfo(log)
	logNvidiaDeviceNodeMetadata(log)

	log.Infof("=== END RESTORE BOUNDARY DIAGNOSTICS [%s] ===", label)
}
// Restore performs the CRIU restore operation using go-criu. // Restore performs the CRIU restore operation using go-criu.
// All CRIU options are read from the saved CheckpointManifest - no hardcoding.
// Returns the PID of the restored process. // Returns the PID of the restored process.
func Restore(ctx context.Context, opts *RestoreOptions, log *logrus.Entry) (int, error) { func Restore(ctx context.Context, checkpointPath string, data *checkpoint.CheckpointManifest, log *logrus.Entry) (int, error) {
log.WithField("checkpoint", opts.CheckpointPath).Info("Starting CRIU restore") if data == nil {
return 0, fmt.Errorf("checkpoint manifest is required")
}
// Hardcoded restore constants
const (
rootPath = "/"
pidFile = "/tmp/restored.pid"
logFile = RestoreLogFilename
)
log.WithField("checkpoint", checkpointPath).Info("Starting CRIU restore")
// 1. Open checkpoint directory // 1. Open checkpoint directory
imageDir, imageDirFD, err := OpenImageDir(opts.CheckpointPath) imageDir, imageDirFD, err := OpenImageDir(checkpointPath)
if err != nil { if err != nil {
return 0, err return 0, err
} }
defer imageDir.Close() defer imageDir.Close()
log.WithField("fd", imageDirFD).Debug("Opened checkpoint directory")
// 2. Generate external mount mappings if not already set // 2. Generate external mount mappings from saved CheckpointManifest
if opts.ExtMountMaps == nil { extMounts, err := GenerateExtMountMaps(data)
extMounts, err := GenerateExtMountMaps(nil) if err != nil {
if err != nil { return 0, fmt.Errorf("failed to generate mount maps: %w", err)
return 0, fmt.Errorf("failed to generate mount maps: %w", err)
}
opts.ExtMountMaps = extMounts
} }
log.WithField("mount_count", len(opts.ExtMountMaps)).Debug("External mount maps ready")
// 3. Open target network namespace // 3. Open target network namespace
netNsFile, netNsFD, err := OpenNetworkNamespace("/proc/1/ns/net") netNsFile, netNsFD, err := OpenNetworkNamespace("/proc/1/ns/net")
...@@ -42,53 +244,44 @@ func Restore(ctx context.Context, opts *RestoreOptions, log *logrus.Entry) (int, ...@@ -42,53 +244,44 @@ func Restore(ctx context.Context, opts *RestoreOptions, log *logrus.Entry) (int,
return 0, err return 0, err
} }
defer netNsFile.Close() defer netNsFile.Close()
log.WithField("fd", netNsFD).Debug("Opened target network namespace")
// 4. Open work directory if specified // 4. Open work directory if specified in checkpoint dump settings.
var workDirFile *os.File var workDirFile *os.File
var workDirFD int32 = -1 var workDirFD int32 = -1
if opts.WorkDir != "" { if data.CRIUDump.CRIU.WorkDir != "" {
workDirFile, workDirFD = OpenWorkDir(opts.WorkDir, log) workDirFile, workDirFD = OpenWorkDir(data.CRIUDump.CRIU.WorkDir, log)
if workDirFile != nil { if workDirFile != nil {
defer workDirFile.Close() defer workDirFile.Close()
} }
} }
// 5. Build CRIU options // 5. Build CRIU options from saved checkpoint manifest.
cfg := CRIURestoreConfig{ plan := CRIURestorePlan{
ImageDirFD: imageDirFD, // File descriptors
RootPath: opts.RootPath, ImageDirFD: imageDirFD,
LogLevel: opts.LogLevel, WorkDirFD: workDirFD,
LogFile: opts.LogFile, NetNsFD: netNsFD,
WorkDirFD: workDirFD, // Paths
NetNsFD: netNsFD, RootPath: rootPath,
ExtMountMaps: opts.ExtMountMaps, LogFile: logFile,
// Options from CheckpointManifest.CRIUDump.CRIU
LogLevel: data.CRIUDump.CRIU.LogLevel,
Timeout: data.CRIUDump.CRIU.Timeout,
ShellJob: data.CRIUDump.CRIU.ShellJob,
TcpClose: data.CRIUDump.CRIU.TcpClose,
FileLocks: data.CRIUDump.CRIU.FileLocks,
ExtUnixSk: data.CRIUDump.CRIU.ExtUnixSk,
LinkRemap: data.CRIUDump.CRIU.LinkRemap,
ManageCgroupsMode: data.CRIUDump.CRIU.ManageCgroupsMode,
// External mounts
ExtMountMaps: extMounts,
} }
criuOpts := BuildRestoreCRIUOpts(cfg) criuOpts := BuildCRIURestoreOptions(plan)
// 6. Create CRIU config file for CUDA plugin if libdir is specified // 6. Reuse criu.conf from checkpoint time if it exists.
if opts.LibDir != "" { criuConfPath := filepath.Join(checkpointPath, checkpoint.CheckpointCRIUConfFilename)
if opts.Timeout == 0 { if _, err := os.Stat(criuConfPath); err == nil {
return 0, fmt.Errorf("CRIU_TIMEOUT environment variable must be set for CUDA restores") criuOpts.ConfigFile = proto.String(criuConfPath)
}
configPath := filepath.Join(opts.CheckpointPath, "restore-criu.conf")
configContent := fmt.Sprintf(`enable-external-masters
libdir %s
tcp-close
link-remap
timeout %d
allow-uprobes
skip-in-flight
`, opts.LibDir, opts.Timeout)
if err := os.WriteFile(configPath, []byte(configContent), 0644); err != nil {
log.WithError(err).Warn("Failed to write CRIU config file for restore")
} else {
criuOpts.ConfigFile = proto.String(configPath)
log.WithFields(logrus.Fields{
"config_path": configPath,
"lib_dir": opts.LibDir,
}).Info("Created CRIU config file with libdir for CUDA plugin")
}
} }
// 7. Execute CRIU restore // 7. Execute CRIU restore
...@@ -99,7 +292,7 @@ skip-in-flight ...@@ -99,7 +292,7 @@ skip-in-flight
criuExecStart := time.Now() criuExecStart := time.Now()
if err := c.Restore(criuOpts, notify); err != nil { if err := c.Restore(criuOpts, notify); err != nil {
log.WithField("duration", time.Since(criuExecStart)).Error("CRIU c.Restore failed") log.WithField("duration", time.Since(criuExecStart)).Error("CRIU c.Restore failed")
logCRIUErrors(opts.CheckpointPath, opts.LogFile, log) logCRIUErrors(checkpointPath, logFile, log)
return 0, fmt.Errorf("CRIU restore failed: %w", err) return 0, fmt.Errorf("CRIU restore failed: %w", err)
} }
...@@ -114,15 +307,11 @@ skip-in-flight ...@@ -114,15 +307,11 @@ skip-in-flight
} }
// Fallback: try to read from PID file // Fallback: try to read from PID file
if opts.PidFile != "" { pid, err := WaitForPidFile(pidFile, 10*time.Second, log)
pid, err := WaitForPidFile(opts.PidFile, 10*time.Second, log) if err != nil {
if err != nil { return 0, fmt.Errorf("failed to get restored PID: %w", err)
return 0, fmt.Errorf("failed to get restored PID: %w", err)
}
return pid, nil
} }
return pid, nil
return 0, fmt.Errorf("could not determine restored process PID")
} }
// logCRIUErrors reads CRIU log file and logs errors. // logCRIUErrors reads CRIU log file and logs errors.
...@@ -142,62 +331,58 @@ func logCRIUErrors(checkpointPath, logFile string, log *logrus.Entry) { ...@@ -142,62 +331,58 @@ func logCRIUErrors(checkpointPath, logFile string, log *logrus.Entry) {
} }
log.Error("=== CRIU RESTORE LOG END ===") log.Error("=== CRIU RESTORE LOG END ===")
// Copy log to shared directory if CRIU_LOG_DIR is set // Copy log to shared directory for debugging
if logDir := os.Getenv("CRIU_LOG_DIR"); logDir != "" { if err := os.MkdirAll(CRIULogDir, 0755); err == nil {
if err := os.MkdirAll(logDir, 0755); err == nil { destPath := filepath.Join(CRIULogDir, fmt.Sprintf("restore-%d.log", time.Now().Unix()))
destPath := filepath.Join(logDir, fmt.Sprintf("restore-%d.log", time.Now().Unix())) if err := os.WriteFile(destPath, data, 0644); err == nil {
if err := os.WriteFile(destPath, data, 0644); err == nil { log.WithField("path", destPath).Info("CRIU log copied to shared directory")
log.WithField("path", destPath).Info("CRIU log copied to shared directory")
}
} }
} }
} }
// Run is the main entry point for the restore entrypoint. // Run is the main entry point for the restore entrypoint.
// It orchestrates the entire restore process. // It orchestrates the entire restore process.
func Run(ctx context.Context, cfg *Config, log *logrus.Entry) error { func Run(ctx context.Context, cfg *RestoreRequest, log *logrus.Entry) error {
log.Info("=== Self-Restoring Placeholder Entrypoint ===") log.Info("=== Restore Entrypoint ===")
log.WithFields(logrus.Fields{ log.WithFields(logrus.Fields{
"checkpoint_path": cfg.CheckpointPath, "checkpoint_path": cfg.CheckpointPath,
"checkpoint_hash": cfg.CheckpointHash, "checkpoint_hash": cfg.CheckpointHash,
"embedded_checkpoint_path": cfg.EmbeddedCheckpointPath, "checkpoint_location": cfg.CheckpointLocation,
"wait_for_checkpoint": cfg.WaitForCheckpoint, "skip_wait_for_checkpoint": cfg.SkipWaitForCheckpoint,
"restore_marker_file": cfg.RestoreMarkerFile, "cold_start_args": cfg.ColdStartArgs,
}).Info("Configuration") }).Debug("Configuration")
// Check CRIU availability // Check CRIU availability
c := criu.MakeCriu() c := criu.MakeCriu()
version, err := c.GetCriuVersion() if _, err := c.GetCriuVersion(); err != nil {
if err != nil {
log.WithError(err).Error("CRIU is not available") log.WithError(err).Error("CRIU is not available")
log.Info("Falling back to default command") return ExecColdStart(cfg, log)
return RunDefault(cfg, log)
} }
log.WithField("version", version).Info("CRIU version")
// Determine checkpoint path // Determine checkpoint path based on mode
var checkpointPath string var checkpointPath string
var shouldRestore bool
// Check if we should restore immediately
checkpointPath, shouldRestore = ShouldRestore(cfg, log)
// If not and we're configured to wait, wait for checkpoint if cfg.SkipWaitForCheckpoint {
if !shouldRestore && cfg.WaitForCheckpoint { // Operator path: check once, restore if ready, otherwise cold start
log.Info("Waiting for checkpoint...") var ready bool
var err error checkpointPath, ready = ShouldRestore(cfg, log)
checkpointPath, err = WaitForCheckpoint(ctx, cfg, log) if !ready {
if err != nil { log.Info("No checkpoint ready, executing cold start command")
log.WithError(err).Info("No checkpoint received, running default command") return ExecColdStart(cfg, log)
return RunDefault(cfg, log) }
} else {
// Standalone/DaemonSet path: check first, then poll if needed
var ready bool
checkpointPath, ready = ShouldRestore(cfg, log)
if !ready {
log.Info("Waiting for checkpoint...")
var err error
checkpointPath, err = WaitForCheckpoint(ctx, cfg, log)
if err != nil {
log.WithError(err).Info("No checkpoint received")
return ExecColdStart(cfg, log)
}
} }
shouldRestore = true
}
// If no checkpoint, run default command
if !shouldRestore {
log.Info("No checkpoint configured, running default command")
return RunDefault(cfg, log)
} }
// Perform restore // Perform restore
...@@ -205,68 +390,61 @@ func Run(ctx context.Context, cfg *Config, log *logrus.Entry) error { ...@@ -205,68 +390,61 @@ func Run(ctx context.Context, cfg *Config, log *logrus.Entry) error {
restoreStart := time.Now() restoreStart := time.Now()
// Apply filesystem changes // Apply filesystem changes
rootfsDiffStart := time.Now()
if err := ApplyRootfsDiff(checkpointPath, "/", log); err != nil { if err := ApplyRootfsDiff(checkpointPath, "/", log); err != nil {
log.WithError(err).Error("Failed to apply rootfs diff") log.WithError(err).Error("Failed to apply rootfs diff")
} }
log.WithField("duration", time.Since(rootfsDiffStart)).Info("ApplyRootfsDiff completed")
deletedFilesStart := time.Now()
if err := ApplyDeletedFiles(checkpointPath, "/", log); err != nil { if err := ApplyDeletedFiles(checkpointPath, "/", log); err != nil {
log.WithError(err).Error("Failed to apply deleted files") log.WithError(err).Error("Failed to apply deleted files")
} }
log.WithField("duration", time.Since(deletedFilesStart)).Info("ApplyDeletedFiles completed")
// Load restore options from metadata // Load checkpoint manifest (contains CRIU settings + mounts + namespaces).
loadOptsStart := time.Now() data, err := checkpoint.ReadCheckpointManifest(checkpointPath)
opts, err := LoadRestoreOptions(checkpointPath, cfg.CRIULogLevel)
if err != nil { if err != nil {
log.WithError(err).Warn("Could not load restore options from metadata, using defaults") log.WithError(err).Error("Failed to load checkpoint manifest")
return ExecColdStart(cfg, log)
} }
log.WithField("duration", time.Since(loadOptsStart)).Info("LoadRestoreOptions completed")
// Apply additional config options // Write restore marker file before CRIU restore
if cfg.CRIUWorkDir != "" { restoreMarkerFile := cfg.RestoreMarkerFilePath
opts.WorkDir = cfg.CRIUWorkDir if err := os.MkdirAll(filepath.Dir(restoreMarkerFile), 0755); err != nil {
log.WithError(err).Warn("Failed to create restore marker directory")
}
if err := os.WriteFile(restoreMarkerFile, []byte("restored"), 0644); err != nil {
log.WithError(err).Warn("Failed to write restore marker file")
} }
// Set CUDA plugin directory and timeout for restore config file // Restore /dev/shm contents before CRIU restore
if cfg.CUDAPluginDir != "" { if err := RestoreDevShm(checkpointPath, log); err != nil {
if cfg.CRIUTimeout == 0 { log.WithError(err).Error("Failed to restore /dev/shm contents - CRIU restore may fail with missing FD errors")
return fmt.Errorf("CRIU_TIMEOUT environment variable must be set for CUDA restores")
}
opts.LibDir = cfg.CUDAPluginDir
opts.Timeout = cfg.CRIUTimeout
log.WithFields(logrus.Fields{
"lib_dir": cfg.CUDAPluginDir,
"timeout": cfg.CRIUTimeout,
}).Info("CUDA plugin directory and timeout configured for restore")
} }
// Write restore marker file before CRIU restore // Create link_remap stub files for unlinked files referenced in CRIU images
// This allows the restored process to detect it's been restored if err := CreateLinkRemapStubs(checkpointPath, log); err != nil {
if cfg.RestoreMarkerFile != "" { log.WithError(err).Warn("Failed to create link_remap stubs")
if err := os.WriteFile(cfg.RestoreMarkerFile, []byte("restored"), 0644); err != nil {
log.WithError(err).Warn("Failed to write restore marker file")
} else {
log.WithField("path", cfg.RestoreMarkerFile).Info("Wrote restore marker file")
}
} }
// Log GPU diagnostics right before CRIU restore to track device visibility changes
LogGPUDiagnostics("PRE-CRIU-RESTORE", log)
LogRestoreBoundaryDiagnostics("PRE-CRIU-RESTORE", 0, log)
// Perform CRIU restore (CUDA plugin handles CUDA state automatically) // Perform CRIU restore (CUDA plugin handles CUDA state automatically)
criuRestoreStart := time.Now() criuRestoreStart := time.Now()
pid, err := Restore(ctx, opts, log) pid, err := Restore(ctx, checkpointPath, data, log)
if err != nil { if err != nil {
log.WithField("duration", time.Since(criuRestoreStart)).WithError(err).Error("Restore failed, falling back to default command") log.WithField("duration", time.Since(criuRestoreStart)).WithError(err).Error("Restore failed, falling back to default command")
if cfg.Debug { if cfg.Debug {
log.Info("DEBUG mode: sleeping 300s to allow log collection...") log.Info("DEBUG mode: sleeping 300s to allow log collection...")
time.Sleep(300 * time.Second) time.Sleep(300 * time.Second)
} }
return RunDefault(cfg, log) return ExecColdStart(cfg, log)
} }
criuRestoreDuration := time.Since(criuRestoreStart) criuRestoreDuration := time.Since(criuRestoreStart)
log.WithField("duration", criuRestoreDuration).Info("CRIU Restore completed (CUDA state restored by plugin)") log.WithField("duration", criuRestoreDuration).Info("CRIU Restore completed (CUDA state restored by plugin)")
// Log GPU diagnostics AFTER restore to compare with pre-restore
LogGPUDiagnostics("POST-RESTORE", log)
LogRestoreBoundaryDiagnostics("POST-RESTORE", pid, log)
totalDuration := time.Since(restoreStart) totalDuration := time.Since(restoreStart)
log.WithFields(logrus.Fields{ log.WithFields(logrus.Fields{
"total_duration": totalDuration, "total_duration": totalDuration,
......
// Package restore provides CRIU restore operations.
package restore
import (
"fmt"
"io"
"os"
"path/filepath"
"github.com/sirupsen/logrus"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
)
// RestoreDevShm copies the files saved in the checkpoint's dev-shm directory
// into /dev/shm. It must run BEFORE the CRIU restore so that the shared memory
// files exist when CRIU restores file descriptors that point at them.
//
// A missing or empty dev-shm directory is not an error. Sub-directories are
// ignored, and a file that cannot be inspected or copied is logged with a
// warning and skipped. An error is returned only when the checkpoint directory
// cannot be read or /dev/shm cannot be created.
func RestoreDevShm(checkpointPath string, log *logrus.Entry) error {
	const shmDir = "/dev/shm"
	savedDir := filepath.Join(checkpointPath, checkpoint.DevShmDirName)

	items, err := os.ReadDir(savedDir)
	switch {
	case err == nil:
		// Directory is present; carry on below.
	case os.IsNotExist(err):
		log.Debug("No dev-shm directory in checkpoint, skipping restore")
		return nil
	default:
		return fmt.Errorf("failed to read checkpoint dev-shm directory: %w", err)
	}
	if len(items) == 0 {
		log.Debug("Checkpoint dev-shm directory is empty")
		return nil
	}

	// Make sure the destination exists before copying anything into it.
	if mkErr := os.MkdirAll(shmDir, 0777); mkErr != nil {
		return fmt.Errorf("failed to ensure /dev/shm exists: %w", mkErr)
	}

	var (
		names         []string
		bytesRestored int64
	)
	for _, item := range items {
		if item.IsDir() {
			continue
		}
		fileName := item.Name()

		meta, infoErr := item.Info()
		if infoErr != nil {
			log.WithError(infoErr).WithField("file", fileName).Warn("Failed to get file info, skipping")
			continue
		}

		from := filepath.Join(savedDir, fileName)
		to := filepath.Join(shmDir, fileName)
		if copyErr := copyFileToShm(from, to, meta.Mode()); copyErr != nil {
			log.WithError(copyErr).WithField("file", fileName).Warn("Failed to restore file, skipping")
			continue
		}

		fileSize := meta.Size()
		names = append(names, fileName)
		bytesRestored += fileSize
		log.WithFields(logrus.Fields{
			"file": fileName,
			"size": fileSize,
		}).Debug("Restored /dev/shm file")
	}

	if len(names) == 0 {
		return nil
	}
	log.WithFields(logrus.Fields{
		"count":      len(names),
		"total_size": bytesRestored,
		"files":      names,
	}).Info("Restored /dev/shm files from checkpoint")
	return nil
}
// copyFileToShm copies the file at src to dest (a path under /dev/shm),
// creating dest if needed and truncating it otherwise.
//
// mode is the permission set dest must end up with; a zero mode is treated as
// "not set" and replaced by 0666. The mode is applied with an explicit Chmod
// because the mode handed to OpenFile is filtered through the process umask
// and is ignored altogether when dest already exists, so OpenFile alone does
// not preserve the original mode.
//
// It returns a wrapped error when src cannot be opened, dest cannot be
// created, the mode cannot be applied, the copy fails, or dest cannot be
// closed cleanly.
func copyFileToShm(src, dest string, mode os.FileMode) error {
	srcFile, err := os.Open(src)
	if err != nil {
		return fmt.Errorf("failed to open source: %w", err)
	}
	defer srcFile.Close()

	// Default to 0666 when mode is not set (mode == 0)
	if mode == 0 {
		mode = 0666
	}

	destFile, err := os.OpenFile(dest, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
	if err != nil {
		return fmt.Errorf("failed to create destination: %w", err)
	}

	// Force the exact mode regardless of umask or a pre-existing file.
	if err := destFile.Chmod(mode); err != nil {
		destFile.Close()
		return fmt.Errorf("failed to set destination mode: %w", err)
	}

	if _, err := io.Copy(destFile, srcFile); err != nil {
		destFile.Close()
		return fmt.Errorf("failed to copy contents: %w", err)
	}

	// Close explicitly: a write error may only be reported at close time.
	if err := destFile.Close(); err != nil {
		return fmt.Errorf("failed to close destination: %w", err)
	}
	return nil
}
...@@ -21,18 +21,6 @@ import ( ...@@ -21,18 +21,6 @@ import (
"k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/cache"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint" "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
checkpointk8s "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint/k8s"
)
const (
// LabelCheckpointSource is the label that triggers automatic checkpointing
LabelCheckpointSource = "nvidia.com/checkpoint-source"
// LabelCheckpointHash is the label specifying the checkpoint identity hash
LabelCheckpointHash = "nvidia.com/checkpoint-hash"
// EnvCheckpointSignalFile is the env var in the pod specifying the signal file path
EnvCheckpointSignalFile = "DYN_CHECKPOINT_SIGNAL_FILE"
) )
// SignalFile represents the content of a checkpoint completion signal file // SignalFile represents the content of a checkpoint completion signal file
...@@ -44,26 +32,21 @@ type SignalFile struct { ...@@ -44,26 +32,21 @@ type SignalFile struct {
Error string `json:"error,omitempty"` Error string `json:"error,omitempty"`
} }
// Config holds watcher configuration // WatcherConfig holds watcher configuration.
type Config struct { type WatcherConfig struct {
NodeName string NodeName string
CheckpointDir string
HostProc string
ListenAddr string // HTTP server address for health checks (e.g., ":8080") ListenAddr string // HTTP server address for health checks (e.g., ":8080")
RestrictedNamespace string // Optional: restrict watching to this namespace (empty = cluster-wide) RestrictedNamespace string // Optional: restrict watching to this namespace (empty = cluster-wide)
// GPU/CUDA checkpoint options (passed to checkpoint.Options) // Checkpoint configuration (from ConfigMap)
CUDAPluginDir string // Path to CRIU CUDA plugin directory CheckpointSpec *checkpoint.CheckpointSpec
GhostLimit uint32 // Ghost file size limit in bytes (default: 512MB for GPU)
Timeout uint32 // CRIU timeout in seconds
ExternalMounts []string // Additional external mount mappings
} }
// Watcher watches for pods with checkpoint labels and triggers checkpoints // Watcher watches for pods with checkpoint labels and triggers checkpoints
type Watcher struct { type Watcher struct {
config Config config WatcherConfig
clientset kubernetes.Interface clientset kubernetes.Interface
discoveryClient *checkpointk8s.DiscoveryClient discoveryClient *checkpoint.DiscoveryClient
checkpointer *checkpoint.Checkpointer checkpointer *checkpoint.Checkpointer
log *logrus.Entry log *logrus.Entry
...@@ -75,7 +58,7 @@ type Watcher struct { ...@@ -75,7 +58,7 @@ type Watcher struct {
} }
// NewWatcher creates a new pod watcher // NewWatcher creates a new pod watcher
func NewWatcher(cfg Config, discoveryClient *checkpointk8s.DiscoveryClient, checkpointer *checkpoint.Checkpointer) (*Watcher, error) { func NewWatcher(cfg WatcherConfig, discoveryClient *checkpoint.DiscoveryClient, checkpointer *checkpoint.Checkpointer) (*Watcher, error) {
// Create in-cluster Kubernetes client // Create in-cluster Kubernetes client
restConfig, err := rest.InClusterConfig() restConfig, err := rest.InClusterConfig()
if err != nil { if err != nil {
...@@ -100,10 +83,13 @@ func NewWatcher(cfg Config, discoveryClient *checkpointk8s.DiscoveryClient, chec ...@@ -100,10 +83,13 @@ func NewWatcher(cfg Config, discoveryClient *checkpointk8s.DiscoveryClient, chec
// Start begins watching for pods and starts the health check server // Start begins watching for pods and starts the health check server
func (w *Watcher) Start(ctx context.Context) error { func (w *Watcher) Start(ctx context.Context) error {
if w.config.CheckpointSpec == nil {
return fmt.Errorf("checkpoint spec is required")
}
w.log.WithFields(logrus.Fields{ w.log.WithFields(logrus.Fields{
"node": w.config.NodeName, "node": w.config.NodeName,
"label": LabelCheckpointSource, "label": checkpoint.KubeLabelCheckpointSource,
"signal_file_env": EnvCheckpointSignalFile,
}).Info("Starting pod watcher") }).Info("Starting pod watcher")
// Start health check HTTP server if address is configured // Start health check HTTP server if address is configured
...@@ -118,7 +104,7 @@ func (w *Watcher) Start(ctx context.Context) error { ...@@ -118,7 +104,7 @@ func (w *Watcher) Start(ctx context.Context) error {
// Create informer factory with label selector and optional namespace restriction // Create informer factory with label selector and optional namespace restriction
labelSelector := labels.SelectorFromSet(labels.Set{ labelSelector := labels.SelectorFromSet(labels.Set{
LabelCheckpointSource: "true", checkpoint.KubeLabelCheckpointSource: "true",
}).String() }).String()
factoryOptions := []informers.SharedInformerOption{ factoryOptions := []informers.SharedInformerOption{
...@@ -232,7 +218,7 @@ func (w *Watcher) handlePodEvent(ctx context.Context, pod *corev1.Pod) { ...@@ -232,7 +218,7 @@ func (w *Watcher) handlePodEvent(ctx context.Context, pod *corev1.Pod) {
podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name) podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)
// Get checkpoint ID from label (uses the checkpoint hash) // Get checkpoint ID from label (uses the checkpoint hash)
checkpointID, ok := pod.Labels[LabelCheckpointHash] checkpointID, ok := pod.Labels[checkpoint.KubeLabelCheckpointHash]
if !ok || checkpointID == "" { if !ok || checkpointID == "" {
w.log.WithField("pod", podKey).Warn("Pod has checkpoint label but no checkpoint-hash label") w.log.WithField("pod", podKey).Warn("Pod has checkpoint label but no checkpoint-hash label")
return return
...@@ -282,12 +268,14 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI ...@@ -282,12 +268,14 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI
// Find the main container and get signal file path from env // Find the main container and get signal file path from env
var containerID string var containerID string
var containerName string
var signalFilePath string var signalFilePath string
for _, container := range pod.Spec.Containers { for _, container := range pod.Spec.Containers {
if container.Name == "main" || len(pod.Spec.Containers) == 1 { if container.Name == "main" || len(pod.Spec.Containers) == 1 {
containerName = container.Name
// Get signal file path from environment // Get signal file path from environment
for _, env := range container.Env { for _, env := range container.Env {
if env.Name == EnvCheckpointSignalFile { if env.Name == "DYN_CHECKPOINT_SIGNAL_FILE" {
signalFilePath = env.Value signalFilePath = env.Value
break break
} }
...@@ -325,8 +313,8 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI ...@@ -325,8 +313,8 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI
"signal_file_path": signalFilePath, "signal_file_path": signalFilePath,
}).Info("Found container, starting checkpoint") }).Info("Found container, starting checkpoint")
// Resolve container to get PID for signal file writing // Resolve container to get PID for signal file writing.
containerInfo, err := w.discoveryClient.ResolveContainer(ctx, containerID) containerPID, _, err := w.discoveryClient.ResolveContainer(ctx, containerID)
if err != nil { if err != nil {
log.WithError(err).Error("Failed to resolve container") log.WithError(err).Error("Failed to resolve container")
w.checkpointedMu.Lock() w.checkpointedMu.Lock()
...@@ -335,28 +323,34 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI ...@@ -335,28 +323,34 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI
return return
} }
// Validate CheckpointSpec is set
if w.config.CheckpointSpec == nil {
log.Error("CheckpointSpec is nil - cannot perform checkpoint")
w.checkpointedMu.Lock()
delete(w.checkpointed, podKey)
w.checkpointedMu.Unlock()
return
}
// Perform checkpoint // Perform checkpoint
opts := checkpoint.Options{ params := checkpoint.CheckpointRequest{
ContainerID: containerID, ContainerID: containerID,
CheckpointID: checkpointID, ContainerName: containerName,
CheckpointDir: w.config.CheckpointDir, CheckpointID: checkpointID,
NodeName: w.config.NodeName, CheckpointDir: w.config.CheckpointSpec.BasePath,
PodName: pod.Name, NodeName: w.config.NodeName,
PodNamespace: pod.Namespace, PodName: pod.Name,
CUDAPluginDir: w.config.CUDAPluginDir, PodNamespace: pod.Namespace,
GhostLimit: w.config.GhostLimit,
Timeout: w.config.Timeout,
ExternalMounts: w.config.ExternalMounts,
} }
result, err := w.checkpointer.Checkpoint(ctx, opts) result, err := w.checkpointer.Checkpoint(ctx, params, w.config.CheckpointSpec)
if err != nil { if err != nil {
log.WithError(err).Error("Checkpoint failed") log.WithError(err).Error("Checkpoint failed")
// Write failure marker to PVC so restore pods know checkpoint failed // Write failure marker to PVC so restore pods know checkpoint failed
checkpointDir := filepath.Join(w.config.CheckpointDir, checkpointID) checkpointDir := filepath.Join(w.config.CheckpointSpec.BasePath, checkpointID)
w.writeCheckpointDoneMarker(checkpointDir, checkpointID, false, err.Error(), log) w.writeCheckpointDoneMarker(checkpointDir, checkpointID, false, err.Error(), log)
if signalFilePath != "" { if signalFilePath != "" {
w.writeSignalFileToPod(int(containerInfo.PID), signalFilePath, checkpointID, "", false, err.Error()) w.writeSignalFileToPod(containerPID, signalFilePath, checkpointID, "", false, err.Error())
} }
// Clear the in_progress status so checkpoint can be retried // Clear the in_progress status so checkpoint can be retried
w.checkpointedMu.Lock() w.checkpointedMu.Lock()
...@@ -368,12 +362,11 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI ...@@ -368,12 +362,11 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointI
log.WithField("checkpoint_dir", result.CheckpointDir).Info("Checkpoint completed successfully") log.WithField("checkpoint_dir", result.CheckpointDir).Info("Checkpoint completed successfully")
// Write checkpoint.done marker to PVC for cross-node restore detection // Write checkpoint.done marker to PVC for cross-node restore detection
// This is written AFTER rootfs-diff.tar is complete, so it's safe to use as a completion marker
w.writeCheckpointDoneMarker(result.CheckpointDir, checkpointID, true, "", log) w.writeCheckpointDoneMarker(result.CheckpointDir, checkpointID, true, "", log)
// Write signal file to pod's hostPath for checkpoint job pod to exit // Write signal file to pod's hostPath for checkpoint job pod to exit
if signalFilePath != "" { if signalFilePath != "" {
w.writeSignalFileToPod(int(containerInfo.PID), signalFilePath, checkpointID, result.CheckpointDir, true, "") w.writeSignalFileToPod(containerPID, signalFilePath, checkpointID, result.CheckpointDir, true, "")
} }
// Mark as completed so we don't checkpoint again // Mark as completed so we don't checkpoint again
...@@ -400,8 +393,7 @@ func (w *Watcher) writeSignalFileToPod(pid int, signalFilePath, checkpointID, ch ...@@ -400,8 +393,7 @@ func (w *Watcher) writeSignalFileToPod(pid int, signalFilePath, checkpointID, ch
} }
// Write to the pod's filesystem via /proc/<pid>/root // Write to the pod's filesystem via /proc/<pid>/root
// signalFilePath is the path inside the pod (e.g., /var/lib/dynamo-checkpoint/signal.done) hostSignalPath := fmt.Sprintf("%s/%d/root%s", checkpoint.HostProcPath, pid, signalFilePath)
hostSignalPath := fmt.Sprintf("%s/%d/root%s", w.config.HostProc, pid, signalFilePath)
// Ensure signal directory exists in pod's filesystem // Ensure signal directory exists in pod's filesystem
signalDir := filepath.Dir(hostSignalPath) signalDir := filepath.Dir(hostSignalPath)
...@@ -424,11 +416,8 @@ func (w *Watcher) writeSignalFileToPod(pid int, signalFilePath, checkpointID, ch ...@@ -424,11 +416,8 @@ func (w *Watcher) writeSignalFileToPod(pid int, signalFilePath, checkpointID, ch
} }
// writeCheckpointDoneMarker writes a checkpoint.done marker file to the checkpoint directory on shared PVC. // writeCheckpointDoneMarker writes a checkpoint.done marker file to the checkpoint directory on shared PVC.
// This file is written AFTER all checkpoint steps complete (including rootfs-diff.tar).
// Restore pods on ANY node check for this file to know the checkpoint is complete and safe to restore.
// This is separate from writeSignalFileToPod which signals the checkpoint job pod to exit.
func (w *Watcher) writeCheckpointDoneMarker(checkpointDir, checkpointID string, success bool, errMsg string, log *logrus.Entry) { func (w *Watcher) writeCheckpointDoneMarker(checkpointDir, checkpointID string, success bool, errMsg string, log *logrus.Entry) {
markerPath := filepath.Join(checkpointDir, "checkpoint.done") markerPath := filepath.Join(checkpointDir, checkpoint.CheckpointDoneFilename)
marker := SignalFile{ marker := SignalFile{
CheckpointID: checkpointID, CheckpointID: checkpointID,
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Smart entrypoint wrapper for CRIU checkpoint/restore
# Automatically detects checkpoints and falls back to cold start if not found
#
# Behavior (checked in this order):
# 1. If WAIT_FOR_CHECKPOINT=1 -> invoke restore-entrypoint, which waits for the checkpoint
# 2. If DYN_CHECKPOINT_HASH is set and the checkpoint's checkpoint.done marker exists -> restore
# 3. Otherwise -> execute provided command (cold start)
# Abort on the first failing command.
set -e

# DEBUG=1 turns on command tracing; any other value (or unset) leaves it off.
case "${DEBUG:-0}" in
    1) set -x ;;
esac

# Settings read from the environment, each with a fallback when unset or empty:
#   DYN_CHECKPOINT_PATH  -> root directory holding checkpoints (default /checkpoints)
#   DYN_CHECKPOINT_HASH  -> name of the checkpoint subdirectory (default empty)
#   WAIT_FOR_CHECKPOINT  -> "1" to always go through restore-entrypoint (default 0)
CHECKPOINT_PATH="${DYN_CHECKPOINT_PATH:-/checkpoints}"
CHECKPOINT_HASH="${DYN_CHECKPOINT_HASH:-}"
WAIT_FOR_CHECKPOINT="${WAIT_FOR_CHECKPOINT:-0}"
# Emit a diagnostic line on stderr, tagged with the script name.
# All arguments are joined into a single message.
log() {
    printf '%s\n' "[smart-entrypoint] $*" >&2
}
# Decide whether startup should go through restore-entrypoint.
# Returns 0 (restore) or 1 (cold start). Reads WAIT_FOR_CHECKPOINT,
# CHECKPOINT_HASH and CHECKPOINT_PATH; sets CHECKPOINT_DIR and DONE_MARKER
# as it evaluates them.
should_restore_checkpoint() {
    # Waiting mode wins over every other check: restore-entrypoint is expected
    # to wait until the checkpoint appears.
    case "$WAIT_FOR_CHECKPOINT" in
        1)
            log "WAIT_FOR_CHECKPOINT=1, will wait for checkpoint via restore-entrypoint"
            return 0
            ;;
    esac

    # Without a hash there is no checkpoint to look for.
    [ -n "$CHECKPOINT_HASH" ] || {
        log "DYN_CHECKPOINT_HASH not set, no checkpoint to restore"
        return 1
    }

    # The checkpoint lives in a directory named after its hash.
    CHECKPOINT_DIR="$CHECKPOINT_PATH/$CHECKPOINT_HASH"
    [ -d "$CHECKPOINT_DIR" ] || {
        log "Checkpoint directory not found: $CHECKPOINT_DIR"
        return 1
    }

    # checkpoint.done is the completion marker, written LAST in the checkpoint
    # process. It is more reliable than inventory.img (created by CRIU) or
    # rootfs-diff.tar (may be mid-write).
    # Order: metadata.json -> CRIU dump (*.img) -> rootfs-diff.tar -> checkpoint.done
    DONE_MARKER="$CHECKPOINT_DIR/checkpoint.done"
    [ -f "$DONE_MARKER" ] || {
        log "Checkpoint incomplete - checkpoint.done not found in: $CHECKPOINT_DIR"
        log "Checkpoint may still be in progress..."
        return 1
    }

    log "Checkpoint found: $CHECKPOINT_HASH (checkpoint.done marker present)"
    return 0
}
# Entry point: pick cold start or checkpoint restore, then replace this shell
# with the chosen program.
banner_rule="=========================================="
if ! should_restore_checkpoint; then
    log "$banner_rule"
    log "COLD START MODE"
    log "$banner_rule"

    # Cold start needs a command to run; with no arguments there is nothing
    # to fall back to, so report the problem and fail.
    if [ "$#" -eq 0 ]; then
        log "ERROR: No checkpoint to restore and no command provided"
        log "Set DYN_CHECKPOINT_HASH to restore a checkpoint, or provide a command to run"
        exit 1
    fi

    log "No checkpoint to restore"
    log "Executing command: $*"
    log "$banner_rule"

    # Hand the process over to the provided command.
    exec "$@"
else
    log "$banner_rule"
    log "CHECKPOINT RESTORE MODE"
    log "$banner_rule"
    log "Checkpoint: $CHECKPOINT_HASH"
    log "Location: $CHECKPOINT_PATH/$CHECKPOINT_HASH"
    log "Invoking restore-entrypoint..."
    log "$banner_rule"

    # Arguments are forwarded unchanged (the original notes say
    # restore-entrypoint ignores them).
    exec /restore-entrypoint "$@"
fi
...@@ -63,7 +63,6 @@ See `values.yaml` for all configuration options. ...@@ -63,7 +63,6 @@ See `values.yaml` for all configuration options.
| `storage.pvc.name` | PVC name (must match operator config) | `chrek-pvc` | | `storage.pvc.name` | PVC name (must match operator config) | `chrek-pvc` |
| `storage.pvc.size` | PVC size | `100Gi` | | `storage.pvc.size` | PVC size | `100Gi` |
| `storage.pvc.storageClass` | Storage class name | `""` (default) | | `storage.pvc.storageClass` | Storage class name | `""` (default) |
| `storage.signalHostPath` | Host path for signal files | `/var/lib/chrek/signals` |
| `daemonset.image.repository` | DaemonSet image repository | `nvidia/chrek-agent` | | `daemonset.image.repository` | DaemonSet image repository | `nvidia/chrek-agent` |
| `daemonset.nodeSelector` | Node selector for GPU nodes | `nvidia.com/gpu.present: "true"` | | `daemonset.nodeSelector` | Node selector for GPU nodes | `nvidia.com/gpu.present: "true"` |
| `daemonset.runtimeClassName` | Runtime class for GPU access | `nvidia` | | `daemonset.runtimeClassName` | Runtime class for GPU access | `nvidia` |
...@@ -175,4 +174,3 @@ Ensure your storage class supports `ReadWriteMany` access mode for multi-node de ...@@ -175,4 +174,3 @@ Ensure your storage class supports `ReadWriteMany` access mode for multi-node de
## License ## License
Apache License 2.0 Apache License 2.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "chrek.fullname" . }}-config
namespace: {{ .Release.Namespace }}
labels:
{{- include "chrek.labels" . | nindent 4 }}
data:
config.yaml: |
# Chrek Configuration
# This ConfigMap provides static configuration for the checkpoint agent.
# Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) come from environment variables.
agent:
# How checkpoints are triggered: "http" for REST API, "watcher" for auto-checkpoint
signalSource: {{ .Values.config.agent.signalSource | quote }}
# Watcher/HTTP server address
listenAddr: {{ .Values.config.agent.listenAddr | quote }}
checkpoint:
# Base path for checkpoint directories (shared PVC mount path)
basePath: {{ .Values.storage.pvc.basePath | quote }}
criu:
# RPC options
ghostLimit: {{ .Values.config.checkpoint.criu.ghostLimit }}
timeout: {{ .Values.config.checkpoint.criu.timeout }}
logLevel: {{ .Values.config.checkpoint.criu.logLevel }}
workDir: {{ .Values.config.checkpoint.criu.workDir | quote }}
# K8s-specific options
leaveRunning: {{ .Values.config.checkpoint.criu.leaveRunning }}
shellJob: {{ .Values.config.checkpoint.criu.shellJob }}
tcpClose: {{ .Values.config.checkpoint.criu.tcpClose }}
fileLocks: {{ .Values.config.checkpoint.criu.fileLocks }}
orphanPtsMaster: {{ .Values.config.checkpoint.criu.orphanPtsMaster }}
extUnixSk: {{ .Values.config.checkpoint.criu.extUnixSk }}
linkRemap: {{ .Values.config.checkpoint.criu.linkRemap }}
extMasters: {{ .Values.config.checkpoint.criu.extMasters }}
manageCgroupsMode: {{ .Values.config.checkpoint.criu.manageCgroupsMode | quote }}
# Advanced options
autoDedup: {{ .Values.config.checkpoint.criu.autoDedup }}
lazyPages: {{ .Values.config.checkpoint.criu.lazyPages }}
# Config file options (NOT available via RPC)
libDir: {{ .Values.config.checkpoint.criu.libDir | quote }}
allowUprobes: {{ .Values.config.checkpoint.criu.allowUprobes }}
skipInFlight: {{ .Values.config.checkpoint.criu.skipInFlight }}
rootfsExclusions:
# System directories excluded from rootfs diff (NVIDIA GPU Operator injected)
systemDirs: {{ toYaml .Values.config.checkpoint.rootfsExclusions.systemDirs | nindent 10 }}
# Cache directories to exclude (reduces checkpoint size)
cacheDirs: {{ toYaml .Values.config.checkpoint.rootfsExclusions.cacheDirs | nindent 10 }}
# Additional custom exclusions
additionalExclusions: {{ toYaml .Values.config.checkpoint.rootfsExclusions.additionalExclusions | nindent 10 }}
# NOTE: Restore runtime configuration is NOT in this ConfigMap.
# Placeholder containers do not mount it. Restore uses hardcoded defaults
# + operator-injected env vars. CRIU options come from saved checkpoint manifest.
...@@ -76,13 +76,11 @@ spec: ...@@ -76,13 +76,11 @@ spec:
securityContext: securityContext:
privileged: true privileged: true
env: env:
# Dynamic values from Kubernetes downward API
- name: NODE_NAME - name: NODE_NAME
valueFrom: valueFrom:
fieldRef: fieldRef:
fieldPath: spec.nodeName fieldPath: spec.nodeName
# Agent mode: use "watcher" to watch for labeled pods
- name: CHECKPOINT_SIGNAL_FROM
value: "watcher"
{{- if .Values.rbac.namespaceRestricted }} {{- if .Values.rbac.namespaceRestricted }}
# Restrict pod watching to this namespace (namespace-scoped RBAC) # Restrict pod watching to this namespace (namespace-scoped RBAC)
- name: RESTRICTED_NAMESPACE - name: RESTRICTED_NAMESPACE
...@@ -90,34 +88,11 @@ spec: ...@@ -90,34 +88,11 @@ spec:
fieldRef: fieldRef:
fieldPath: metadata.namespace fieldPath: metadata.namespace
{{- end }} {{- end }}
# Checkpoint storage directory
- name: CHECKPOINT_DIR
value: {{ .Values.storage.pvc.basePath | quote }}
# Host proc mount point for CRIU operations
- name: HOST_PROC
value: "/host/proc"
# Containerd socket path
- name: CONTAINERD_SOCKET
value: {{ .Values.daemonset.containerRuntimeSocket }}
{{- if .Values.daemonset.criu.cudaPluginDir }}
# CUDA plugin directory for GPU checkpoint support
- name: CUDA_PLUGIN_DIR
value: {{ .Values.daemonset.criu.cudaPluginDir | quote }}
{{- end }}
{{- if .Values.daemonset.criu.ghostLimit }}
# CRIU ghost file size limit in bytes
- name: CRIU_GHOST_LIMIT
value: {{ .Values.daemonset.criu.ghostLimit | quote }}
{{- end }}
{{- if .Values.daemonset.criu.timeout }}
# CRIU timeout in seconds
- name: CRIU_TIMEOUT
value: {{ .Values.daemonset.criu.timeout | quote }}
{{- end }}
# Storage type (for future S3/OCI support)
- name: DYN_CHECKPOINT_STORAGE_TYPE
value: {{ .Values.storage.type | quote }}
volumeMounts: volumeMounts:
# Mount configuration ConfigMap
- name: config
mountPath: /etc/chrek
readOnly: true
{{- if eq .Values.storage.type "pvc" }} {{- if eq .Values.storage.type "pvc" }}
# Mount the checkpoint PVC (only for PVC storage type) # Mount the checkpoint PVC (only for PVC storage type)
- name: checkpoints - name: checkpoints
...@@ -155,6 +130,10 @@ spec: ...@@ -155,6 +130,10 @@ spec:
resources: resources:
{{- toYaml .Values.daemonset.resources | nindent 12 }} {{- toYaml .Values.daemonset.resources | nindent 12 }}
volumes: volumes:
# Configuration ConfigMap
- name: config
configMap:
name: {{ include "chrek.fullname" . }}-config
{{- if .Values.seccomp.deploy }} {{- if .Values.seccomp.deploy }}
# Seccomp profile ConfigMap (used by initContainer) # Seccomp profile ConfigMap (used by initContainer)
- name: seccomp-profiles - name: seccomp-profiles
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment