"lib/llm/vscode:/vscode.git/clone" did not exist on "fc161924b9e464ca921946fc07223e1e89cd39f5"
Unverified Commit f3aa1e01 authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

feat: introducing ChReK (Checkpoint Restore in K8s) (#4978)


Signed-off-by: default avatarJulien Mancuso <jmancuso@nvidia.com>
parent 44986bf5
// criu provides CRIU-specific configuration and utilities for restore operations.
package restore
import (
"os"
criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
// CRIURestoreConfig holds configuration for CRIU restore operations.
// Most options are always-on with safe defaults for K8s environments.
//
// The struct is consumed by BuildRestoreCRIUOpts, which copies each field
// into the corresponding criurpc.CriuOpts field.
type CRIURestoreConfig struct {
// ImageDirFD is passed to CRIU as ImagesDirFd.
ImageDirFD int32
// RootPath is passed to CRIU as Root.
RootPath string
// LogLevel is passed to CRIU as LogLevel.
LogLevel int32
// LogFile is passed to CRIU as LogFile.
LogFile string
// WorkDirFD is passed as WorkDirFd only when it is >= 0.
// NOTE(review): the zero value (0) therefore counts as "set"; callers
// that do not want a work dir must assign -1 explicitly.
WorkDirFD int32
// NetNsFD is registered as an inherited FD under the key "extNetNs" only
// when it is >= 0. The same zero-value caveat as WorkDirFD applies.
NetNsFD int32
// ExtMountMaps is passed to CRIU as ExtMnt.
ExtMountMaps []*criurpc.ExtMountMap
}
// OpenImageDir opens the checkpoint image directory for use by CRIU.
// It delegates to common.OpenDirForCRIU and returns the open file, its
// descriptor number and any error. The caller owns the returned file and
// must close it once the restore is finished.
func OpenImageDir(checkpointPath string) (*os.File, int32, error) {
	dir, fd, err := common.OpenDirForCRIU(checkpointPath)
	return dir, fd, err
}
// OpenNetworkNamespace opens the network namespace at nsPath that the
// restored process should join. It returns the open file, its descriptor
// number and any error. The caller owns the returned file and must close it.
//
// NOTE(review): this reuses common.OpenDirForCRIU although nsPath names a
// namespace file, not a directory; confirm the helper does not insist on a
// directory when opening.
func OpenNetworkNamespace(nsPath string) (*os.File, int32, error) {
	ns, fd, err := common.OpenDirForCRIU(nsPath)
	return ns, fd, err
}
// OpenWorkDir prepares workDir as CRIU's working directory.
//
// The directory is created if missing, opened, and its close-on-exec flag
// is cleared so the descriptor survives into the CRIU process. On success
// the open file and its descriptor number are returned; the caller must
// close the file. If workDir is empty, or any step fails (a warning is
// logged), it returns (nil, -1) so CRIU falls back to its default.
func OpenWorkDir(workDir string, log *logrus.Entry) (*os.File, int32) {
	if workDir == "" {
		return nil, -1
	}

	if mkErr := os.MkdirAll(workDir, 0755); mkErr != nil {
		log.WithError(mkErr).Warn("Failed to create CRIU work directory, using default")
		return nil, -1
	}

	dir, openErr := os.Open(workDir)
	if openErr != nil {
		log.WithError(openErr).Warn("Failed to open CRIU work directory, using default")
		return nil, -1
	}

	// F_SETFD with 0 drops FD_CLOEXEC so the descriptor is inherited.
	fd := dir.Fd()
	if _, flagErr := unix.FcntlInt(fd, unix.F_SETFD, 0); flagErr != nil {
		log.WithError(flagErr).Warn("Failed to clear CLOEXEC on work dir")
		dir.Close()
		return nil, -1
	}

	log.WithField("path", workDir).Info("Using custom CRIU work directory")
	return dir, int32(fd)
}
// BuildRestoreCRIUOpts translates a CRIURestoreConfig into the CriuOpts
// message sent to CRIU for a restore.
//
// Besides the values copied from cfg, the following options are always
// enabled for Kubernetes workloads:
//   - ShellJob: containers are often session leaders
//   - TcpClose: pod IPs change on restore/migration
//   - FileLocks: applications use file locks
//   - ExtUnixSk: containers have external Unix sockets
//   - ManageCgroups with mode IGNORE: cgroups are left to Kubernetes
//
// NetNsFD and WorkDirFD are optional and only applied when they are >= 0.
func BuildRestoreCRIUOpts(cfg CRIURestoreConfig) *criurpc.CriuOpts {
	ignoreCgroups := criurpc.CriuCgMode_IGNORE

	opts := new(criurpc.CriuOpts)

	// Values taken directly from the config.
	opts.ImagesDirFd = proto.Int32(cfg.ImageDirFD)
	opts.LogLevel = proto.Int32(cfg.LogLevel)
	opts.LogFile = proto.String(cfg.LogFile)
	opts.Root = proto.String(cfg.RootPath) // current container's root
	opts.ExtMnt = cfg.ExtMountMaps

	// Restore topology and mount namespace compatibility for
	// cross-container restore.
	opts.RstSibling = proto.Bool(true)
	opts.MntnsCompatMode = proto.Bool(true)

	// Always-on switches for K8s environments.
	opts.ShellJob = proto.Bool(true)
	opts.TcpClose = proto.Bool(true)
	opts.FileLocks = proto.Bool(true)
	opts.ExtUnixSk = proto.Bool(true)

	// Cgroups: enabled but in IGNORE mode to avoid conflicts.
	opts.ManageCgroups = proto.Bool(true)
	opts.ManageCgroupsMode = &ignoreCgroups

	// Device and inode handling.
	opts.EvasiveDevices = proto.Bool(true)
	opts.ForceIrmap = proto.Bool(true)

	// Optional: hand the target network namespace to CRIU as an inherited FD.
	if cfg.NetNsFD >= 0 {
		netNs := &criurpc.InheritFd{
			Key: proto.String("extNetNs"),
			Fd:  proto.Int32(cfg.NetNsFD),
		}
		opts.InheritFd = []*criurpc.InheritFd{netNs}
	}

	// Optional: custom work directory.
	if cfg.WorkDirFD >= 0 {
		opts.WorkDirFd = proto.Int32(cfg.WorkDirFD)
	}

	return opts
}
package restore
import (
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"github.com/sirupsen/logrus"
)
// File names looked up inside a checkpoint directory by ApplyRootfsDiff and
// ApplyDeletedFiles.
const (
// RootfsDiffFilename is the name of the rootfs diff tar file
// that ApplyRootfsDiff extracts into the target root.
RootfsDiffFilename = "rootfs-diff.tar"
// DeletedFilesFilename is the name of the deleted files JSON
// (a JSON array of paths) consumed by ApplyDeletedFiles.
DeletedFilesFilename = "deleted-files.json"
)
// ApplyRootfsDiff unpacks <checkpointPath>/rootfs-diff.tar into targetRoot,
// re-creating filesystem changes made by the original container.
//
// A missing archive is not an error: the function logs and returns nil.
// Extraction shells out to tar with --keep-old-files and excludes paths that
// the container runtime typically mounts itself. tar exit status 1 is
// treated as success (files skipped); any other failure is returned with the
// combined tar output attached.
//
// NOTE(review): GNU tar documents --keep-old-files as treating existing
// files as errors; confirm which exit status the tar binary in the image
// actually reports for that case.
func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error {
	archive := filepath.Join(checkpointPath, RootfsDiffFilename)

	if _, statErr := os.Stat(archive); os.IsNotExist(statErr) {
		log.Info("No rootfs-diff.tar found, skipping filesystem restoration")
		return nil
	}

	log.WithField("path", archive).Info("Applying rootfs diff")

	tarArgs := []string{
		// Don't overwrite existing files (they may already be mounted).
		"--keep-old-files",
		// Paths typically mounted read-only by the container runtime.
		"--exclude=./run/secrets",
		"--exclude=./etc/resolv.conf",
		"--exclude=./etc/hostname",
		"--exclude=./etc/hosts",
		"-C", targetRoot,
		"-xf", archive,
	}

	out, runErr := exec.Command("tar", tarArgs...).CombinedOutput()
	if runErr == nil {
		log.Info("Rootfs diff applied successfully")
		return nil
	}

	// Exit status 1 is expected when entries were skipped (read-only
	// mounts, existing files) and is not fatal here.
	if exitErr, isExit := runErr.(*exec.ExitError); isExit && exitErr.ExitCode() == 1 {
		log.WithField("output", string(out)).Info("Rootfs diff applied (some files may have been skipped due to mounts)")
		return nil
	}

	return fmt.Errorf("failed to extract rootfs diff: %w (output: %s)", runErr, string(out))
}
// ApplyDeletedFiles removes files that were deleted in the original container.
// These are tracked via overlay whiteout markers (.wh.<filename>) and stored
// as a JSON array of paths in <checkpointPath>/deleted-files.json.
//
// A missing list is not an error. Every entry is resolved relative to
// targetRoot; entries are confined to targetRoot ("..") cannot climb out) and
// an entry that resolves to targetRoot itself is refused. Individual removal
// failures are logged at debug level and do not abort the run.
//
// Returns an error only when the list exists but cannot be read or parsed.
func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) error {
	deletedFilesPath := filepath.Join(checkpointPath, DeletedFilesFilename)

	data, err := os.ReadFile(deletedFilesPath)
	if os.IsNotExist(err) {
		log.Debug("No deleted-files.json found")
		return nil
	}
	if err != nil {
		return fmt.Errorf("failed to read deleted files list: %w", err)
	}

	log.Info("Applying deleted files from whiteout list")

	// Parse JSON array of deleted file paths.
	var deletedFiles []string
	if err := json.Unmarshal(data, &deletedFiles); err != nil {
		return fmt.Errorf("failed to parse deleted files JSON: %w", err)
	}

	sep := string(filepath.Separator)
	deletedCount := 0
	for _, filePath := range deletedFiles {
		if filePath == "" {
			continue
		}

		// Anchor the entry at "/" before cleaning: Clean drops any ".."
		// that would climb above the root, so the joined path can never
		// leave targetRoot.
		rel := filepath.Clean(sep + filePath)
		if rel == sep {
			// "/", "." or ".." would make RemoveAll wipe targetRoot itself.
			log.WithField("path", filePath).Warn("Refusing to delete restore root from deleted files list")
			continue
		}
		targetPath := filepath.Join(targetRoot, rel)

		// Lstat, not Stat: a dangling symlink must still be removed, and
		// Stat would report it as non-existent.
		if _, err := os.Lstat(targetPath); os.IsNotExist(err) {
			continue
		}

		if err := os.RemoveAll(targetPath); err != nil {
			log.WithError(err).WithField("path", targetPath).Debug("Could not delete file")
			continue
		}
		deletedCount++
	}

	log.WithField("count", deletedCount).Info("Deleted files applied")
	return nil
}
// CheckpointFilesExist reports whether checkpointPath looks like a CRIU
// checkpoint, i.e. whether at least one file matching core-*.img is present.
// A glob error is treated the same as "no match" and yields false.
func CheckpointFilesExist(checkpointPath string) bool {
	pattern := filepath.Join(checkpointPath, "core-*.img")
	coreImages, globErr := filepath.Glob(pattern)
	return globErr == nil && len(coreImages) > 0
}
package restore
import (
"fmt"
criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
// GenerateExtMountMaps builds the external mount mappings handed to CRIU
// for a restore.
//
// The result contains, in order and without duplicate keys:
//  1. the root mapping "/" -> ".",
//  2. an identity mapping for every mount point listed in /proc/1/mountinfo
//     (the restore container's mounts),
//  3. an identity mapping for each masked path: meta.MaskedPaths when meta
//     is non-nil and the list is non-empty, otherwise
//     common.DefaultMaskedPaths(),
//  4. an identity mapping for each of meta.ReadonlyPaths when meta is
//     non-nil.
//
// An error is returned only when mountinfo cannot be parsed.
func GenerateExtMountMaps(meta *common.CheckpointMetadata) ([]*criurpc.ExtMountMap, error) {
	var result []*criurpc.ExtMountMap
	seen := make(map[string]bool)

	// register appends key -> val unless key was already mapped.
	register := func(key, val string) {
		if seen[key] {
			return
		}
		result = append(result, &criurpc.ExtMountMap{
			Key: proto.String(key),
			Val: proto.String(val),
		})
		seen[key] = true
	}

	// Root filesystem mapping always comes first.
	register("/", ".")

	// All current mount points of the restore container.
	mounts, err := common.GetMountPointPaths("/proc/1/mountinfo")
	if err != nil {
		return nil, fmt.Errorf("failed to parse mountinfo: %w", err)
	}
	for _, mp := range mounts {
		register(mp, mp)
	}

	// Masked paths: prefer the OCI-derived list from the checkpoint
	// metadata, falling back to the defaults for backwards compatibility.
	masked := common.DefaultMaskedPaths()
	if meta != nil && len(meta.MaskedPaths) > 0 {
		masked = meta.MaskedPaths
	}
	for _, mp := range masked {
		register(mp, mp)
	}

	// Read-only paths exist only in metadata.
	if meta != nil {
		for _, ro := range meta.ReadonlyPaths {
			register(ro, ro)
		}
	}

	return result, nil
}
// AddExtMountMap returns a new ExtMountMap entry mapping key to val.
// Despite the name it does not append to any collection; the caller decides
// where the entry goes.
func AddExtMountMap(key, val string) *criurpc.ExtMountMap {
	entry := new(criurpc.ExtMountMap)
	entry.Key = proto.String(key)
	entry.Val = proto.String(val)
	return entry
}
package restore
import (
criu "github.com/checkpoint-restore/go-criu/v7"
"github.com/sirupsen/logrus"
)
// RestoreNotify implements criu.Notify for restore callbacks.
// It captures the restored process PID from the PostRestore callback.
//
// Only PreRestore, PostRestore and PostResume are overridden; every other
// callback falls through to the embedded criu.NoNotify.
type RestoreNotify struct {
criu.NoNotify // Embed no-op implementation for all methods
// RestoredPID is the PID of the restored process, set by PostRestore callback
// It stays at its zero value until PostRestore has been invoked.
RestoredPID int32
// log is the logger for notification events
// It may be nil, in which case the callbacks do not log.
log *logrus.Entry
}
// NewRestoreNotify returns a RestoreNotify that reports callbacks to log.
// A nil log is allowed and silences the callbacks.
func NewRestoreNotify(log *logrus.Entry) *RestoreNotify {
	handler := new(RestoreNotify)
	handler.log = log
	return handler
}
// PreRestore is the callback invoked before CRIU starts restoring.
// It only emits a debug log line (when a logger is set) and never fails.
func (n *RestoreNotify) PreRestore() error {
	if n.log == nil {
		return nil
	}
	n.log.Debug("CRIU pre-restore notification")
	return nil
}
// PostRestore is the callback invoked once CRIU has completed the restore.
// It records pid, the PID of the restored process, in RestoredPID and logs
// it when a logger is set. It never fails.
func (n *RestoreNotify) PostRestore(pid int32) error {
	n.RestoredPID = pid
	if n.log == nil {
		return nil
	}
	n.log.WithField("pid", pid).Info("CRIU post-restore notification: process restored")
	return nil
}
// PostResume is the callback invoked after the restored process has resumed
// execution. It only emits a debug log line (when a logger is set) and
// never fails.
func (n *RestoreNotify) PostResume() error {
	if n.log == nil {
		return nil
	}
	n.log.Debug("CRIU post-resume notification")
	return nil
}
// Package restore provides CRIU restore operations for self-restoring placeholder containers.
package restore
import (
	"context"
	"os"
	"strconv"
	"strings"
	"time"

	criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
	"github.com/sirupsen/logrus"

	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
// Config holds the configuration for the restore entrypoint.
// These values are typically set via environment variables.
// ConfigFromEnv is the constructor that reads them; boolean fields are true
// only when the variable is exactly "1".
type Config struct {
// CheckpointPath is the base directory containing checkpoints (default: /checkpoints)
// Env: DYN_CHECKPOINT_PATH
CheckpointPath string
// CheckpointHash is the ID/hash of the checkpoint to restore
// Env: DYN_CHECKPOINT_HASH
CheckpointHash string
// RestoreTrigger is the path to the trigger file that signals restore should start
// Env: RESTORE_TRIGGER (default: /tmp/restore-trigger)
RestoreTrigger string
// WaitForCheckpoint indicates whether to wait for a checkpoint to appear
// Env: WAIT_FOR_CHECKPOINT
WaitForCheckpoint bool
// WaitTimeout is the maximum time to wait for a checkpoint to become available
// Env: RESTORE_WAIT_TIMEOUT, integer seconds (default: 300)
WaitTimeout time.Duration
// CRIULogLevel is the CRIU verbosity level (0-4, default: 4)
// Env: CRIU_LOG_LEVEL
CRIULogLevel int32
// DefaultCmd is the command to run if no checkpoint is available
// Env: DEFAULT_CMD
DefaultCmd string
// Debug enables debug logging
// Env: DEBUG
Debug bool
// EmbeddedCheckpointPath is the path to an embedded checkpoint within the image
// When set, the checkpoint data is baked into the container image itself
// Env: EMBEDDED_CHECKPOINT_PATH (default: DefaultEmbeddedCheckpointPath)
EmbeddedCheckpointPath string
// SkipInFlightConnections skips in-flight TCP connections during restore
// Env: CRIU_SKIP_IN_FLIGHT
SkipInFlightConnections bool
// AutoDedup enables auto-deduplication of memory pages
// Env: CRIU_AUTO_DEDUP
AutoDedup bool
// LazyPages enables lazy page migration (experimental)
// Env: CRIU_LAZY_PAGES
LazyPages bool
// CRIUWorkDir is an alternative work directory for CRIU (instead of /tmp)
// Useful when /tmp has mount issues
// Env: CRIU_WORK_DIR (default: empty)
CRIUWorkDir string
// CUDAPluginDir is the path to CRIU CUDA plugin directory (e.g., /usr/local/lib/criu)
// When set, a CRIU config file is created with libdir for CUDA plugin discovery during restore.
// Env: CUDA_PLUGIN_DIR
CUDAPluginDir string
// CRIUTimeout is the CRIU timeout in seconds (required for CUDA restores)
// Env: CRIU_TIMEOUT (default: 0)
CRIUTimeout uint32
// RestoreMarkerFile is the path to a marker file created before CRIU restore.
// The restored process can check for this file to detect it was restored.
// Env: DYN_RESTORE_MARKER_FILE (default: /tmp/dynamo-restored)
RestoreMarkerFile string
}
// DefaultEmbeddedCheckpointPath is the default path for embedded checkpoints
// ConfigFromEnv uses it when EMBEDDED_CHECKPOINT_PATH is unset or empty.
const DefaultEmbeddedCheckpointPath = "/embedded-checkpoint"
// ConfigFromEnv creates a Config from environment variables.
//
// String settings fall back to their documented defaults when the variable
// is unset or empty. Boolean settings are true only when the variable is
// exactly "1". Numeric settings fall back to their default when the value
// does not parse. A negative CRIU_TIMEOUT is treated as 0 (no timeout)
// instead of being converted to a huge unsigned value.
func ConfigFromEnv() *Config {
	// Converting a negative int32 to uint32 wraps around (e.g. -1 becomes
	// 4294967295 seconds), so clamp before the conversion.
	criuTimeout := parseIntOrDefault("CRIU_TIMEOUT", 0)
	if criuTimeout < 0 {
		criuTimeout = 0
	}

	cfg := &Config{
		CheckpointPath:          getEnvOrDefault("DYN_CHECKPOINT_PATH", "/checkpoints"),
		CheckpointHash:          os.Getenv("DYN_CHECKPOINT_HASH"),
		RestoreTrigger:          getEnvOrDefault("RESTORE_TRIGGER", "/tmp/restore-trigger"),
		WaitForCheckpoint:       os.Getenv("WAIT_FOR_CHECKPOINT") == "1",
		WaitTimeout:             parseDurationOrDefault("RESTORE_WAIT_TIMEOUT", 300*time.Second),
		CRIULogLevel:            parseIntOrDefault("CRIU_LOG_LEVEL", 4),
		DefaultCmd:              os.Getenv("DEFAULT_CMD"),
		Debug:                   os.Getenv("DEBUG") == "1",
		EmbeddedCheckpointPath:  getEnvOrDefault("EMBEDDED_CHECKPOINT_PATH", DefaultEmbeddedCheckpointPath),
		SkipInFlightConnections: os.Getenv("CRIU_SKIP_IN_FLIGHT") == "1",
		AutoDedup:               os.Getenv("CRIU_AUTO_DEDUP") == "1",
		LazyPages:               os.Getenv("CRIU_LAZY_PAGES") == "1",
		CRIUWorkDir:             getEnvOrDefault("CRIU_WORK_DIR", ""),
		CUDAPluginDir:           os.Getenv("CUDA_PLUGIN_DIR"), // For CUDA plugin discovery during restore
		CRIUTimeout:             uint32(criuTimeout),
		RestoreMarkerFile:       getEnvOrDefault("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored"),
	}
	return cfg
}
// RestoreOptions holds the options for a CRIU restore operation.
// Most CRIU options are hardcoded with safe K8s defaults.
// Use DefaultRestoreOptions or LoadRestoreOptions to obtain a populated value.
type RestoreOptions struct {
// CheckpointPath is the path to the checkpoint directory
CheckpointPath string
// RootPath is the root filesystem path for restore (typically "/")
RootPath string
// PidFile is the path where CRIU writes the restored process PID
// DefaultRestoreOptions sets it to "/tmp/restored.pid".
PidFile string
// LogFile is the name of the CRIU restore log file
// DefaultRestoreOptions sets it to "restore.log".
LogFile string
// LogLevel is the CRIU logging verbosity (0-4)
// DefaultRestoreOptions sets it to 4.
LogLevel int32
// ExtMountMaps contains external mount mappings for CRIU
// It is left nil by DefaultRestoreOptions; LoadRestoreOptions fills it
// when checkpoint metadata can be loaded.
ExtMountMaps []*criurpc.ExtMountMap
// WorkDir is an alternative work directory for CRIU (instead of /tmp)
WorkDir string
// LibDir is the path to CRIU plugin directory (e.g., /usr/local/lib/criu)
// When set, a CRIU config file is created with libdir for CUDA plugin discovery.
LibDir string
// Timeout is the CRIU timeout in seconds (required for CUDA restores)
Timeout uint32
}
// DefaultRestoreOptions returns RestoreOptions for checkpointPath with the
// standard defaults: root "/", PID file "/tmp/restored.pid", log file
// "restore.log" and log level 4. All other fields keep their zero value.
func DefaultRestoreOptions(checkpointPath string) *RestoreOptions {
	opts := new(RestoreOptions)
	opts.CheckpointPath = checkpointPath
	opts.RootPath = "/"
	opts.PidFile = "/tmp/restored.pid"
	opts.LogFile = "restore.log"
	opts.LogLevel = 4
	return opts
}
// LoadRestoreOptions returns the default RestoreOptions for checkpointPath
// with LogLevel set to logLevel and, when possible, ExtMountMaps
// pre-generated from the checkpoint's metadata (OCI-derived masked and
// read-only paths).
//
// Failures to load metadata or to generate the mount maps are not reported:
// the options are returned with ExtMountMaps left nil so the caller can fall
// back to defaults. The returned error is currently always nil.
func LoadRestoreOptions(checkpointPath string, logLevel int32) (*RestoreOptions, error) {
	opts := DefaultRestoreOptions(checkpointPath)
	opts.LogLevel = logLevel

	// Metadata is only needed for the external mount mappings.
	if meta, metaErr := common.LoadMetadata(checkpointPath); metaErr == nil {
		if extMounts, genErr := GenerateExtMountMaps(meta); genErr == nil {
			opts.ExtMountMaps = extMounts
		}
	}

	return opts, nil
}
// ShouldRestore checks if a restore should be performed.
// Returns the checkpoint path and true if restore should proceed, or
// ("", false) otherwise.
//
// Sources are checked in priority order:
//  0. An embedded checkpoint: cfg.EmbeddedCheckpointPath contains the
//     metadata file.
//  1. cfg.CheckpointHash: <CheckpointPath>/<CheckpointHash>/checkpoint.done
//     exists.
//  2. cfg.RestoreTrigger: the trigger file contains a checkpoint path whose
//     checkpoint.done exists. Surrounding whitespace (such as the trailing
//     newline left by `echo path > file`) is ignored.
//
// IMPORTANT: We check for checkpoint.done marker (not just metadata.json or inventory.img) because
// checkpoint.done is written LAST in the checkpoint process, after rootfs-diff.tar completes.
// Order: metadata.json -> CRIU dump (*.img files) -> rootfs-diff.tar -> checkpoint.done
func ShouldRestore(cfg *Config, log *logrus.Entry) (string, bool) {
	// Method 0: Embedded checkpoint in image (highest priority).
	// This is for self-contained checkpoint images where data is baked in.
	if cfg.EmbeddedCheckpointPath != "" {
		metadataPath := cfg.EmbeddedCheckpointPath + "/" + common.MetadataFilename
		if _, err := os.Stat(metadataPath); err == nil {
			log.WithField("path", cfg.EmbeddedCheckpointPath).Info("Embedded checkpoint found in image")
			return cfg.EmbeddedCheckpointPath, true
		}
	}

	// Method 1: DYN_CHECKPOINT_HASH is set and checkpoint is fully complete.
	if cfg.CheckpointHash != "" {
		checkpointPath := cfg.CheckpointPath + "/" + cfg.CheckpointHash

		// checkpoint.done is written LAST, after rootfs-diff.tar completes.
		donePath := checkpointPath + "/checkpoint.done"
		if _, err := os.Stat(donePath); err == nil {
			log.WithField("path", checkpointPath).Info("Checkpoint found (checkpoint.done marker present)")
			return checkpointPath, true
		}

		// Metadata without the done marker means the checkpoint may still
		// be in progress: warn, but do not restore yet.
		metadataPath := checkpointPath + "/" + common.MetadataFilename
		if _, err := os.Stat(metadataPath); err == nil {
			log.WithFields(logrus.Fields{
				"path":    checkpointPath,
				"warning": "checkpoint.done marker not found, checkpoint may be incomplete",
			}).Warn("Checkpoint metadata found but checkpoint.done missing - checkpoint may still be in progress")
		}
	}

	// Method 2: Restore trigger file exists with checkpoint path.
	if cfg.RestoreTrigger != "" {
		data, err := os.ReadFile(cfg.RestoreTrigger)
		if err == nil {
			// Trim whitespace: an untrimmed trailing newline would become
			// part of the path and the done marker would never be found.
			checkpointPath := strings.TrimSpace(string(data))
			if checkpointPath != "" {
				donePath := checkpointPath + "/checkpoint.done"
				if _, err := os.Stat(donePath); err == nil {
					log.WithField("path", checkpointPath).Info("Restore triggered via file (checkpoint.done marker present)")
					return checkpointPath, true
				}
			}
		}
	}

	return "", false
}
// WaitForCheckpoint polls ShouldRestore once per second until a checkpoint
// is ready.
//
// It returns the checkpoint path on success, ctx.Err() when ctx is
// cancelled, or context.DeadlineExceeded once cfg.WaitTimeout has elapsed.
// The first poll happens after the first tick, and the deadline is
// evaluated after each poll. A progress line is logged roughly every 30
// seconds.
func WaitForCheckpoint(ctx context.Context, cfg *Config, log *logrus.Entry) (string, error) {
	log.WithField("timeout", cfg.WaitTimeout).Info("Waiting for checkpoint")

	started := time.Now()
	deadline := started.Add(cfg.WaitTimeout)

	poll := time.NewTicker(time.Second)
	defer poll.Stop()

	lastReport := time.Now()
	for {
		select {
		case <-ctx.Done():
			return "", ctx.Err()
		case <-poll.C:
		}

		if path, ready := ShouldRestore(cfg, log); ready {
			return path, nil
		}

		// Periodic progress report.
		if time.Since(lastReport) >= 30*time.Second {
			log.WithField("elapsed", time.Since(started)).Info("Still waiting for checkpoint...")
			lastReport = time.Now()
		}

		if time.Now().After(deadline) {
			return "", context.DeadlineExceeded
		}
	}
}
// Helper functions

// getEnvOrDefault returns the value of the environment variable key, or
// defaultValue when the variable is unset or empty.
func getEnvOrDefault(key, defaultValue string) string {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}
	return value
}
// parseDurationOrDefault reads a duration from the environment variable key.
//
// Two formats are accepted: a plain integer, interpreted as seconds (e.g.
// "300"), or a Go duration string as understood by time.ParseDuration
// (e.g. "5m", "1h30m"). defaultValue is returned when the variable is unset,
// empty, or matches neither format.
func parseDurationOrDefault(key string, defaultValue time.Duration) time.Duration {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}

	// Plain integers keep their historical meaning: whole seconds.
	if seconds, err := strconv.Atoi(value); err == nil {
		return time.Duration(seconds) * time.Second
	}

	// Otherwise accept an explicit unit instead of silently ignoring it.
	if d, err := time.ParseDuration(value); err == nil {
		return d
	}

	return defaultValue
}
// parseIntOrDefault reads a base-10 int32 from the environment variable key.
// defaultValue is returned when the variable is unset, empty, not a number,
// or outside the int32 range (instead of being silently truncated).
func parseIntOrDefault(key string, defaultValue int32) int32 {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}

	// bitSize 32 makes ParseInt reject values that do not fit in an int32.
	parsed, err := strconv.ParseInt(value, 10, 32)
	if err != nil {
		return defaultValue
	}
	return int32(parsed)
}
package restore
import (
"fmt"
"io"
"os"
"os/exec"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
"github.com/sirupsen/logrus"
)
// MonitorProcess monitors the restored process and returns its exit code.
// It blocks until the process exits, polling once per second. It does not
// forward stdout/stderr; for output forwarding, use ForwardProcessOutput.
//
// When the process is a child of the caller, its exit status is collected
// with a non-blocking wait4 and returned (128+signal when it was killed by a
// signal). Otherwise liveness is probed with signal 0 and the exit code is
// whatever getExitCode can determine. Returns 1 if the process handle cannot
// be obtained.
func MonitorProcess(pid int, log *logrus.Entry) int {
	log.WithField("pid", pid).Info("Monitoring restored process")

	for {
		// Reap first. An exited child stays a zombie until it is waited
		// for, and signal 0 keeps succeeding on a zombie, so probing alone
		// would never notice the exit.
		if pid > 0 {
			var wstatus syscall.WaitStatus
			wpid, err := syscall.Wait4(pid, &wstatus, syscall.WNOHANG, nil)
			if err == nil && wpid == pid {
				exitCode := 0
				switch {
				case wstatus.Exited():
					exitCode = wstatus.ExitStatus()
				case wstatus.Signaled():
					exitCode = 128 + int(wstatus.Signal())
				}
				log.WithField("pid", pid).Info("Restored process exited")
				log.WithField("exit_code", exitCode).Info("Restored process exit status")
				return exitCode
			}
		}

		// Not our child (or still running): check existence with signal 0.
		proc, err := os.FindProcess(pid)
		if err != nil {
			log.WithError(err).Error("Failed to find process")
			return 1
		}

		if err := proc.Signal(syscall.Signal(0)); err != nil {
			// Process has exited.
			log.WithField("pid", pid).Info("Restored process exited")

			exitCode := getExitCode(pid)
			log.WithField("exit_code", exitCode).Info("Restored process exit status")
			return exitCode
		}

		time.Sleep(time.Second)
	}
}
// ForwardProcessOutput copies the restored process's stdout and stderr,
// opened through /proc/<pid>/fd/1 and /proc/<pid>/fd/2, to this process's
// own stdout and stderr so they show up in kubectl logs.
//
// It blocks until the process exits and returns its exit code. The two
// forwarding goroutines are told to stop afterwards and are given 100ms to
// flush; they are not joined.
func ForwardProcessOutput(pid int, log *logrus.Entry) int {
	log.WithField("pid", pid).Info("Forwarding output from restored process")

	// Closed once the process is gone, to stop the copy goroutines.
	stop := make(chan struct{})

	go forwardFD(fmt.Sprintf("/proc/%d/fd/1", pid), os.Stdout, "stdout", log, stop)
	go forwardFD(fmt.Sprintf("/proc/%d/fd/2", pid), os.Stderr, "stderr", log, stop)

	code := waitForProcess(pid, log)

	close(stop)

	// Grace period for the goroutines to flush remaining output.
	time.Sleep(100 * time.Millisecond)

	return code
}
// forwardFD copies data from a file descriptor path to a writer.
// It handles the case where the FD may not be readable.
//
// fdPath is opened for reading and copied to dst in 4 KiB chunks until done
// is closed, EOF is reached, or a non-timeout read error occurs. name is
// used only for log fields. Failures to open or stat fdPath are logged at
// debug level and end the function without copying anything.
func forwardFD(fdPath string, dst io.Writer, name string, log *logrus.Entry, done <-chan struct{}) {
// Try to open the FD path
src, err := os.Open(fdPath)
if err != nil {
log.WithError(err).WithField("path", fdPath).Debug("Could not open process FD for forwarding")
return
}
defer src.Close()
// Check what kind of file this is
// (the mode is only reported in the debug log below)
stat, err := src.Stat()
if err != nil {
log.WithError(err).WithField("path", fdPath).Debug("Could not stat process FD")
return
}
log.WithFields(logrus.Fields{
"name": name,
"mode": stat.Mode().String(),
"path": fdPath,
}).Debug("Forwarding process output")
// Copy data until done or EOF
buf := make([]byte, 4096)
for {
select {
case <-done:
return
default:
// Set a read deadline to allow checking done channel periodically
// NOTE(review): the SetReadDeadline error is ignored; if the file
// does not support deadlines, Read may block and done will not be
// observed until data arrives — confirm for the FD types expected here.
src.SetReadDeadline(time.Now().Add(100 * time.Millisecond))
n, err := src.Read(buf)
if n > 0 {
// NOTE(review): write errors on dst are ignored.
dst.Write(buf[:n])
}
if err != nil {
// A deadline expiry is expected: loop to re-check done.
if os.IsTimeout(err) {
continue
}
// EOF ends forwarding silently; anything else is logged first.
if err != io.EOF {
log.WithError(err).WithField("name", name).Debug("Error reading from process FD")
}
return
}
}
}
}
// waitForProcess waits for a process to exit and returns its exit code,
// polling every 100ms.
//
// When the process is a child of the caller, its exit status is collected
// with a non-blocking wait4 and returned (128+signal when it was killed by a
// signal). Otherwise liveness is probed with signal 0 and the exit code is
// whatever getExitCode can determine. Returns 1 if the process handle cannot
// be obtained.
func waitForProcess(pid int, log *logrus.Entry) int {
	for {
		// Reap first. An exited child stays a zombie until it is waited
		// for, and signal 0 keeps succeeding on a zombie, so probing alone
		// would never notice the exit.
		if pid > 0 {
			var wstatus syscall.WaitStatus
			wpid, err := syscall.Wait4(pid, &wstatus, syscall.WNOHANG, nil)
			if err == nil && wpid == pid {
				exitCode := 0
				switch {
				case wstatus.Exited():
					exitCode = wstatus.ExitStatus()
				case wstatus.Signaled():
					exitCode = 128 + int(wstatus.Signal())
				}
				log.WithField("pid", pid).Info("Restored process exited")
				log.WithField("exit_code", exitCode).Info("Restored process exit status")
				return exitCode
			}
		}

		// Not our child (or still running): check existence with signal 0.
		proc, err := os.FindProcess(pid)
		if err != nil {
			log.WithError(err).Error("Failed to find process")
			return 1
		}

		if err := proc.Signal(syscall.Signal(0)); err != nil {
			// Process has exited.
			log.WithField("pid", pid).Info("Restored process exited")

			exitCode := getExitCode(pid)
			log.WithField("exit_code", exitCode).Info("Restored process exit status")
			return exitCode
		}

		time.Sleep(100 * time.Millisecond)
	}
}
// getExitCode attempts to get the exit code of a process without blocking.
//
// It only yields a real status when pid is an exited, not yet reaped child
// of the caller: the process's exit status, or 128+signal when it was
// terminated by a signal. In every other case (not a child, still running,
// already reaped, invalid pid) it returns 0.
func getExitCode(pid int) int {
	// wait4 gives pid <= 0 a special meaning (any child / a process group);
	// never reap unrelated children by accident.
	if pid <= 0 {
		return 0
	}

	var wstatus syscall.WaitStatus
	wpid, err := syscall.Wait4(pid, &wstatus, syscall.WNOHANG, nil)
	if err != nil || wpid != pid {
		// Not our child, or nothing to collect: assume clean exit.
		return 0
	}

	switch {
	case wstatus.Exited():
		return wstatus.ExitStatus()
	case wstatus.Signaled():
		return 128 + int(wstatus.Signal())
	default:
		return 0
	}
}
// SetupSignalForwarding sets up signal forwarding to the restored process.
//
// SIGTERM, SIGINT and SIGQUIT received by this process are forwarded to pid
// for as long as forwarding is active; every received signal is forwarded,
// not just the first one. Failures to deliver a signal are logged.
//
// Returns a cleanup function that should be called when done. It stops the
// forwarding and must be called at most once.
func SetupSignalForwarding(pid int, log *logrus.Entry) func() {
	// One slot per registered signal: signal.Notify never blocks and drops
	// signals when the channel is full.
	sigChan := make(chan os.Signal, 3)
	signal.Notify(sigChan, syscall.SIGTERM, syscall.SIGINT, syscall.SIGQUIT)

	done := make(chan struct{})

	go func() {
		// Keep forwarding until cleanup: a SIGINT followed by a SIGTERM
		// must both reach the restored process.
		for {
			select {
			case sig := <-sigChan:
				log.WithFields(logrus.Fields{
					"signal": sig,
					"pid":    pid,
				}).Info("Forwarding signal to restored process")

				proc, err := os.FindProcess(pid)
				if err != nil {
					continue
				}
				if err := proc.Signal(sig); err != nil {
					log.WithError(err).WithField("pid", pid).Warn("Failed to forward signal to restored process")
				}
			case <-done:
				return
			}
		}
	}()

	return func() {
		signal.Stop(sigChan)
		close(done)
	}
}
// WaitForPidFile waits for the CRIU PID file to be created and returns the PID.
//
// pidFile is re-read every 100ms until it contains a positive integer
// (surrounding whitespace is ignored) or timeout has elapsed. The file is
// always read at least once, even with a zero timeout, and once more after
// the final sleep, so a PID written just before the deadline is not missed.
// On timeout it returns 0 and an error. The log parameter is currently
// unused.
func WaitForPidFile(pidFile string, timeout time.Duration, log *logrus.Entry) (int, error) {
	deadline := time.Now().Add(timeout)

	for {
		if data, err := os.ReadFile(pidFile); err == nil {
			// An empty or partially written file fails to parse and is
			// simply retried.
			pid, err := strconv.Atoi(strings.TrimSpace(string(data)))
			if err == nil && pid > 0 {
				return pid, nil
			}
		}

		if !time.Now().Before(deadline) {
			return 0, fmt.Errorf("timeout waiting for PID file %s after %v", pidFile, timeout)
		}
		time.Sleep(100 * time.Millisecond)
	}
}
// RunDefault starts the container's regular workload when there is no
// checkpoint to restore. The command is chosen in this order:
//  1. cfg.DefaultCmd, when set;
//  2. /docker-entrypoint.sh with nginx arguments, when that script exists;
//  3. nginx in the foreground, when nginx is on PATH;
//  4. "sleep infinity" as a last resort.
//
// The chosen command is run through execCommand, which replaces the current
// process, so RunDefault only returns when starting the command failed.
func RunDefault(cfg *Config, log *logrus.Entry) error {
	if configured := cfg.DefaultCmd; configured != "" {
		log.WithField("cmd", configured).Info("Running default command")
		return execCommand(configured)
	}

	// Common application entrypoint script.
	_, entrypointErr := os.Stat("/docker-entrypoint.sh")
	if entrypointErr == nil {
		log.Info("Running docker-entrypoint.sh")
		return execCommand("/docker-entrypoint.sh nginx -g 'daemon off;'")
	}

	// Plain nginx binary.
	_, nginxErr := exec.LookPath("nginx")
	if nginxErr == nil {
		log.Info("Running nginx")
		return execCommand("nginx -g 'daemon off;'")
	}

	log.Warn("No default command specified and no known entrypoint found, sleeping")
	return execCommand("sleep infinity")
}
// execCommand executes a command by replacing the current process.
//
// A simple command line (no shell metacharacters) whose executable can be
// found on PATH is split on whitespace and exec'd directly. Everything else
// is handed to "/bin/sh -c": command lines containing quotes, variables,
// pipes, redirections or globs, which whitespace splitting would mangle
// (e.g. "nginx -g 'daemon off;'" must pass "daemon off;" as one argument),
// and commands whose executable cannot be resolved.
//
// On success this function does not return. It returns an error for an
// empty command line or when the exec system call fails.
func execCommand(cmdLine string) error {
	parts := strings.Fields(cmdLine)
	if len(parts) == 0 {
		return fmt.Errorf("empty command")
	}

	// Characters that only a shell interprets correctly.
	const shellMeta = "'\"\\`$|&;<>()*?[]{}~#"

	if !strings.ContainsAny(cmdLine, shellMeta) {
		if path, err := exec.LookPath(parts[0]); err == nil {
			// Replace current process with the command.
			return syscall.Exec(path, parts, os.Environ())
		}
	}

	// Complex command, or executable not found: let the shell handle it.
	return syscall.Exec("/bin/sh", []string{"sh", "-c", cmdLine}, os.Environ())
}
This diff is collapsed.
This diff is collapsed.
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Smart entrypoint wrapper for CRIU checkpoint/restore
# Automatically detects checkpoints and falls back to cold start if not found
#
# Behavior:
# 1. If DYN_CHECKPOINT_HASH is set and checkpoint exists -> restore
# 2. If WAIT_FOR_CHECKPOINT=1 -> wait for checkpoint (restore-entrypoint handles this)
# 3. Otherwise -> execute provided command (cold start)
# Abort on the first failing command.
set -e
# Enable debug output if DEBUG=1
# (set -x traces every command to stderr)
if [ "${DEBUG:-0}" = "1" ]; then
set -x
fi
# Configuration from environment
# CHECKPOINT_PATH: base directory holding checkpoints (default: /checkpoints)
# CHECKPOINT_HASH: checkpoint to restore; empty means none requested
# WAIT_FOR_CHECKPOINT: "1" delegates waiting to restore-entrypoint (default: 0)
CHECKPOINT_PATH="${DYN_CHECKPOINT_PATH:-/checkpoints}"
CHECKPOINT_HASH="${DYN_CHECKPOINT_HASH:-}"
WAIT_FOR_CHECKPOINT="${WAIT_FOR_CHECKPOINT:-0}"
# log MESSAGE...
# Writes the arguments, joined by spaces and prefixed with the script tag,
# as one line to stderr so stdout stays reserved for the workload.
log() {
    printf '%s\n' "[smart-entrypoint] $*" >&2
}
# should_restore_checkpoint
# Decides between restore mode and cold start.
# Returns 0 (restore) when WAIT_FOR_CHECKPOINT=1, or when
# $CHECKPOINT_PATH/$CHECKPOINT_HASH exists and contains checkpoint.done.
# Returns 1 (cold start) otherwise. Sets the globals CHECKPOINT_DIR and
# DONE_MARKER as a side effect when a hash is configured.
should_restore_checkpoint() {
    # Waiting is delegated to restore-entrypoint, which polls for the
    # checkpoint itself.
    if [[ "$WAIT_FOR_CHECKPOINT" == "1" ]]; then
        log "WAIT_FOR_CHECKPOINT=1, will wait for checkpoint via restore-entrypoint"
        return 0
    fi

    # Nothing requested.
    if [[ -z "$CHECKPOINT_HASH" ]]; then
        log "DYN_CHECKPOINT_HASH not set, no checkpoint to restore"
        return 1
    fi

    CHECKPOINT_DIR="$CHECKPOINT_PATH/$CHECKPOINT_HASH"
    if [[ ! -d "$CHECKPOINT_DIR" ]]; then
        log "Checkpoint directory not found: $CHECKPOINT_DIR"
        return 1
    fi

    # checkpoint.done is written LAST in the checkpoint process, which makes
    # it more reliable than inventory.img (created by CRIU) or
    # rootfs-diff.tar (may be mid-write).
    # Order: metadata.json -> CRIU dump (*.img) -> rootfs-diff.tar -> checkpoint.done
    DONE_MARKER="$CHECKPOINT_DIR/checkpoint.done"
    if [[ ! -f "$DONE_MARKER" ]]; then
        log "Checkpoint incomplete - checkpoint.done not found in: $CHECKPOINT_DIR"
        log "Checkpoint may still be in progress..."
        return 1
    fi

    log "Checkpoint found: $CHECKPOINT_HASH (checkpoint.done marker present)"
    return 0
}
# Main logic
# Dispatch: restore mode replaces this shell with /restore-entrypoint,
# cold start replaces it with the command given as arguments.
if should_restore_checkpoint; then
log "=========================================="
log "CHECKPOINT RESTORE MODE"
log "=========================================="
# With WAIT_FOR_CHECKPOINT=1 and no hash set, the two lines below print an
# empty checkpoint name.
log "Checkpoint: $CHECKPOINT_HASH"
log "Location: $CHECKPOINT_PATH/$CHECKPOINT_HASH"
log "Invoking restore-entrypoint..."
log "=========================================="
# Execute restore-entrypoint
# Any args passed to this script are forwarded (though restore-entrypoint ignores them)
exec /restore-entrypoint "$@"
else
log "=========================================="
log "COLD START MODE"
log "=========================================="
# No checkpoint found or not requested - fall back to cold start
if [ $# -eq 0 ]; then
# No args provided - this is likely an error
log "ERROR: No checkpoint to restore and no command provided"
log "Set DYN_CHECKPOINT_HASH to restore a checkpoint, or provide a command to run"
exit 1
fi
log "No checkpoint to restore"
log "Executing command: $*"
log "=========================================="
# Execute the provided command
# ("$@" preserves the original argument boundaries)
exec "$@"
fi
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: v2
name: chrek
description: Checkpoint/Restore infrastructure for Dynamo (PVC + DaemonSet + CRIU Agent)
type: application
version: 0.1.0
appVersion: "1.0"
keywords:
- nvidia
- dynamo
- checkpoint
- criu
- gpu
home: https://github.com/ai-dynamo/dynamo
sources:
- https://github.com/ai-dynamo/dynamo
maintainers:
- name: NVIDIA
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment