Unverified Commit f3aa1e01 authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

feat: introducing ChReK (Checkpoint Restore in K8s) (#4978)


Signed-off-by: default avatarJulien Mancuso <jmancuso@nvidia.com>
parent 44986bf5
// criu provides CRIU-specific configuration and utilities for restore operations.
package restore
import (
"os"
criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
// CRIURestoreConfig holds configuration for CRIU restore operations.
// Most options are always-on with safe defaults for K8s environments.
//
// BuildRestoreCRIUOpts copies these values into a criurpc.CriuOpts message.
type CRIURestoreConfig struct {
// ImageDirFD is the descriptor passed to CRIU as ImagesDirFd.
ImageDirFD int32
// RootPath is passed to CRIU as Root.
RootPath string
// LogLevel is passed to CRIU as LogLevel.
LogLevel int32
// LogFile is passed to CRIU as LogFile.
LogFile string
// WorkDirFD is passed to CRIU as WorkDirFd when it is >= 0; a negative value leaves the option unset.
WorkDirFD int32
// NetNsFD is inherited by CRIU under the key "extNetNs" when it is >= 0; a negative value skips the inheritance.
NetNsFD int32
// ExtMountMaps is passed to CRIU as ExtMnt.
ExtMountMaps []*criurpc.ExtMountMap
}
// OpenImageDir opens the checkpoint directory at checkpointPath for use as the
// CRIU image directory (clearing CLOEXEC so CRIU can inherit it).
// It delegates to common.OpenDirForCRIU and returns the open file, its
// descriptor number, and any error. The caller must close the file when done.
func OpenImageDir(checkpointPath string) (*os.File, int32, error) {
	dir, fd, err := common.OpenDirForCRIU(checkpointPath)
	return dir, fd, err
}
// OpenNetworkNamespace opens the network namespace at nsPath that the restore
// should target. It delegates to common.OpenDirForCRIU and returns the open
// file, its descriptor number, and any error. The caller must close the file
// when done.
func OpenNetworkNamespace(nsPath string) (*os.File, int32, error) {
	ns, fd, err := common.OpenDirForCRIU(nsPath)
	return ns, fd, err
}
// OpenWorkDir prepares workDir as a CRIU work directory: it creates the
// directory if needed, opens it, and clears the close-on-exec flag on the
// descriptor.
//
// It returns the open file and its descriptor number. When workDir is empty or
// any step fails it returns (nil, -1); failures are logged as warnings rather
// than returned. The caller must close a non-nil file.
func OpenWorkDir(workDir string, log *logrus.Entry) (*os.File, int32) {
	const noFD int32 = -1

	if workDir == "" {
		return nil, noFD
	}

	// Make sure the directory is there before trying to open it.
	if mkErr := os.MkdirAll(workDir, 0755); mkErr != nil {
		log.WithError(mkErr).Warn("Failed to create CRIU work directory, using default")
		return nil, noFD
	}

	dir, openErr := os.Open(workDir)
	if openErr != nil {
		log.WithError(openErr).Warn("Failed to open CRIU work directory, using default")
		return nil, noFD
	}

	// Writing 0 to the descriptor flags clears FD_CLOEXEC.
	fd := dir.Fd()
	if _, flagErr := unix.FcntlInt(fd, unix.F_SETFD, 0); flagErr != nil {
		log.WithError(flagErr).Warn("Failed to clear CLOEXEC on work dir")
		dir.Close()
		return nil, noFD
	}

	log.WithField("path", workDir).Info("Using custom CRIU work directory")
	return dir, int32(fd)
}
// BuildRestoreCRIUOpts translates a CRIURestoreConfig into the CriuOpts
// message used for a restore request.
//
// The following options are always enabled for K8s:
//   - ShellJob: containers are often session leaders
//   - TcpClose: pod IPs change on restore/migration
//   - FileLocks: applications use file locks
//   - ExtUnixSk: containers have external Unix sockets
//   - ManageCgroups (IGNORE): let K8s manage cgroups
//
// InheritFd (key "extNetNs") is set only when cfg.NetNsFD >= 0, and WorkDirFd
// only when cfg.WorkDirFD >= 0.
func BuildRestoreCRIUOpts(cfg CRIURestoreConfig) *criurpc.CriuOpts {
	ignoreCgroups := criurpc.CriuCgMode_IGNORE

	opts := new(criurpc.CriuOpts)

	// Image location and logging.
	opts.ImagesDirFd = proto.Int32(cfg.ImageDirFD)
	opts.LogLevel = proto.Int32(cfg.LogLevel)
	opts.LogFile = proto.String(cfg.LogFile)

	// Root filesystem: the current container's root.
	opts.Root = proto.String(cfg.RootPath)

	// Restore mode and mount-namespace compatibility for cross-container restore.
	opts.RstSibling = proto.Bool(true)
	opts.MntnsCompatMode = proto.Bool(true)

	// Always-on switches for K8s environments.
	opts.ShellJob = proto.Bool(true)
	opts.TcpClose = proto.Bool(true)
	opts.FileLocks = proto.Bool(true)
	opts.ExtUnixSk = proto.Bool(true)

	// Cgroups: managed in IGNORE mode to avoid conflicts.
	opts.ManageCgroups = proto.Bool(true)
	opts.ManageCgroupsMode = &ignoreCgroups

	// Device and inode handling.
	opts.EvasiveDevices = proto.Bool(true)
	opts.ForceIrmap = proto.Bool(true)

	// External mount mappings.
	opts.ExtMnt = cfg.ExtMountMaps

	// Optional: inherit the target network namespace.
	if cfg.NetNsFD >= 0 {
		netNs := &criurpc.InheritFd{
			Key: proto.String("extNetNs"),
			Fd:  proto.Int32(cfg.NetNsFD),
		}
		opts.InheritFd = []*criurpc.InheritFd{netNs}
	}

	// Optional: custom work directory.
	if cfg.WorkDirFD >= 0 {
		opts.WorkDirFd = proto.Int32(cfg.WorkDirFD)
	}

	return opts
}
package restore
import (
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"github.com/sirupsen/logrus"
)
const (
// RootfsDiffFilename is the name of the rootfs diff tar file.
// ApplyRootfsDiff looks for it directly inside the checkpoint directory.
RootfsDiffFilename = "rootfs-diff.tar"
// DeletedFilesFilename is the name of the deleted files JSON.
// ApplyDeletedFiles reads it from the checkpoint directory and parses it as a JSON array of path strings.
DeletedFilesFilename = "deleted-files.json"
)
// ApplyRootfsDiff unpacks RootfsDiffFilename from checkpointPath into
// targetRoot, restoring filesystem changes made in the original container.
//
// A missing archive is not an error. Extraction runs the external tar binary
// with --keep-old-files and excludes paths that the container runtime usually
// mounts itself. tar exit status 1 is tolerated (files skipped because they
// already exist or sit on read-only mounts); any other failure is returned
// together with tar's combined output.
func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error {
	archive := filepath.Join(checkpointPath, RootfsDiffFilename)

	// Nothing to do when the checkpoint carries no rootfs diff.
	if _, statErr := os.Stat(archive); os.IsNotExist(statErr) {
		log.Info("No rootfs-diff.tar found, skipping filesystem restoration")
		return nil
	}

	log.WithField("path", archive).Info("Applying rootfs diff")

	tarArgs := []string{
		// Don't overwrite existing files (they may already be mounted).
		"--keep-old-files",
		// Paths typically mounted read-only by the container runtime.
		"--exclude=./run/secrets",
		"--exclude=./etc/resolv.conf",
		"--exclude=./etc/hostname",
		"--exclude=./etc/hosts",
		"-C", targetRoot,
		"-xf", archive,
	}
	out, runErr := exec.Command("tar", tarArgs...).CombinedOutput()
	if runErr == nil {
		log.Info("Rootfs diff applied successfully")
		return nil
	}

	// Exit status 1 covers expected, non-fatal cases such as "file exists".
	if exitErr, isExit := runErr.(*exec.ExitError); isExit && exitErr.ExitCode() == 1 {
		log.WithField("output", string(out)).Info("Rootfs diff applied (some files may have been skipped due to mounts)")
		return nil
	}

	return fmt.Errorf("failed to extract rootfs diff: %w (output: %s)", runErr, string(out))
}
// ApplyDeletedFiles removes files that were deleted in the original container.
// These are tracked via overlay whiteout markers (.wh.<filename>).
//
// The list is read from DeletedFilesFilename inside checkpointPath and parsed
// as a JSON array of path strings. Each entry is joined onto targetRoot and
// removed recursively.
//
// A missing list is not an error. Entries that are empty, that resolve to
// targetRoot itself, or that would escape targetRoot are skipped, so a
// malformed list can neither wipe the whole root nor touch paths outside it.
// Individual removal failures are logged at debug level and skipped.
//
// Returns an error only when the list exists but cannot be read or parsed.
func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) error {
	deletedFilesPath := filepath.Join(checkpointPath, DeletedFilesFilename)

	data, err := os.ReadFile(deletedFilesPath)
	if os.IsNotExist(err) {
		log.Debug("No deleted-files.json found")
		return nil
	}
	if err != nil {
		return fmt.Errorf("failed to read deleted files list: %w", err)
	}

	log.Info("Applying deleted files from whiteout list")

	// Parse JSON array of deleted file paths.
	var deletedFiles []string
	if err := json.Unmarshal(data, &deletedFiles); err != nil {
		return fmt.Errorf("failed to parse deleted files JSON: %w", err)
	}

	root := filepath.Clean(targetRoot)
	parentPrefix := ".." + string(filepath.Separator)

	deletedCount := 0
	for _, filePath := range deletedFiles {
		if filePath == "" {
			continue
		}
		targetPath := filepath.Join(root, filePath)

		// Refuse entries like ".", "/" or "../x": removing the root itself or
		// anything outside it is never a legitimate whiteout.
		rel, relErr := filepath.Rel(root, targetPath)
		escapes := rel == ".." || (len(rel) >= len(parentPrefix) && rel[:len(parentPrefix)] == parentPrefix)
		if relErr != nil || rel == "." || escapes {
			log.WithField("path", filePath).Warn("Skipping unsafe deleted-files entry")
			continue
		}

		// Lstat (not Stat) so that a dangling symlink still counts as present
		// and gets removed; RemoveAll removes the link itself.
		if _, err := os.Lstat(targetPath); os.IsNotExist(err) {
			continue
		}
		if err := os.RemoveAll(targetPath); err != nil {
			log.WithError(err).WithField("path", targetPath).Debug("Could not delete file")
			continue
		}
		deletedCount++
	}

	log.WithField("count", deletedCount).Info("Deleted files applied")
	return nil
}
// CheckpointFilesExist reports whether checkpointPath contains at least one
// CRIU core image, i.e. a file matching "core-*.img". A glob error is treated
// the same as no match.
func CheckpointFilesExist(checkpointPath string) bool {
	pattern := filepath.Join(checkpointPath, "core-*.img")
	found, globErr := filepath.Glob(pattern)
	return globErr == nil && len(found) > 0
}
package restore
import (
"fmt"
criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
// GenerateExtMountMaps builds the external mount mappings for a CRIU restore.
//
// The result always starts with the root mapping "/" -> ".". It is followed by
// an identity mapping for every mount point listed in /proc/1/mountinfo (the
// restore container's mounts), then for every masked path, then for every
// readonly path from meta. Each key is added at most once; the first
// occurrence wins.
//
// Masked paths come from meta.MaskedPaths when meta is non-nil and the list is
// non-empty; otherwise common.DefaultMaskedPaths() is used. Readonly paths are
// only added when meta is non-nil.
//
// Returns an error when mountinfo cannot be parsed.
func GenerateExtMountMaps(meta *common.CheckpointMetadata) ([]*criurpc.ExtMountMap, error) {
	var (
		result []*criurpc.ExtMountMap
		seen   = make(map[string]bool)
	)

	// register appends key -> val unless key has already been mapped.
	register := func(key, val string) {
		if seen[key] {
			return
		}
		seen[key] = true
		result = append(result, &criurpc.ExtMountMap{
			Key: proto.String(key),
			Val: proto.String(val),
		})
	}

	// Root filesystem mapping goes first.
	register("/", ".")

	// All current mount points of the restore container.
	mountPoints, err := common.GetMountPointPaths("/proc/1/mountinfo")
	if err != nil {
		return nil, fmt.Errorf("failed to parse mountinfo: %w", err)
	}
	for _, mp := range mountPoints {
		register(mp, mp)
	}

	// Masked paths: prefer the OCI-derived list from metadata, fall back to
	// the defaults for backwards compatibility.
	masked := common.DefaultMaskedPaths()
	if meta != nil && len(meta.MaskedPaths) > 0 {
		masked = meta.MaskedPaths
	}
	for _, p := range masked {
		register(p, p)
	}

	// Readonly paths are only known when metadata is available.
	if meta != nil {
		for _, p := range meta.ReadonlyPaths {
			register(p, p)
		}
	}

	return result, nil
}
// AddExtMountMap returns a new ExtMountMap entry mapping key to val.
func AddExtMountMap(key, val string) *criurpc.ExtMountMap {
	entry := new(criurpc.ExtMountMap)
	entry.Key = proto.String(key)
	entry.Val = proto.String(val)
	return entry
}
package restore
import (
criu "github.com/checkpoint-restore/go-criu/v7"
"github.com/sirupsen/logrus"
)
// RestoreNotify implements criu.Notify for restore callbacks.
// It captures the restored process PID from the PostRestore callback.
// Every callback defined on this type tolerates a nil logger.
type RestoreNotify struct {
criu.NoNotify // Embed no-op implementation for all methods
// RestoredPID is the PID of the restored process, set by PostRestore callback.
// It keeps its zero value until PostRestore has been invoked.
RestoredPID int32
// log is the logger for notification events; when nil, callbacks log nothing.
log *logrus.Entry
}
// NewRestoreNotify returns a RestoreNotify that reports callbacks to log.
// RestoredPID starts at zero.
func NewRestoreNotify(log *logrus.Entry) *RestoreNotify {
	handler := new(RestoreNotify)
	handler.log = log
	return handler
}
// PreRestore is invoked before CRIU starts the restore operation.
// It only emits a debug log line (when a logger is set) and never fails.
func (n *RestoreNotify) PreRestore() error {
	if n.log == nil {
		return nil
	}
	n.log.Debug("CRIU pre-restore notification")
	return nil
}
// PostRestore is invoked after CRIU completes the restore operation.
// It records pid, the PID of the restored process, in RestoredPID and logs it
// when a logger is set. It never fails.
func (n *RestoreNotify) PostRestore(pid int32) error {
	n.RestoredPID = pid
	if n.log == nil {
		return nil
	}
	n.log.WithField("pid", pid).Info("CRIU post-restore notification: process restored")
	return nil
}
// PostResume is invoked after the restored process has resumed execution.
// It only emits a debug log line (when a logger is set) and never fails.
func (n *RestoreNotify) PostResume() error {
	if n.log == nil {
		return nil
	}
	n.log.Debug("CRIU post-resume notification")
	return nil
}
// Package restore provides CRIU restore operations for self-restoring placeholder containers.
package restore
import (
	"context"
	"os"
	"strconv"
	"strings"
	"time"

	criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
	"github.com/sirupsen/logrus"

	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
// Config holds the configuration for the restore entrypoint.
// These values are typically set via environment variables.
// Boolean settings populated from the environment are true only when the variable equals "1".
type Config struct {
// CheckpointPath is the base directory containing checkpoints (default: /checkpoints)
// Env: DYN_CHECKPOINT_PATH
CheckpointPath string
// CheckpointHash is the ID/hash of the checkpoint to restore
// Env: DYN_CHECKPOINT_HASH
CheckpointHash string
// RestoreTrigger is the path to the trigger file that signals restore should start
// Env: RESTORE_TRIGGER (default: /tmp/restore-trigger)
RestoreTrigger string
// WaitForCheckpoint indicates whether to wait for a checkpoint to appear
// Env: WAIT_FOR_CHECKPOINT
WaitForCheckpoint bool
// WaitTimeout is the maximum time to wait for a checkpoint to become available
// Env: RESTORE_WAIT_TIMEOUT, in whole seconds (default: 300)
WaitTimeout time.Duration
// CRIULogLevel is the CRIU verbosity level (0-4, default: 4)
// Env: CRIU_LOG_LEVEL
CRIULogLevel int32
// DefaultCmd is the command to run if no checkpoint is available
// Env: DEFAULT_CMD
DefaultCmd string
// Debug enables debug logging
// Env: DEBUG
Debug bool
// EmbeddedCheckpointPath is the path to an embedded checkpoint within the image
// When set, the checkpoint data is baked into the container image itself
// Env: EMBEDDED_CHECKPOINT_PATH (default: DefaultEmbeddedCheckpointPath)
EmbeddedCheckpointPath string
// SkipInFlightConnections skips in-flight TCP connections during restore
// Env: CRIU_SKIP_IN_FLIGHT
SkipInFlightConnections bool
// AutoDedup enables auto-deduplication of memory pages
// Env: CRIU_AUTO_DEDUP
AutoDedup bool
// LazyPages enables lazy page migration (experimental)
// Env: CRIU_LAZY_PAGES
LazyPages bool
// CRIUWorkDir is an alternative work directory for CRIU (instead of /tmp)
// Useful when /tmp has mount issues
// Env: CRIU_WORK_DIR
CRIUWorkDir string
// CUDAPluginDir is the path to CRIU CUDA plugin directory (e.g., /usr/local/lib/criu)
// When set, a CRIU config file is created with libdir for CUDA plugin discovery during restore.
// Env: CUDA_PLUGIN_DIR
CUDAPluginDir string
// CRIUTimeout is the CRIU timeout in seconds (required for CUDA restores)
// Env: CRIU_TIMEOUT (default: 0)
CRIUTimeout uint32
// RestoreMarkerFile is the path to a marker file created before CRIU restore.
// The restored process can check for this file to detect it was restored.
// Env: DYN_RESTORE_MARKER_FILE (default: /tmp/dynamo-restored)
RestoreMarkerFile string
}
// DefaultEmbeddedCheckpointPath is the default path for embedded checkpoints.
// ConfigFromEnv uses it when EMBEDDED_CHECKPOINT_PATH is unset or empty.
const DefaultEmbeddedCheckpointPath = "/embedded-checkpoint"
// ConfigFromEnv builds a Config from the process environment.
//
// String settings fall back to their defaults when the variable is unset or
// empty, boolean settings are true only when the variable equals "1", and
// numeric settings fall back to their defaults when the variable is empty or
// not a valid integer.
func ConfigFromEnv() *Config {
	// enabled reports whether the named variable is set to exactly "1".
	enabled := func(name string) bool { return os.Getenv(name) == "1" }

	cfg := new(Config)

	// Checkpoint location and discovery.
	cfg.CheckpointPath = getEnvOrDefault("DYN_CHECKPOINT_PATH", "/checkpoints")
	cfg.CheckpointHash = os.Getenv("DYN_CHECKPOINT_HASH")
	cfg.RestoreTrigger = getEnvOrDefault("RESTORE_TRIGGER", "/tmp/restore-trigger")
	cfg.EmbeddedCheckpointPath = getEnvOrDefault("EMBEDDED_CHECKPOINT_PATH", DefaultEmbeddedCheckpointPath)
	cfg.WaitForCheckpoint = enabled("WAIT_FOR_CHECKPOINT")
	cfg.WaitTimeout = parseDurationOrDefault("RESTORE_WAIT_TIMEOUT", 300*time.Second)

	// Entrypoint behaviour.
	cfg.DefaultCmd = os.Getenv("DEFAULT_CMD")
	cfg.Debug = enabled("DEBUG")
	cfg.RestoreMarkerFile = getEnvOrDefault("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored")

	// CRIU tuning.
	cfg.CRIULogLevel = parseIntOrDefault("CRIU_LOG_LEVEL", 4)
	cfg.SkipInFlightConnections = enabled("CRIU_SKIP_IN_FLIGHT")
	cfg.AutoDedup = enabled("CRIU_AUTO_DEDUP")
	cfg.LazyPages = enabled("CRIU_LAZY_PAGES")
	cfg.CRIUWorkDir = getEnvOrDefault("CRIU_WORK_DIR", "")
	// For CUDA plugin discovery during restore.
	cfg.CUDAPluginDir = os.Getenv("CUDA_PLUGIN_DIR")
	cfg.CRIUTimeout = uint32(parseIntOrDefault("CRIU_TIMEOUT", 0))

	return cfg
}
// RestoreOptions holds the options for a CRIU restore operation.
// Most CRIU options are hardcoded with safe K8s defaults.
type RestoreOptions struct {
// CheckpointPath is the path to the checkpoint directory
CheckpointPath string
// RootPath is the root filesystem path for restore (typically "/")
RootPath string
// PidFile is the path where CRIU writes the restored process PID.
// Restore only reads it as a fallback when the PostRestore callback reported no PID.
PidFile string
// LogFile is the name of the CRIU restore log file.
// On failure it is read back from inside CheckpointPath.
LogFile string
// LogLevel is the CRIU logging verbosity (0-4)
LogLevel int32
// ExtMountMaps contains external mount mappings for CRIU.
// When nil, Restore generates them with GenerateExtMountMaps(nil).
ExtMountMaps []*criurpc.ExtMountMap
// WorkDir is an alternative work directory for CRIU (instead of /tmp).
// Empty means no custom work directory is opened.
WorkDir string
// LibDir is the path to CRIU plugin directory (e.g., /usr/local/lib/criu)
// When set, a CRIU config file is created with libdir for CUDA plugin discovery.
LibDir string
// Timeout is the CRIU timeout in seconds (required for CUDA restores).
// Restore fails when LibDir is set and Timeout is 0.
Timeout uint32
}
// DefaultRestoreOptions returns RestoreOptions for checkpointPath populated
// with the standard defaults: root "/", PID file "/tmp/restored.pid", log file
// "restore.log" and log level 4. All other fields keep their zero values.
func DefaultRestoreOptions(checkpointPath string) *RestoreOptions {
	opts := new(RestoreOptions)
	opts.CheckpointPath = checkpointPath
	opts.RootPath = "/"
	opts.PidFile = "/tmp/restored.pid"
	opts.LogFile = "restore.log"
	opts.LogLevel = 4
	return opts
}
// LoadRestoreOptions returns the default RestoreOptions for checkpointPath
// with LogLevel set to logLevel and, when possible, ExtMountMaps pre-generated
// from the checkpoint metadata (OCI-derived masked/readonly paths).
//
// CRIU options themselves are hardcoded with safe K8s defaults; metadata is
// only used for mount mappings. If the metadata cannot be loaded or the mount
// maps cannot be generated, ExtMountMaps is left nil and no error is reported.
// The returned error is always nil.
func LoadRestoreOptions(checkpointPath string, logLevel int32) (*RestoreOptions, error) {
	opts := DefaultRestoreOptions(checkpointPath)
	opts.LogLevel = logLevel

	meta, metaErr := common.LoadMetadata(checkpointPath)
	if metaErr == nil {
		// Only a successful generation overrides the (nil) default.
		if mounts, genErr := GenerateExtMountMaps(meta); genErr == nil {
			opts.ExtMountMaps = mounts
		}
	}

	return opts, nil
}
// ShouldRestore checks if a restore should be performed.
// Returns the checkpoint path and true if restore should proceed, or "" and
// false otherwise.
//
// Sources are tried in priority order:
//  0. An embedded checkpoint: cfg.EmbeddedCheckpointPath contains the metadata file.
//  1. cfg.CheckpointHash: <CheckpointPath>/<CheckpointHash>/checkpoint.done exists.
//  2. cfg.RestoreTrigger: the trigger file holds a checkpoint path (surrounding
//     whitespace is ignored) whose checkpoint.done exists.
//
// IMPORTANT: We check for checkpoint.done marker (not just metadata.json or inventory.img) because
// checkpoint.done is written LAST in the checkpoint process, after rootfs-diff.tar completes.
// Order: metadata.json -> CRIU dump (*.img files) -> rootfs-diff.tar -> checkpoint.done
func ShouldRestore(cfg *Config, log *logrus.Entry) (string, bool) {
	// Method 0: Embedded checkpoint in image (highest priority)
	// This is for self-contained checkpoint images where data is baked in
	if cfg.EmbeddedCheckpointPath != "" {
		metadataPath := cfg.EmbeddedCheckpointPath + "/" + common.MetadataFilename
		if _, err := os.Stat(metadataPath); err == nil {
			log.WithField("path", cfg.EmbeddedCheckpointPath).Info("Embedded checkpoint found in image")
			return cfg.EmbeddedCheckpointPath, true
		}
	}

	// Method 1: DYN_CHECKPOINT_HASH is set and checkpoint is fully complete
	if cfg.CheckpointHash != "" {
		checkpointPath := cfg.CheckpointPath + "/" + cfg.CheckpointHash

		// Check for checkpoint.done marker (written LAST after rootfs-diff.tar completes)
		donePath := checkpointPath + "/checkpoint.done"
		if _, err := os.Stat(donePath); err == nil {
			log.WithField("path", checkpointPath).Info("Checkpoint found (checkpoint.done marker present)")
			return checkpointPath, true
		}

		// Fallback: check for metadata.json but warn about potential race condition
		metadataPath := checkpointPath + "/" + common.MetadataFilename
		if _, err := os.Stat(metadataPath); err == nil {
			log.WithFields(logrus.Fields{
				"path":    checkpointPath,
				"warning": "checkpoint.done marker not found, checkpoint may be incomplete",
			}).Warn("Checkpoint metadata found but checkpoint.done missing - checkpoint may still be in progress")
			// Don't return true here - wait for checkpoint.done
		}
	}

	// Method 2: Restore trigger file exists with checkpoint path
	if cfg.RestoreTrigger != "" {
		data, err := os.ReadFile(cfg.RestoreTrigger)
		if err == nil {
			// Trim whitespace: a trigger written with e.g. `echo` ends in a
			// newline, which would otherwise become part of the path and make
			// the checkpoint.done lookup fail forever.
			checkpointPath := strings.TrimSpace(string(data))
			if checkpointPath != "" {
				donePath := checkpointPath + "/checkpoint.done"
				if _, err := os.Stat(donePath); err == nil {
					log.WithField("path", checkpointPath).Info("Restore triggered via file (checkpoint.done marker present)")
					return checkpointPath, true
				}
			}
		}
	}

	return "", false
}
// WaitForCheckpoint polls ShouldRestore once per second until a checkpoint is
// available, ctx is cancelled, or cfg.WaitTimeout has elapsed.
//
// It returns the checkpoint path on success, ctx.Err() on cancellation, and
// context.DeadlineExceeded on timeout. A progress line is logged roughly every
// 30 seconds while waiting.
func WaitForCheckpoint(ctx context.Context, cfg *Config, log *logrus.Entry) (string, error) {
	log.WithField("timeout", cfg.WaitTimeout).Info("Waiting for checkpoint")

	started := time.Now()
	deadline := started.Add(cfg.WaitTimeout)
	lastProgress := started

	tick := time.NewTicker(time.Second)
	defer tick.Stop()

	for {
		// Block until the next tick, bailing out on cancellation.
		select {
		case <-ctx.Done():
			return "", ctx.Err()
		case <-tick.C:
		}

		if path, ready := ShouldRestore(cfg, log); ready {
			return path, nil
		}

		// Log progress every 30 seconds.
		if time.Since(lastProgress) >= 30*time.Second {
			log.WithField("elapsed", time.Since(started)).Info("Still waiting for checkpoint...")
			lastProgress = time.Now()
		}

		if time.Now().After(deadline) {
			return "", context.DeadlineExceeded
		}
	}
}
// Helper functions
// getEnvOrDefault returns the value of environment variable key, or
// defaultValue when the variable is unset or empty.
func getEnvOrDefault(key, defaultValue string) string {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}
	return value
}
// parseDurationOrDefault reads environment variable key as a whole number of
// seconds and returns it as a time.Duration. It returns defaultValue when the
// variable is unset, empty, or not a valid integer.
func parseDurationOrDefault(key string, defaultValue time.Duration) time.Duration {
	raw := os.Getenv(key)
	if raw != "" {
		if secs, err := strconv.Atoi(raw); err == nil {
			return time.Duration(secs) * time.Second
		}
	}
	return defaultValue
}
// parseIntOrDefault reads environment variable key as a base-10 int32.
// It returns defaultValue when the variable is unset, empty, not a valid
// integer, or outside the int32 range (instead of silently truncating it).
func parseIntOrDefault(key string, defaultValue int32) int32 {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}
	// bitSize 32 makes ParseInt report a range error for values that do not
	// fit, so the conversion below can never wrap around.
	i, err := strconv.ParseInt(value, 10, 32)
	if err != nil {
		return defaultValue
	}
	return int32(i)
}
package restore
import (
"fmt"
"io"
"os"
"os/exec"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
"github.com/sirupsen/logrus"
)
// MonitorProcess monitors the restored process and returns its exit code.
// It blocks until the process exits, polling once per second. Does not forward
// stdout/stderr. For output forwarding, use ForwardProcessOutput instead.
//
// When the process is a child of the caller it is reaped here and its real
// status is returned (exit status, or 128+signal when killed by a signal).
// For a non-child process, exit is detected with a signal-0 probe and the exit
// code is whatever getExitCode can determine. Returns 1 if the process cannot
// be looked up.
func MonitorProcess(pid int, log *logrus.Entry) int {
	log.WithField("pid", pid).Info("Monitoring restored process")

	// The lookup does not depend on loop state, so do it once.
	proc, err := os.FindProcess(pid)
	if err != nil {
		log.WithError(err).Error("Failed to find process")
		return 1
	}

	for {
		// Try to reap first: an exited child stays around as a zombie until it
		// is waited for, and a zombie still accepts signal 0, so the probe
		// below alone would never notice that a child has exited.
		var ws syscall.WaitStatus
		if wpid, waitErr := syscall.Wait4(pid, &ws, syscall.WNOHANG, nil); waitErr == nil && wpid == pid {
			exitCode := 0
			switch {
			case ws.Exited():
				exitCode = ws.ExitStatus()
			case ws.Signaled():
				exitCode = 128 + int(ws.Signal())
			}
			log.WithField("pid", pid).Info("Restored process exited")
			log.WithField("exit_code", exitCode).Info("Restored process exit status")
			return exitCode
		}

		// Not (yet) reapable: check if the process still exists by sending signal 0.
		if err := proc.Signal(syscall.Signal(0)); err != nil {
			// Process has exited
			log.WithField("pid", pid).Info("Restored process exited")
			exitCode := getExitCode(pid)
			log.WithField("exit_code", exitCode).Info("Restored process exit status")
			return exitCode
		}

		time.Sleep(time.Second)
	}
}
// ForwardProcessOutput relays the stdout and stderr of the restored process
// to this process's stdout/stderr by reading /proc/<pid>/fd/1 and
// /proc/<pid>/fd/2, so that the restored process's logs appear in kubectl logs.
//
// It blocks until the process exits, then stops the forwarding goroutines,
// allows them a short grace period to flush, and returns the exit code.
func ForwardProcessOutput(pid int, log *logrus.Entry) int {
	log.WithField("pid", pid).Info("Forwarding output from restored process")

	// Closed once the process has exited to tell the forwarders to stop.
	stop := make(chan struct{})

	streams := []struct {
		fd   int
		sink *os.File
		name string
	}{
		{fd: 1, sink: os.Stdout, name: "stdout"},
		{fd: 2, sink: os.Stderr, name: "stderr"},
	}
	for _, s := range streams {
		go forwardFD(fmt.Sprintf("/proc/%d/fd/%d", pid, s.fd), s.sink, s.name, log, stop)
	}

	code := waitForProcess(pid, log)
	close(stop)

	// Give goroutines a moment to flush any remaining output.
	time.Sleep(100 * time.Millisecond)

	return code
}
// forwardFD copies data from the file at fdPath to dst until done is closed,
// the source reports EOF, or a non-timeout read error occurs.
//
// Failures to open or stat the path are logged at debug level and end the
// forwarding silently. name is only used to label log messages. Write errors
// on dst and errors from setting the read deadline are ignored.
func forwardFD(fdPath string, dst io.Writer, name string, log *logrus.Entry, done <-chan struct{}) {
	src, openErr := os.Open(fdPath)
	if openErr != nil {
		log.WithError(openErr).WithField("path", fdPath).Debug("Could not open process FD for forwarding")
		return
	}
	defer src.Close()

	// Record what kind of file we are reading from.
	info, statErr := src.Stat()
	if statErr != nil {
		log.WithError(statErr).WithField("path", fdPath).Debug("Could not stat process FD")
		return
	}
	log.WithFields(logrus.Fields{
		"name": name,
		"mode": info.Mode().String(),
		"path": fdPath,
	}).Debug("Forwarding process output")

	chunk := make([]byte, 4096)
	for {
		// Non-blocking check for the stop signal.
		select {
		case <-done:
			return
		default:
		}

		// A short deadline lets the loop re-check done periodically.
		src.SetReadDeadline(time.Now().Add(100 * time.Millisecond))
		n, readErr := src.Read(chunk)
		if n > 0 {
			dst.Write(chunk[:n])
		}

		switch {
		case readErr == nil, os.IsTimeout(readErr):
			// Keep reading.
		case readErr == io.EOF:
			return
		default:
			log.WithError(readErr).WithField("name", name).Debug("Error reading from process FD")
			return
		}
	}
}
// waitForProcess waits for a process to exit and returns its exit code,
// polling every 100ms.
//
// When the process is a child of the caller it is reaped here and its real
// status is returned (exit status, or 128+signal when killed by a signal).
// For a non-child process, exit is detected with a signal-0 probe and the exit
// code is whatever getExitCode can determine. Returns 1 if the process cannot
// be looked up.
func waitForProcess(pid int, log *logrus.Entry) int {
	// The lookup does not depend on loop state, so do it once.
	proc, err := os.FindProcess(pid)
	if err != nil {
		log.WithError(err).Error("Failed to find process")
		return 1
	}

	for {
		// Try to reap first: an exited child stays around as a zombie until it
		// is waited for, and a zombie still accepts signal 0, so the probe
		// below alone would never notice that a child has exited.
		var ws syscall.WaitStatus
		if wpid, waitErr := syscall.Wait4(pid, &ws, syscall.WNOHANG, nil); waitErr == nil && wpid == pid {
			exitCode := 0
			switch {
			case ws.Exited():
				exitCode = ws.ExitStatus()
			case ws.Signaled():
				exitCode = 128 + int(ws.Signal())
			}
			log.WithField("pid", pid).Info("Restored process exited")
			log.WithField("exit_code", exitCode).Info("Restored process exit status")
			return exitCode
		}

		// Not (yet) reapable: check if the process still exists by sending signal 0.
		if err := proc.Signal(syscall.Signal(0)); err != nil {
			// Process has exited
			log.WithField("pid", pid).Info("Restored process exited")
			exitCode := getExitCode(pid)
			log.WithField("exit_code", exitCode).Info("Restored process exit status")
			return exitCode
		}

		time.Sleep(100 * time.Millisecond)
	}
}
// getExitCode attempts to get the exit code of a process without blocking.
//
// It only yields a real status when pid is an exited, not yet reaped child of
// the caller: the exit status for a normal exit, or 128+signal when the
// process was killed by a signal. In every other case (not a child, still
// running, already reaped) it returns 0.
func getExitCode(pid int) int {
	// WNOHANG: never block; for non-child processes Wait4 simply fails.
	var status syscall.WaitStatus
	reaped, err := syscall.Wait4(pid, &status, syscall.WNOHANG, nil)
	if err != nil || reaped != pid {
		return 0
	}

	switch {
	case status.Exited():
		return status.ExitStatus()
	case status.Signaled():
		return 128 + int(status.Signal())
	}
	return 0
}
// SetupSignalForwarding sets up signal forwarding to the restored process.
// SIGTERM, SIGINT and SIGQUIT received by this process are relayed to pid for
// as long as forwarding is active, not just the first one.
//
// Returns a cleanup function that stops the forwarding goroutine and should be
// called exactly once when done.
func SetupSignalForwarding(pid int, log *logrus.Entry) func() {
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGTERM, syscall.SIGINT, syscall.SIGQUIT)

	done := make(chan struct{})
	go func() {
		// Keep forwarding until cleanup: once Notify is registered the default
		// action is disabled, so a signal that is not relayed is simply lost.
		for {
			select {
			case sig := <-sigChan:
				log.WithFields(logrus.Fields{
					"signal": sig,
					"pid":    pid,
				}).Info("Forwarding signal to restored process")
				proc, err := os.FindProcess(pid)
				if err != nil {
					continue
				}
				if err := proc.Signal(sig); err != nil {
					log.WithError(err).WithField("pid", pid).Warn("Failed to forward signal to restored process")
				}
			case <-done:
				return
			}
		}
	}()

	return func() {
		signal.Stop(sigChan)
		close(done)
	}
}
// WaitForPidFile polls pidFile every 100ms until it contains a positive
// integer PID (surrounding whitespace ignored) or timeout elapses.
// It returns the PID, or an error naming the file and timeout when none was
// read in time. The log parameter is currently unused.
func WaitForPidFile(pidFile string, timeout time.Duration, log *logrus.Entry) (int, error) {
	for deadline := time.Now().Add(timeout); time.Now().Before(deadline); time.Sleep(100 * time.Millisecond) {
		raw, readErr := os.ReadFile(pidFile)
		if readErr != nil {
			// File not there (or unreadable) yet; try again after the pause.
			continue
		}
		if pid, convErr := strconv.Atoi(strings.TrimSpace(string(raw))); convErr == nil && pid > 0 {
			return pid, nil
		}
	}
	return 0, fmt.Errorf("timeout waiting for PID file %s after %v", pidFile, timeout)
}
// RunDefault runs the default command when no checkpoint is available.
//
// The command is chosen in this order: cfg.DefaultCmd if set; otherwise
// /docker-entrypoint.sh (with nginx arguments) if that file exists; otherwise
// nginx if it is on PATH; otherwise "sleep infinity". The chosen command is
// handed to execCommand, which replaces the current process, so this function
// only returns when that fails.
func RunDefault(cfg *Config, log *logrus.Entry) error {
	var cmdLine string

	if cfg.DefaultCmd != "" {
		// An explicit DEFAULT_CMD always wins.
		log.WithField("cmd", cfg.DefaultCmd).Info("Running default command")
		cmdLine = cfg.DefaultCmd
	} else if _, statErr := os.Stat("/docker-entrypoint.sh"); statErr == nil {
		// Common application entrypoint.
		log.Info("Running docker-entrypoint.sh")
		cmdLine = "/docker-entrypoint.sh nginx -g 'daemon off;'"
	} else if _, lookErr := exec.LookPath("nginx"); lookErr == nil {
		log.Info("Running nginx")
		cmdLine = "nginx -g 'daemon off;'"
	} else {
		// Nothing recognisable: keep the container alive.
		log.Warn("No default command specified and no known entrypoint found, sleeping")
		cmdLine = "sleep infinity"
	}

	return execCommand(cmdLine)
}
// execCommand executes a command by replacing the current process, so it only
// returns on failure.
//
// A plain command line (no quoting or shell operators) is split on whitespace
// and exec'd directly when its first word resolves on PATH. A command line
// that contains quotes, backslashes, variable/command substitution,
// redirection or control operators, or whose executable cannot be found, is
// run through "/bin/sh -c" instead, because whitespace splitting would pass
// e.g. 'daemon off;' as two broken arguments.
//
// Returns an "empty command" error when cmdLine contains no words.
func execCommand(cmdLine string) error {
	// Characters that require a shell to interpret the command line correctly.
	const shellSyntax = "'\"\\`$|&;<>"

	parts := strings.Fields(cmdLine)
	if len(parts) == 0 {
		return fmt.Errorf("empty command")
	}

	// Default to the shell wrapper; use direct exec only for simple commands
	// whose executable can be located.
	path := "/bin/sh"
	args := []string{"sh", "-c", cmdLine}
	if !strings.ContainsAny(cmdLine, shellSyntax) {
		if resolved, err := exec.LookPath(parts[0]); err == nil {
			path, args = resolved, parts
		}
	}

	// Replace current process with the command
	return syscall.Exec(path, args, os.Environ())
}
package restore
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"time"
criu "github.com/checkpoint-restore/go-criu/v7"
"github.com/sirupsen/logrus"
"google.golang.org/protobuf/proto"
)
// Restore performs the CRIU restore operation using go-criu.
// Returns the PID of the restored process.
//
// The PID comes from the PostRestore notification; if that reported no PID,
// opts.PidFile is polled for up to 10 seconds instead. An error is returned
// when the checkpoint directory or network namespace cannot be opened, mount
// maps cannot be generated, opts.LibDir is set without opts.Timeout, CRIU
// itself fails, or no PID can be determined.
//
// Side effects: opts.ExtMountMaps is filled in when it was nil, and when
// opts.LibDir is set a "restore-criu.conf" file is written into
// opts.CheckpointPath. The ctx parameter is not consulted in this function.
func Restore(ctx context.Context, opts *RestoreOptions, log *logrus.Entry) (int, error) {
log.WithField("checkpoint", opts.CheckpointPath).Info("Starting CRIU restore")
// 1. Open checkpoint directory
imageDir, imageDirFD, err := OpenImageDir(opts.CheckpointPath)
if err != nil {
return 0, err
}
defer imageDir.Close()
log.WithField("fd", imageDirFD).Debug("Opened checkpoint directory")
// 2. Generate external mount mappings if not already set
// (nil metadata makes GenerateExtMountMaps use its default masked paths)
if opts.ExtMountMaps == nil {
extMounts, err := GenerateExtMountMaps(nil)
if err != nil {
return 0, fmt.Errorf("failed to generate mount maps: %w", err)
}
opts.ExtMountMaps = extMounts
}
log.WithField("mount_count", len(opts.ExtMountMaps)).Debug("External mount maps ready")
// 3. Open target network namespace
netNsFile, netNsFD, err := OpenNetworkNamespace("/proc/1/ns/net")
if err != nil {
return 0, err
}
defer netNsFile.Close()
log.WithField("fd", netNsFD).Debug("Opened target network namespace")
// 4. Open work directory if specified
// A failure here is not fatal: OpenWorkDir returns (nil, -1) and the option is left unset.
var workDirFile *os.File
var workDirFD int32 = -1
if opts.WorkDir != "" {
workDirFile, workDirFD = OpenWorkDir(opts.WorkDir, log)
if workDirFile != nil {
defer workDirFile.Close()
}
}
// 5. Build CRIU options
cfg := CRIURestoreConfig{
ImageDirFD: imageDirFD,
RootPath: opts.RootPath,
LogLevel: opts.LogLevel,
LogFile: opts.LogFile,
WorkDirFD: workDirFD,
NetNsFD: netNsFD,
ExtMountMaps: opts.ExtMountMaps,
}
criuOpts := BuildRestoreCRIUOpts(cfg)
// 6. Create CRIU config file for CUDA plugin if libdir is specified
if opts.LibDir != "" {
// A zero timeout is rejected up front for CUDA restores.
if opts.Timeout == 0 {
return 0, fmt.Errorf("CRIU_TIMEOUT environment variable must be set for CUDA restores")
}
configPath := filepath.Join(opts.CheckpointPath, "restore-criu.conf")
configContent := fmt.Sprintf(`enable-external-masters
libdir %s
tcp-close
link-remap
timeout %d
allow-uprobes
skip-in-flight
`, opts.LibDir, opts.Timeout)
// Failing to write the config only produces a warning; the restore continues without ConfigFile.
if err := os.WriteFile(configPath, []byte(configContent), 0644); err != nil {
log.WithError(err).Warn("Failed to write CRIU config file for restore")
} else {
criuOpts.ConfigFile = proto.String(configPath)
log.WithFields(logrus.Fields{
"config_path": configPath,
"lib_dir": opts.LibDir,
}).Info("Created CRIU config file with libdir for CUDA plugin")
}
}
// 7. Execute CRIU restore
c := criu.MakeCriu()
notify := NewRestoreNotify(log)
log.Info("Executing CRIU restore")
criuExecStart := time.Now()
if err := c.Restore(criuOpts, notify); err != nil {
log.WithField("duration", time.Since(criuExecStart)).Error("CRIU c.Restore failed")
// Dump the CRIU log into our own log so the failure is diagnosable.
logCRIUErrors(opts.CheckpointPath, opts.LogFile, log)
return 0, fmt.Errorf("CRIU restore failed: %w", err)
}
log.WithFields(logrus.Fields{
"pid": notify.RestoredPID,
"duration": time.Since(criuExecStart),
}).Info("CRIU c.Restore completed successfully")
// 8. Get restored PID
if notify.RestoredPID > 0 {
return int(notify.RestoredPID), nil
}
// Fallback: try to read from PID file
// NOTE(review): nothing in this function passes opts.PidFile to CRIU — confirm the file is produced elsewhere.
if opts.PidFile != "" {
pid, err := WaitForPidFile(opts.PidFile, 10*time.Second, log)
if err != nil {
return 0, fmt.Errorf("failed to get restored PID: %w", err)
}
return pid, nil
}
return 0, fmt.Errorf("could not determine restored process PID")
}
// logCRIUErrors dumps the CRIU log file <checkpointPath>/<logFile> into log
// at error level, one entry per non-empty line, framed by start/end markers.
//
// If the CRIU_LOG_DIR environment variable is set, the raw log is additionally
// copied there as restore-<unix-timestamp>.log; failures of that copy are
// ignored. An unreadable log file only produces a warning.
func logCRIUErrors(checkpointPath, logFile string, log *logrus.Entry) {
	contents, readErr := os.ReadFile(filepath.Join(checkpointPath, logFile))
	if readErr != nil {
		log.WithError(readErr).Warn("Could not read CRIU log file")
		return
	}

	log.Error("=== CRIU RESTORE LOG START ===")
	for _, line := range strings.Split(string(contents), "\n") {
		if line == "" {
			continue
		}
		log.Error(line)
	}
	log.Error("=== CRIU RESTORE LOG END ===")

	// Optionally keep a copy in a shared directory.
	sharedDir := os.Getenv("CRIU_LOG_DIR")
	if sharedDir == "" {
		return
	}
	if mkErr := os.MkdirAll(sharedDir, 0755); mkErr != nil {
		return
	}
	copyPath := filepath.Join(sharedDir, fmt.Sprintf("restore-%d.log", time.Now().Unix()))
	if writeErr := os.WriteFile(copyPath, contents, 0644); writeErr != nil {
		return
	}
	log.WithField("path", copyPath).Info("CRIU log copied to shared directory")
}
// Run is the main entry point for the restore entrypoint.
// It orchestrates the entire restore process:
//
//   - If CRIU is unavailable, or no checkpoint is configured/received, it hands
//     over to RunDefault and returns its result.
//   - Otherwise it applies the rootfs diff and the deleted-file list from the
//     checkpoint (failures are logged, not fatal), loads the restore options,
//     optionally writes the restore marker file, and runs the CRIU restore.
//   - If the restore fails it falls back to RunDefault (after a 300s pause when
//     cfg.Debug is set, to allow log collection).
//   - On a successful restore it forwards signals and output for the restored
//     process and then terminates this process via os.Exit with the exit code
//     returned by ForwardProcessOutput; Run does not return in that case.
//
// The only error produced directly by Run is a missing CRIU timeout when a CUDA
// plugin directory is configured; every other error comes from RunDefault.
func Run(ctx context.Context, cfg *Config, log *logrus.Entry) error {
	log.Info("=== Self-Restoring Placeholder Entrypoint ===")
	log.WithFields(logrus.Fields{
		"checkpoint_path":          cfg.CheckpointPath,
		"checkpoint_hash":          cfg.CheckpointHash,
		"embedded_checkpoint_path": cfg.EmbeddedCheckpointPath,
		"wait_for_checkpoint":      cfg.WaitForCheckpoint,
		"restore_marker_file":      cfg.RestoreMarkerFile,
	}).Info("Configuration")

	// Check CRIU availability
	c := criu.MakeCriu()
	version, err := c.GetCriuVersion()
	if err != nil {
		log.WithError(err).Error("CRIU is not available")
		log.Info("Falling back to default command")
		return RunDefault(cfg, log)
	}
	log.WithField("version", version).Info("CRIU version")

	// Check if we should restore immediately
	checkpointPath, shouldRestore := ShouldRestore(cfg, log)

	// If not and we're configured to wait, wait for checkpoint
	if !shouldRestore && cfg.WaitForCheckpoint {
		log.Info("Waiting for checkpoint...")
		checkpointPath, err = WaitForCheckpoint(ctx, cfg, log)
		if err != nil {
			log.WithError(err).Info("No checkpoint received, running default command")
			return RunDefault(cfg, log)
		}
		shouldRestore = true
	}

	// If no checkpoint, run default command
	if !shouldRestore {
		log.Info("No checkpoint configured, running default command")
		return RunDefault(cfg, log)
	}

	// Perform restore
	log.WithField("checkpoint", checkpointPath).Info("Checkpoint available, starting restore")
	restoreStart := time.Now()

	// Apply filesystem changes. Failures are logged but do not stop the restore.
	rootfsDiffStart := time.Now()
	if err := ApplyRootfsDiff(checkpointPath, "/", log); err != nil {
		log.WithError(err).Error("Failed to apply rootfs diff")
	}
	log.WithField("duration", time.Since(rootfsDiffStart)).Info("ApplyRootfsDiff completed")

	deletedFilesStart := time.Now()
	if err := ApplyDeletedFiles(checkpointPath, "/", log); err != nil {
		log.WithError(err).Error("Failed to apply deleted files")
	}
	log.WithField("duration", time.Since(deletedFilesStart)).Info("ApplyDeletedFiles completed")

	// Load restore options from metadata.
	// NOTE(review): on error we continue with whatever LoadRestoreOptions
	// returned; this assumes it hands back usable defaults alongside the
	// error — confirm against its implementation.
	loadOptsStart := time.Now()
	opts, err := LoadRestoreOptions(checkpointPath, cfg.CRIULogLevel)
	if err != nil {
		log.WithError(err).Warn("Could not load restore options from metadata, using defaults")
	}
	log.WithField("duration", time.Since(loadOptsStart)).Info("LoadRestoreOptions completed")

	// Apply additional config options
	if cfg.CRIUWorkDir != "" {
		opts.WorkDir = cfg.CRIUWorkDir
	}

	// Set CUDA plugin directory and timeout for restore config file
	if cfg.CUDAPluginDir != "" {
		if cfg.CRIUTimeout == 0 {
			return fmt.Errorf("CRIU_TIMEOUT environment variable must be set for CUDA restores")
		}
		opts.LibDir = cfg.CUDAPluginDir
		opts.Timeout = cfg.CRIUTimeout
		log.WithFields(logrus.Fields{
			"lib_dir": cfg.CUDAPluginDir,
			"timeout": cfg.CRIUTimeout,
		}).Info("CUDA plugin directory and timeout configured for restore")
	}

	// Write restore marker file before CRIU restore
	// This allows the restored process to detect it's been restored
	if cfg.RestoreMarkerFile != "" {
		if err := os.WriteFile(cfg.RestoreMarkerFile, []byte("restored"), 0644); err != nil {
			log.WithError(err).Warn("Failed to write restore marker file")
		} else {
			log.WithField("path", cfg.RestoreMarkerFile).Info("Wrote restore marker file")
		}
	}

	// Perform CRIU restore (CUDA plugin handles CUDA state automatically)
	criuRestoreStart := time.Now()
	pid, err := Restore(ctx, opts, log)
	if err != nil {
		log.WithField("duration", time.Since(criuRestoreStart)).WithError(err).Error("Restore failed, falling back to default command")
		if cfg.Debug {
			log.Info("DEBUG mode: sleeping 300s to allow log collection...")
			time.Sleep(300 * time.Second)
		}
		return RunDefault(cfg, log)
	}
	criuRestoreDuration := time.Since(criuRestoreStart)
	log.WithField("duration", criuRestoreDuration).Info("CRIU Restore completed (CUDA state restored by plugin)")

	totalDuration := time.Since(restoreStart)
	log.WithFields(logrus.Fields{
		"total_duration":        totalDuration,
		"criu_restore_duration": criuRestoreDuration,
	}).Info("=== Restore operation completed ===")

	// Set up signal forwarding and forward stdout/stderr from restored process
	cleanup := SetupSignalForwarding(pid, log)

	// Use ForwardProcessOutput to ensure restored process logs appear in kubectl logs
	exitCode := ForwardProcessOutput(pid, log)

	// os.Exit does not run deferred functions, so the signal-forwarding
	// cleanup has to be invoked explicitly before exiting.
	cleanup()
	os.Exit(exitCode)
	return nil // not reached: os.Exit does not return
}
// Package watcher provides Kubernetes pod watching for automatic checkpointing.
package watcher
import (
"context"
"encoding/json"
"fmt"
"net/http"
"os"
"path/filepath"
"sync"
"time"
"github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
checkpointk8s "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint/k8s"
)
// Pod labels consulted by the watcher.
const (
	// LabelCheckpointSource marks a pod as a checkpoint source; the watcher
	// selects pods carrying this label with the value "true".
	LabelCheckpointSource = "nvidia.com/checkpoint-source"

	// LabelCheckpointHash carries the checkpoint identity hash, which the
	// watcher uses as the checkpoint ID.
	LabelCheckpointHash = "nvidia.com/checkpoint-hash"
)

// EnvCheckpointSignalFile is the name of the container environment variable
// that holds the path (inside the pod) of the checkpoint signal file.
const EnvCheckpointSignalFile = "DYN_CHECKPOINT_SIGNAL_FILE"
// SignalFile is the JSON document written to report the outcome of a
// checkpoint. The same shape is used for the per-pod signal file and for the
// checkpoint.done marker.
type SignalFile struct {
	// CheckpointID identifies the checkpoint the report refers to.
	CheckpointID string `json:"checkpoint_id"`
	// CheckpointPath is the checkpoint directory; writers may leave it empty on failure.
	CheckpointPath string `json:"checkpoint_path"`
	// Timestamp records when the report was produced.
	Timestamp time.Time `json:"timestamp"`
	// Success tells whether the checkpoint completed.
	Success bool `json:"success"`
	// Error holds the failure message; it is omitted from the JSON when empty.
	Error string `json:"error,omitempty"`
}
// Config carries the settings a Watcher runs with.
type Config struct {
	// NodeName is the node this watcher serves; pods scheduled elsewhere are ignored.
	NodeName string
	// CheckpointDir is the base directory under which checkpoints are stored.
	CheckpointDir string
	// HostProc is the path prefix used to reach /proc/<pid>/root of a container.
	HostProc string
	// ListenAddr is the HTTP server address for health checks (e.g., ":8080").
	// The health server is not started when it is empty.
	ListenAddr string
	// RestrictedNamespace optionally restricts watching to one namespace
	// (empty = cluster-wide).
	RestrictedNamespace string

	// GPU/CUDA checkpoint options (passed to checkpoint.Options)

	// CUDAPluginDir is the path to the CRIU CUDA plugin directory.
	CUDAPluginDir string
	// GhostLimit is the ghost file size limit in bytes (default: 512MB for GPU).
	GhostLimit uint32
	// Timeout is the CRIU timeout in seconds.
	Timeout uint32
	// ExternalMounts lists additional external mount mappings.
	ExternalMounts []string
}
// Watcher observes pods carrying the checkpoint labels and triggers a
// checkpoint for each one once it becomes ready.
type Watcher struct {
	config          Config
	clientset       kubernetes.Interface
	discoveryClient *checkpointk8s.DiscoveryClient
	checkpointer    *checkpoint.Checkpointer
	log             *logrus.Entry

	// checkpointed maps "<namespace>/<name>" to the pod's checkpoint status:
	// "in_progress", "completed", or absent (not started / failed).
	// Access is guarded by checkpointedMu.
	checkpointed   map[string]string
	checkpointedMu sync.RWMutex

	// stopCh is closed to stop the informer.
	stopCh chan struct{}
}
// NewWatcher builds a Watcher that talks to the API server through the
// in-cluster configuration.
//
// It returns an error when the in-cluster config cannot be loaded or when the
// Kubernetes clientset cannot be constructed from it.
func NewWatcher(cfg Config, discoveryClient *checkpointk8s.DiscoveryClient, checkpointer *checkpoint.Checkpointer) (*Watcher, error) {
	inClusterCfg, err := rest.InClusterConfig()
	if err != nil {
		return nil, fmt.Errorf("failed to get in-cluster config: %w", err)
	}

	client, err := kubernetes.NewForConfig(inClusterCfg)
	if err != nil {
		return nil, fmt.Errorf("failed to create kubernetes client: %w", err)
	}

	watcher := &Watcher{
		config:          cfg,
		clientset:       client,
		discoveryClient: discoveryClient,
		checkpointer:    checkpointer,
		log:             logrus.WithField("component", "watcher"),
		checkpointed:    make(map[string]string),
		stopCh:          make(chan struct{}),
	}
	return watcher, nil
}
// Start begins watching for pods and starts the health check server.
//
// It blocks until ctx is cancelled or Stop is called, then returns nil. The
// only error it returns is a failure to sync the informer cache. When
// config.ListenAddr is set, a health check HTTP server runs for the lifetime
// of the call and is shut down (5s grace period) when Start returns.
func (w *Watcher) Start(ctx context.Context) error {
	w.log.WithFields(logrus.Fields{
		"node":            w.config.NodeName,
		"label":           LabelCheckpointSource,
		"signal_file_env": EnvCheckpointSignalFile,
	}).Info("Starting pod watcher")

	// Start health check HTTP server if address is configured
	if w.config.ListenAddr != "" {
		httpServer := w.startHealthServer(ctx)
		defer func() {
			shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
			defer cancel()
			if err := httpServer.Shutdown(shutdownCtx); err != nil {
				w.log.WithError(err).Warn("Health check server shutdown failed")
			}
		}()
	}

	// Create informer factory with label selector and optional namespace restriction
	labelSelector := labels.SelectorFromSet(labels.Set{
		LabelCheckpointSource: "true",
	}).String()
	factoryOptions := []informers.SharedInformerOption{
		informers.WithTweakListOptions(func(opts *metav1.ListOptions) {
			opts.LabelSelector = labelSelector
		}),
	}

	// If namespace is specified, restrict watching to that namespace
	if w.config.RestrictedNamespace != "" {
		w.log.WithField("namespace", w.config.RestrictedNamespace).Info("Restricting pod watching to namespace")
		factoryOptions = append(factoryOptions, informers.WithNamespace(w.config.RestrictedNamespace))
	} else {
		w.log.Info("Watching pods cluster-wide (all namespaces)")
	}

	factory := informers.NewSharedInformerFactoryWithOptions(
		w.clientset,
		30*time.Second,
		factoryOptions...,
	)
	podInformer := factory.Core().V1().Pods().Informer()

	// Add event handlers. Objects that are not *corev1.Pod are ignored rather
	// than allowed to panic the informer goroutine.
	podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			if pod, ok := obj.(*corev1.Pod); ok {
				w.handlePodEvent(ctx, pod)
			}
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			if pod, ok := newObj.(*corev1.Pod); ok {
				w.handlePodEvent(ctx, pod)
			}
		},
	})

	// Start informer
	go factory.Start(w.stopCh)

	// Wait for cache sync
	if !cache.WaitForCacheSync(w.stopCh, podInformer.HasSynced) {
		return fmt.Errorf("failed to sync informer cache")
	}
	w.log.Info("Pod watcher started and cache synced")

	// Wait for context cancellation or an explicit Stop().
	select {
	case <-w.stopCh:
		// Stop() already closed the channel; closing it again would panic.
		return nil
	case <-ctx.Done():
	}

	// Close stopCh to stop the informer, unless Stop() closed it in the meantime.
	select {
	case <-w.stopCh:
	default:
		close(w.stopCh)
	}
	return nil
}
// HealthResponse is the JSON body returned by the /health endpoint.
type HealthResponse struct {
	// Status is the reported health state.
	Status string `json:"status"`
	// NodeName is the node the responding watcher is configured for.
	NodeName string `json:"node_name"`
}
// startHealthServer starts an HTTP server for health checks on
// config.ListenAddr and returns it so the caller can shut it down.
//
// The server exposes GET /health, which answers with a JSON HealthResponse;
// other methods get 405. It is served from a background goroutine, and any
// serve error other than http.ErrServerClosed is logged. ctx is currently
// unused.
func (w *Watcher) startHealthServer(ctx context.Context) *http.Server {
	mux := http.NewServeMux()
	mux.HandleFunc("/health", func(rw http.ResponseWriter, r *http.Request) {
		if r.Method != http.MethodGet {
			http.Error(rw, "Method not allowed", http.StatusMethodNotAllowed)
			return
		}
		rw.Header().Set("Content-Type", "application/json")
		resp := HealthResponse{
			Status:   "healthy",
			NodeName: w.config.NodeName,
		}
		if err := json.NewEncoder(rw).Encode(resp); err != nil {
			// The response may already be partially written, so just record it.
			w.log.WithError(err).Warn("Failed to write health check response")
		}
	})

	server := &http.Server{
		Addr:         w.config.ListenAddr,
		Handler:      mux,
		ReadTimeout:  10 * time.Second,
		WriteTimeout: 10 * time.Second,
		IdleTimeout:  60 * time.Second,
	}

	go func() {
		w.log.WithField("addr", w.config.ListenAddr).Info("Starting health check server")
		if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			w.log.WithError(err).Error("Health check server error")
		}
	}()
	return server
}
// Stop stops the watcher by closing its stop channel.
//
// Calling Stop again, or calling it after the stop channel has already been
// closed, is a no-op instead of a "close of closed channel" panic. The check
// and the close are not atomic, so callers that may invoke Stop from several
// goroutines at the same time still need their own synchronization.
func (w *Watcher) Stop() {
	select {
	case <-w.stopCh:
		// Already closed.
	default:
		close(w.stopCh)
	}
}
// handlePodEvent decides whether a pod add/update event should trigger a
// checkpoint and, if so, launches doCheckpoint in a new goroutine.
//
// A pod is skipped when it runs on another node, is not ready, lacks a
// non-empty checkpoint-hash label, or already has a checkpoint recorded as
// "in_progress" or "completed".
func (w *Watcher) handlePodEvent(ctx context.Context, pod *corev1.Pod) {
	// Only pods on this node that are ready are candidates.
	if pod.Spec.NodeName != w.config.NodeName || !w.isPodReady(pod) {
		return
	}

	podKey := pod.Namespace + "/" + pod.Name

	// The checkpoint hash label doubles as the checkpoint ID; a missing label
	// reads as the empty string.
	checkpointID := pod.Labels[LabelCheckpointHash]
	if checkpointID == "" {
		w.log.WithField("pod", podKey).Warn("Pod has checkpoint label but no checkpoint-hash label")
		return
	}

	// Claim the pod under the lock so concurrent events cannot start a second
	// checkpoint for it.
	w.checkpointedMu.Lock()
	switch w.checkpointed[podKey] {
	case "completed", "in_progress":
		w.checkpointedMu.Unlock()
		return
	}
	w.checkpointed[podKey] = "in_progress"
	w.checkpointedMu.Unlock()

	w.log.WithFields(logrus.Fields{
		"pod":           podKey,
		"checkpoint_id": checkpointID,
	}).Info("Pod ready, triggering checkpoint")
	go w.doCheckpoint(ctx, pod, checkpointID, podKey)
}
// isPodReady reports whether the pod is in the Running phase and carries a
// PodReady condition whose status is True.
func (w *Watcher) isPodReady(pod *corev1.Pod) bool {
	if pod.Status.Phase != corev1.PodRunning {
		return false
	}
	for i := range pod.Status.Conditions {
		cond := &pod.Status.Conditions[i]
		if cond.Type != corev1.PodReady {
			continue
		}
		if cond.Status == corev1.ConditionTrue {
			return true
		}
	}
	return false
}
// doCheckpoint checkpoints the pod's target container and reports the outcome.
//
// The target is the container named "main", or the only container when the pod
// has exactly one. On success it writes the checkpoint.done marker, writes the
// signal file into the pod (when the container declares
// DYN_CHECKPOINT_SIGNAL_FILE) and records the pod as "completed". On any
// failure the pod's tracking entry is removed so a later event can retry; if
// the checkpoint itself failed, a failure marker and signal file are written
// first.
func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointID, podKey string) {
	log := w.log.WithFields(logrus.Fields{
		"pod":           podKey,
		"checkpoint_id": checkpointID,
	})

	// forget drops the in_progress entry so the checkpoint can be retried.
	forget := func() {
		w.checkpointedMu.Lock()
		delete(w.checkpointed, podKey)
		w.checkpointedMu.Unlock()
	}

	// Signal file path: taken from the env of the target container in the spec.
	var signalFilePath string
	soleContainer := len(pod.Spec.Containers) == 1
	for i := range pod.Spec.Containers {
		spec := &pod.Spec.Containers[i]
		if spec.Name != "main" && !soleContainer {
			continue
		}
		for _, env := range spec.Env {
			if env.Name == EnvCheckpointSignalFile {
				signalFilePath = env.Value
				break
			}
		}
		break
	}

	// Container ID: taken from the matching container status, without the
	// containerd:// scheme prefix.
	const containerdPrefix = "containerd://"
	var containerID string
	soleStatus := len(pod.Status.ContainerStatuses) == 1
	for i := range pod.Status.ContainerStatuses {
		status := &pod.Status.ContainerStatuses[i]
		if status.Name != "main" && !soleStatus {
			continue
		}
		containerID = status.ContainerID
		if len(containerID) > len(containerdPrefix) && containerID[:len(containerdPrefix)] == containerdPrefix {
			containerID = containerID[len(containerdPrefix):]
		}
		break
	}

	if containerID == "" {
		log.Error("Could not find container ID")
		forget()
		return
	}
	if signalFilePath == "" {
		log.Warn("No DYN_CHECKPOINT_SIGNAL_FILE env var found, signal file will not be written")
	}
	log.WithFields(logrus.Fields{
		"container_id":     containerID,
		"signal_file_path": signalFilePath,
	}).Info("Found container, starting checkpoint")

	// The container's PID is needed later to reach its filesystem when
	// writing the signal file.
	containerInfo, err := w.discoveryClient.ResolveContainer(ctx, containerID)
	if err != nil {
		log.WithError(err).Error("Failed to resolve container")
		forget()
		return
	}
	pid := int(containerInfo.PID)

	result, err := w.checkpointer.Checkpoint(ctx, checkpoint.Options{
		ContainerID:    containerID,
		CheckpointID:   checkpointID,
		CheckpointDir:  w.config.CheckpointDir,
		NodeName:       w.config.NodeName,
		PodName:        pod.Name,
		PodNamespace:   pod.Namespace,
		CUDAPluginDir:  w.config.CUDAPluginDir,
		GhostLimit:     w.config.GhostLimit,
		Timeout:        w.config.Timeout,
		ExternalMounts: w.config.ExternalMounts,
	})
	if err != nil {
		log.WithError(err).Error("Checkpoint failed")

		// Leave a failure marker on the PVC so restore pods know the
		// checkpoint failed.
		failedDir := filepath.Join(w.config.CheckpointDir, checkpointID)
		w.writeCheckpointDoneMarker(failedDir, checkpointID, false, err.Error(), log)
		if signalFilePath != "" {
			w.writeSignalFileToPod(pid, signalFilePath, checkpointID, "", false, err.Error())
		}
		forget()
		return
	}

	log.WithField("checkpoint_dir", result.CheckpointDir).Info("Checkpoint completed successfully")

	// checkpoint.done goes to the PVC for cross-node restore detection; it is
	// written only after Checkpoint has returned successfully.
	w.writeCheckpointDoneMarker(result.CheckpointDir, checkpointID, true, "", log)

	// The signal file tells the checkpoint job pod that it may exit.
	if signalFilePath != "" {
		w.writeSignalFileToPod(pid, signalFilePath, checkpointID, result.CheckpointDir, true, "")
	}

	// Record completion so this pod is not checkpointed again.
	w.checkpointedMu.Lock()
	w.checkpointed[podKey] = "completed"
	w.checkpointedMu.Unlock()
}
// writeSignalFileToPod reports a checkpoint outcome to the checkpointed pod by
// writing a JSON SignalFile into the pod's filesystem, reached from the host
// as <HostProc>/<pid>/root<signalFilePath>.
//
// signalFilePath is the path as seen inside the pod (e.g.
// /var/lib/dynamo-checkpoint/signal.done). Missing parent directories are
// created. Failures are logged and otherwise ignored.
func (w *Watcher) writeSignalFileToPod(pid int, signalFilePath, checkpointID, checkpointPath string, success bool, errMsg string) {
	payload, err := json.MarshalIndent(SignalFile{
		CheckpointID:   checkpointID,
		CheckpointPath: checkpointPath,
		Timestamp:      time.Now().UTC(),
		Success:        success,
		Error:          errMsg,
	}, "", " ")
	if err != nil {
		w.log.WithError(err).Error("Failed to marshal signal file")
		return
	}

	// Location of the signal file as seen from this process.
	hostSignalPath := fmt.Sprintf("%s/%d/root%s", w.config.HostProc, pid, signalFilePath)

	// Make sure the containing directory exists inside the pod's filesystem.
	parent := filepath.Dir(hostSignalPath)
	if mkErr := os.MkdirAll(parent, 0755); mkErr != nil {
		w.log.WithError(mkErr).WithField("path", parent).Error("Failed to create signal directory in pod")
		return
	}

	if writeErr := os.WriteFile(hostSignalPath, payload, 0644); writeErr != nil {
		w.log.WithError(writeErr).WithField("path", hostSignalPath).Error("Failed to write signal file to pod")
		return
	}

	w.log.WithFields(logrus.Fields{
		"host_path": hostSignalPath,
		"pod_path":  signalFilePath,
		"pid":       pid,
		"success":   success,
	}).Info("Signal file written to pod filesystem")
}
// writeCheckpointDoneMarker writes a checkpoint.done marker file to the checkpoint directory on shared PVC.
// This file is written AFTER all checkpoint steps complete (including rootfs-diff.tar).
// Restore pods on ANY node check for this file to know the checkpoint is complete and safe to restore.
// This is separate from writeSignalFileToPod which signals the checkpoint job pod to exit.
//
// The marker is a JSON SignalFile carrying checkpointID, checkpointDir, the
// current UTC time, success and errMsg. It is first written to
// checkpoint.done.tmp and then renamed into place, so a reader that sees
// checkpoint.done never observes an empty or partially written file.
// Failures are logged on log and otherwise ignored.
func (w *Watcher) writeCheckpointDoneMarker(checkpointDir, checkpointID string, success bool, errMsg string, log *logrus.Entry) {
	markerPath := filepath.Join(checkpointDir, "checkpoint.done")
	marker := SignalFile{
		CheckpointID:   checkpointID,
		CheckpointPath: checkpointDir,
		Timestamp:      time.Now().UTC(),
		Success:        success,
		Error:          errMsg,
	}
	data, err := json.MarshalIndent(marker, "", " ")
	if err != nil {
		log.WithError(err).Error("Failed to marshal checkpoint.done marker")
		return
	}

	// Write to a sibling temp file and rename it into place so the marker
	// appears under its final name only once it is complete.
	tmpPath := markerPath + ".tmp"
	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
		log.WithError(err).WithField("path", markerPath).Error("Failed to write checkpoint.done marker")
		return
	}
	if err := os.Rename(tmpPath, markerPath); err != nil {
		// Best effort: do not leave the temp file behind.
		_ = os.Remove(tmpPath)
		log.WithError(err).WithField("path", markerPath).Error("Failed to write checkpoint.done marker")
		return
	}

	log.WithFields(logrus.Fields{
		"path":    markerPath,
		"success": success,
	}).Info("checkpoint.done marker written to PVC")
}
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Smart entrypoint wrapper for CRIU checkpoint/restore
# Automatically detects checkpoints and falls back to cold start if not found
#
# Behavior:
# 1. If DYN_CHECKPOINT_HASH is set and checkpoint exists -> restore
# 2. If WAIT_FOR_CHECKPOINT=1 -> wait for checkpoint (restore-entrypoint handles this)
# 3. Otherwise -> execute provided command (cold start)
# Abort on the first command that fails.
set -e

# DEBUG=1 turns on command tracing.
case "${DEBUG:-0}" in
    1) set -x ;;
esac

# Settings read from the environment, with their defaults.
CHECKPOINT_PATH="${DYN_CHECKPOINT_PATH:-/checkpoints}"   # directory that holds checkpoints
CHECKPOINT_HASH="${DYN_CHECKPOINT_HASH:-}"               # checkpoint to restore (empty = none)
WAIT_FOR_CHECKPOINT="${WAIT_FOR_CHECKPOINT:-0}"          # 1 = wait for the checkpoint to appear
# log MESSAGE...
# Prints the arguments, joined into one line and prefixed with
# "[smart-entrypoint]", to stderr.
log() {
    printf '[smart-entrypoint] %s\n' "$*" >&2
}
# should_restore_checkpoint
# Returns 0 when restore-entrypoint should be invoked, 1 for a cold start.
#
# Restore is chosen when WAIT_FOR_CHECKPOINT=1 (restore-entrypoint then waits
# for the checkpoint), or when DYN_CHECKPOINT_HASH names a checkpoint directory
# that contains the checkpoint.done marker.
should_restore_checkpoint() {
    # Waiting mode always goes through restore-entrypoint.
    if [[ "$WAIT_FOR_CHECKPOINT" == "1" ]]; then
        log "WAIT_FOR_CHECKPOINT=1, will wait for checkpoint via restore-entrypoint"
        return 0
    fi

    # Without a hash there is nothing to look up.
    if [[ -z "$CHECKPOINT_HASH" ]]; then
        log "DYN_CHECKPOINT_HASH not set, no checkpoint to restore"
        return 1
    fi

    # The checkpoint directory must exist.
    CHECKPOINT_DIR="$CHECKPOINT_PATH/$CHECKPOINT_HASH"
    if [[ ! -d "$CHECKPOINT_DIR" ]]; then
        log "Checkpoint directory not found: $CHECKPOINT_DIR"
        return 1
    fi

    # checkpoint.done is written LAST in the checkpoint process, which makes it
    # more reliable than inventory.img (created by CRIU) or rootfs-diff.tar
    # (may be mid-write).
    # Order: metadata.json -> CRIU dump (*.img) -> rootfs-diff.tar -> checkpoint.done
    DONE_MARKER="$CHECKPOINT_DIR/checkpoint.done"
    if [[ ! -f "$DONE_MARKER" ]]; then
        log "Checkpoint incomplete - checkpoint.done not found in: $CHECKPOINT_DIR"
        log "Checkpoint may still be in progress..."
        return 1
    fi

    log "Checkpoint found: $CHECKPOINT_HASH (checkpoint.done marker present)"
    return 0
}
# Main logic: cold start when no checkpoint should be restored, otherwise hand
# over to restore-entrypoint. Both paths end in exec, which replaces this shell.
if ! should_restore_checkpoint; then
    log "=========================================="
    log "COLD START MODE"
    log "=========================================="

    # A cold start needs a command to run; having none is likely an error.
    if (( $# == 0 )); then
        log "ERROR: No checkpoint to restore and no command provided"
        log "Set DYN_CHECKPOINT_HASH to restore a checkpoint, or provide a command to run"
        exit 1
    fi

    log "No checkpoint to restore"
    log "Executing command: $*"
    log "=========================================="

    # Replace this shell with the provided command.
    exec "$@"
fi

log "=========================================="
log "CHECKPOINT RESTORE MODE"
log "=========================================="
log "Checkpoint: $CHECKPOINT_HASH"
log "Location: $CHECKPOINT_PATH/$CHECKPOINT_HASH"
log "Invoking restore-entrypoint..."
log "=========================================="

# Any args passed to this script are forwarded (though restore-entrypoint ignores them)
exec /restore-entrypoint "$@"
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: v2
name: chrek
description: Checkpoint/Restore infrastructure for Dynamo (PVC + DaemonSet + CRIU Agent)
type: application
version: 0.1.0
appVersion: "1.0"
keywords:
- nvidia
- dynamo
- checkpoint
- criu
- gpu
home: https://github.com/ai-dynamo/dynamo
sources:
- https://github.com/ai-dynamo/dynamo
maintainers:
- name: NVIDIA
# ChReK Helm Chart
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. See [Prerequisites](#prerequisites) for security considerations.
This Helm chart deploys the checkpoint/restore infrastructure for NVIDIA Dynamo, including:
- Persistent Volume Claim (PVC) for checkpoint storage
- DaemonSet running the CRIU checkpoint agent
- RBAC resources (ServiceAccount, Role, RoleBinding)
- Seccomp profile for blocking io_uring syscalls
**Note:**
- Each namespace gets its own isolated checkpoint infrastructure with namespace-scoped RBAC
- **Currently only supports vLLM backend** (SGLang and TensorRT-LLM support planned)
## Prerequisites
⚠️ **Security Warning**: ChReK restore operations require **privileged mode**, which grants containers elevated host access. This may violate security policies in production environments. Only deploy in environments where privileged containers are acceptable.
- Kubernetes 1.21+
- GPU nodes with NVIDIA runtime (`nvidia` runtime class)
- CRIU support in the container runtime (containerd with CRIU plugin)
- NVIDIA Dynamo operator installed (cluster-wide or namespace-scoped)
- RWX (ReadWriteMany) storage class for multi-node deployments
- **Security clearance for privileged pods** (required for restore operations)
## Installation
> **Note:** The ChReK Helm chart is not yet published to a public Helm repository. For now, you must build and deploy from source.
### Building from Source
```bash
# Set environment
export NAMESPACE=my-team # Your target namespace
export DOCKER_SERVER=your-registry.com # Your container registry (no trailing slash)
export IMAGE_TAG=latest
# Build ChReK agent image
cd deploy/chrek
docker build --target agent -t $DOCKER_SERVER/chrek-agent:$IMAGE_TAG .
docker push $DOCKER_SERVER/chrek-agent:$IMAGE_TAG
cd -
# Install ChReK chart with custom image
helm install chrek ./deploy/helm/charts/chrek/ \
--namespace ${NAMESPACE} \
--create-namespace \
--set daemonset.image.repository=${DOCKER_SERVER}/chrek-agent \
--set daemonset.image.tag=${IMAGE_TAG} \
--set daemonset.imagePullSecrets[0].name=your-registry-secret
```
## Configuration
See `values.yaml` for all configuration options.
### Key Configuration Options
| Parameter | Description | Default |
|-----------|-------------|---------|
| `storage.type` | Storage type; only `pvc` is currently supported (`s3` and `oci` are planned) | `pvc` |
| `storage.pvc.create` | Create a new PVC | `true` |
| `storage.pvc.name` | PVC name (must match operator config) | `chrek-pvc` |
| `storage.pvc.size` | PVC size | `100Gi` |
| `storage.pvc.storageClass` | Storage class name | `""` (default) |
| `storage.signalHostPath` | Host path for signal files | `/var/lib/chrek/signals` |
| `daemonset.image.repository` | DaemonSet image repository | `nvidia/chrek-agent` |
| `daemonset.nodeSelector` | Node selector for GPU nodes | `nvidia.com/gpu.present: "true"` |
| `daemonset.runtimeClassName` | Runtime class for GPU access | `nvidia` |
| `daemonset.criu.timeout` | CRIU timeout in seconds | `"21600"` (6 hours) |
| `daemonset.criu.ghostLimit` | CRIU ghost file size limit | `"512MB"` |
| `rbac.namespaceRestricted` | Use namespace-scoped RBAC | `true` |
## Usage
After installing this chart, enable checkpointing in your DynamoGraphDeployment:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: my-model
namespace: my-team
spec:
services:
worker:
checkpoint:
enabled: true
mode: auto
identity:
model: Qwen/Qwen3-0.6B
backendFramework: vllm
```
## Multi-Namespace Deployment
To enable checkpointing in multiple namespaces, install this chart in each namespace:
```bash
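# Namespace A (the chart is installed from source; it is not yet published to a Helm repository)
helm install chrek ./deploy/helm/charts/chrek/ -n team-a
# Namespace B
helm install chrek ./deploy/helm/charts/chrek/ -n team-b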
```
Each namespace will have its own isolated checkpoint storage.
## Verification
```bash
# Check PVC
kubectl get pvc chrek-pvc -n my-team
# Check DaemonSet
kubectl get daemonset -n my-team
# Check DaemonSet pods are running
kubectl get pods -n my-team -l app.kubernetes.io/name=checkpoint-agent
```
## Uninstallation
```bash
helm uninstall chrek -n my-team
```
**Note:** This will NOT delete the PVC by default. To delete the PVC:
```bash
kubectl delete pvc chrek-pvc -n my-team
```
## Troubleshooting
### DaemonSet pods not starting
Check if GPU nodes have the correct labels and runtime class:
```bash
kubectl get nodes -l nvidia.com/gpu.present=true
kubectl get runtimeclass nvidia
```
If nodes don't have the `nvidia.com/gpu.present` label, you can add it:
```bash
kubectl label node <node-name> nvidia.com/gpu.present=true
```
### Checkpoint job fails
Check DaemonSet logs:
```bash
kubectl logs -n my-team -l app.kubernetes.io/name=checkpoint-agent
```
### PVC not mounting
Check PVC status and events:
```bash
kubectl describe pvc chrek-pvc -n my-team
```
Ensure your storage class supports `ReadWriteMany` access mode for multi-node deployments.
## Related Documentation
- [ChReK Overview](../../../../docs/kubernetes/chrek/README.md) - ChReK architecture and use cases
- [ChReK with Dynamo Platform](../../../../docs/kubernetes/chrek/dynamo.md) - Integration guide
- [ChReK Standalone Usage](../../../../docs/kubernetes/chrek/standalone.md) - Use ChReK without Dynamo Platform
## License
Apache License 2.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{/*
Expand the name of the chart.
Uses .Values.nameOverride when it is set, otherwise .Chart.Name. The result is
truncated to 63 characters and a trailing "-" is removed (presumably to stay
within the Kubernetes 63-character limit for names and label values).
*/}}
{{- define "chrek.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
Resolution order:
  1. .Values.fullnameOverride, when set.
  2. The release name alone, when it already contains the chart name
     (or .Values.nameOverride, if given).
  3. "<release name>-<chart name or nameOverride>" otherwise.
In every case the result is truncated to 63 characters and a trailing "-" is
removed.
*/}}
{{- define "chrek.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
Produces "<chart name>-<chart version>" with every "+" replaced by "_", then
truncates to 63 characters and removes a trailing "-".
*/}}
{{- define "chrek.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
Emits, in order: helm.sh/chart, the selector labels from
"chrek.selectorLabels", app.kubernetes.io/version (only when the chart has an
appVersion), app.kubernetes.io/managed-by, and a fixed
app.kubernetes.io/component of "checkpoint-agent".
Because the component label is already part of this set, templates that
include it should not add app.kubernetes.io/component a second time.
*/}}
{{- define "chrek.labels" -}}
helm.sh/chart: {{ include "chrek.chart" . }}
{{ include "chrek.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
app.kubernetes.io/component: checkpoint-agent
{{- end }}
{{/*
Selector labels
Emits app.kubernetes.io/name (from "chrek.name") and
app.kubernetes.io/instance (the release name).
*/}}
{{- define "chrek.selectorLabels" -}}
app.kubernetes.io/name: {{ include "chrek.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
When .Values.serviceAccount.create is true: .Values.serviceAccount.name if
set, otherwise the "chrek.fullname" value.
When it is false: .Values.serviceAccount.name if set, otherwise "default".
*/}}
{{- define "chrek.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "chrek.fullname" . ) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Checkpoint agent DaemonSet.
# The agent container is privileged and the pod shares the host PID, IPC and
# network namespaces.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ include "chrek.fullname" . }}-agent
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "chrek.labels" . | nindent 4 }}
    app.kubernetes.io/component: checkpoint-agent
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: checkpoint-agent
      app.kubernetes.io/instance: {{ .Release.Name }}
  template:
    metadata:
      labels:
        # These two labels must keep matching spec.selector.matchLabels above.
        app.kubernetes.io/name: checkpoint-agent
        app.kubernetes.io/instance: {{ .Release.Name }}
        {{- with .Values.daemonset.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- with .Values.daemonset.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
    spec:
      serviceAccountName: {{ include "chrek.serviceAccountName" . }}
      hostPID: true
      hostIPC: true
      hostNetwork: true
      {{- with .Values.daemonset.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      tolerations:
        # Default: tolerate all taints (allow running on any node).
        # Entries from .Values.daemonset.tolerations are appended after it.
        - operator: Exists
        {{- with .Values.daemonset.tolerations }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- with .Values.daemonset.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- if .Values.daemonset.runtimeClassName }}
      # Use specified runtime class for GPU access (e.g., nvidia for CUDA checkpointing)
      runtimeClassName: {{ .Values.daemonset.runtimeClassName }}
      {{- end }}
      {{- if .Values.seccomp.deploy }}
      initContainers:
        # Deploy seccomp profile to host before starting the agent.
        # This profile blocks io_uring syscalls that CRIU doesn't support.
        - name: deploy-seccomp
          # Overridable through .Values.seccomp.image (e.g. a mirrored or pinned
          # tag); falls back to the previously hard-coded busybox:latest.
          image: {{ .Values.seccomp.image | default "busybox:latest" | quote }}
          command:
            - sh
            - -c
            - |
              mkdir -p /host-seccomp/profiles
              cp /seccomp-profiles/block-iouring.json /host-seccomp/profiles/block-iouring.json
              echo "Deployed seccomp profile to /var/lib/kubelet/seccomp/profiles/block-iouring.json"
          volumeMounts:
            - name: seccomp-profiles
              mountPath: /seccomp-profiles
              readOnly: true
            - name: host-seccomp
              mountPath: /host-seccomp
      {{- end }}
      containers:
        - name: agent
          image: "{{ .Values.daemonset.image.repository }}:{{ .Values.daemonset.image.tag }}"
          imagePullPolicy: {{ .Values.daemonset.image.pullPolicy }}
          securityContext:
            privileged: true
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            # Agent mode: use "watcher" to watch for labeled pods
            - name: CHECKPOINT_SIGNAL_FROM
              value: "watcher"
            {{- if .Values.rbac.namespaceRestricted }}
            # Restrict pod watching to this namespace (namespace-scoped RBAC)
            - name: RESTRICTED_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            {{- end }}
            # Checkpoint storage directory (taken from storage.pvc.basePath for every storage type)
            - name: CHECKPOINT_DIR
              value: {{ .Values.storage.pvc.basePath | quote }}
            # Host proc mount point for CRIU operations
            - name: HOST_PROC
              value: "/host/proc"
            # Containerd socket path (quoted like the other templated env values)
            - name: CONTAINERD_SOCKET
              value: {{ .Values.daemonset.containerRuntimeSocket | quote }}
            {{- if .Values.daemonset.criu.cudaPluginDir }}
            # CUDA plugin directory for GPU checkpoint support
            - name: CUDA_PLUGIN_DIR
              value: {{ .Values.daemonset.criu.cudaPluginDir | quote }}
            {{- end }}
            {{- if .Values.daemonset.criu.ghostLimit }}
            # CRIU ghost file size limit in bytes
            - name: CRIU_GHOST_LIMIT
              value: {{ .Values.daemonset.criu.ghostLimit | quote }}
            {{- end }}
            {{- if .Values.daemonset.criu.timeout }}
            # CRIU timeout in seconds
            - name: CRIU_TIMEOUT
              value: {{ .Values.daemonset.criu.timeout | quote }}
            {{- end }}
            # Storage type (for future S3/OCI support)
            - name: DYN_CHECKPOINT_STORAGE_TYPE
              value: {{ .Values.storage.type | quote }}
          volumeMounts:
            {{- if eq .Values.storage.type "pvc" }}
            # Mount the checkpoint PVC (only for PVC storage type)
            - name: checkpoints
              mountPath: {{ .Values.storage.pvc.basePath }}
            {{- end }}
            # Mount containerd runtime directory for checkpoint operations
            - name: containerd-run
              mountPath: /run/containerd
            # Mount kubelet pods directory for volume discovery
            - name: kubelet-pods
              mountPath: /var/lib/kubelet/pods
              readOnly: true
            # Mount containerd storage for filesystem info
            - name: containerd-storage
              mountPath: /var/lib/containerd
              readOnly: true
            # Mount host proc for CRIU and signal file writing
            - name: host-proc
              mountPath: /host/proc
            # Mount host cgroup for CRIU
            - name: host-cgroup
              mountPath: /sys/fs/cgroup
              readOnly: true
            {{- if and (eq .Values.storage.type "oci") .Values.storage.oci.credentialsSecretRef }}
            # Mount docker config for OCI registry auth
            - name: docker-config
              mountPath: /root/.docker
              readOnly: true
            {{- end }}
          {{- if and (eq .Values.storage.type "s3") .Values.storage.s3.credentialsSecretRef }}
          # Inject every key of the S3 credentials Secret as an environment variable
          envFrom:
            - secretRef:
                name: {{ .Values.storage.s3.credentialsSecretRef }}
          {{- end }}
          resources:
            {{- toYaml .Values.daemonset.resources | nindent 12 }}
      volumes:
        {{- if .Values.seccomp.deploy }}
        # Seccomp profile ConfigMap (used by initContainer)
        - name: seccomp-profiles
          configMap:
            name: {{ include "chrek.fullname" . }}-seccomp
        # Host seccomp directory (for deploying the profile)
        - name: host-seccomp
          hostPath:
            path: /var/lib/kubelet/seccomp
            type: DirectoryOrCreate
        {{- end }}
        {{- if eq .Values.storage.type "pvc" }}
        - name: checkpoints
          persistentVolumeClaim:
            claimName: {{ .Values.storage.pvc.name }}
        {{- end }}
        # Containerd runtime directory (read-write for checkpoint operations)
        - name: containerd-run
          hostPath:
            path: /run/containerd
            type: Directory
        # Kubelet pods directory (for volume discovery)
        - name: kubelet-pods
          hostPath:
            path: /var/lib/kubelet/pods
            type: Directory
        # Containerd storage directory (for filesystem info)
        - name: containerd-storage
          hostPath:
            path: /var/lib/containerd
            type: Directory
        # Host proc (for CRIU and signal files - needs write access)
        - name: host-proc
          hostPath:
            path: /proc
            type: Directory
        # Host cgroup (for CRIU)
        - name: host-cgroup
          hostPath:
            path: /sys/fs/cgroup
            type: Directory
        {{- if and (eq .Values.storage.type "oci") .Values.storage.oci.credentialsSecretRef }}
        - name: docker-config
          secret:
            secretName: {{ .Values.storage.oci.credentialsSecretRef }}
        {{- end }}
      {{- with .Values.daemonset.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
{{- if and (eq .Values.storage.type "pvc") .Values.storage.pvc.create }}
{{- $pvc := .Values.storage.pvc }}
# Checkpoint PVC; rendered only for PVC storage when storage.pvc.create is true.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ $pvc.name }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "chrek.selectorLabels" . | nindent 4 }}
    app.kubernetes.io/component: storage
spec:
  accessModes:
    - {{ $pvc.accessMode }}
  resources:
    requests:
      storage: {{ $pvc.size }}
  {{- with $pvc.storageClass }}
  # Omitted when storage.pvc.storageClass is empty
  storageClassName: {{ . }}
  {{- end }}
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
{{- if .Values.rbac.create }}
{{- $namespaced := .Values.rbac.namespaceRestricted }}
# Agent pod-watch permissions.
# rbac.namespaceRestricted=true  -> Role in the release namespace
# rbac.namespaceRestricted=false -> ClusterRole (cluster-wide)
apiVersion: rbac.authorization.k8s.io/v1
kind: {{ if $namespaced }}Role{{ else }}ClusterRole{{ end }}
metadata:
  name: {{ include "chrek.fullname" . }}-agent
  {{- if $namespaced }}
  namespace: {{ .Release.Namespace }}
  {{- end }}
  labels:
    {{- include "chrek.labels" . | nindent 4 }}
    app.kubernetes.io/component: checkpoint-agent
rules:
  # Watch pods to detect checkpoint-source pods
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
{{- if .Values.rbac.create }}
{{- $namespaced := .Values.rbac.namespaceRestricted }}
{{- $roleKind := "ClusterRole" }}
{{- if $namespaced }}
{{- $roleKind = "Role" }}
{{- end }}
# Binds the agent service account to the agent Role (namespace-restricted)
# or ClusterRole (cluster-wide).
apiVersion: rbac.authorization.k8s.io/v1
kind: {{ $roleKind }}Binding
metadata:
  name: {{ include "chrek.fullname" . }}-agent
  {{- if $namespaced }}
  namespace: {{ .Release.Namespace }}
  {{- end }}
  labels:
    {{- include "chrek.labels" . | nindent 4 }}
    app.kubernetes.io/component: checkpoint-agent
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: {{ $roleKind }}
  name: {{ include "chrek.fullname" . }}-agent
subjects:
  - kind: ServiceAccount
    name: {{ include "chrek.serviceAccountName" . }}
    namespace: {{ .Release.Namespace }}
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
{{- if .Values.seccomp.deploy }}
{{- $configMapName := printf "%s-seccomp" (include "chrek.fullname" .) }}
# Seccomp profile shipped as a ConfigMap; rendered only when seccomp.deploy is true.
# The profile allows every syscall by default and returns an error for the
# three io_uring syscalls listed below.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ $configMapName }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "chrek.labels" . | nindent 4 }}
    app.kubernetes.io/component: seccomp
data:
  block-iouring.json: |
    {
    "defaultAction": "SCMP_ACT_ALLOW",
    "architectures": ["SCMP_ARCH_X86_64", "SCMP_ARCH_X86", "SCMP_ARCH_X32"],
    "syscalls": [
    {
    "names": ["io_uring_setup", "io_uring_enter", "io_uring_register"],
    "action": "SCMP_ACT_ERRNO",
    "comment": "Block io_uring syscalls - CRIU doesn't support io_uring memory mappings"
    }
    ]
    }
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
{{- if .Values.serviceAccount.create }}
{{- $sa := .Values.serviceAccount }}
# Service account for the checkpoint agent; rendered only when serviceAccount.create is true.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "chrek.serviceAccountName" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "chrek.labels" . | nindent 4 }}
    app.kubernetes.io/component: checkpoint-agent
  {{- if $sa.annotations }}
  annotations:
    {{- toYaml $sa.annotations | nindent 4 }}
  {{- end }}
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Chrek - Checkpoint/Restore Infrastructure
# This chart deploys the checkpoint storage (PVC) and CRIU agent (DaemonSet)
# in a single namespace. Install this chart in each namespace where you want
# to enable checkpoint/restore functionality for DynamoGraphDeployments.
# Storage configuration for checkpoints
storage:
  # Storage type: pvc (default), s3, or oci
  type: pvc
  # PVC configuration (when type=pvc)
  pvc:
    # Create a new PVC (set to false if using existing PVC)
    create: true
    # PVC name - must match operator configuration
    name: chrek-pvc
    # PVC size
    size: 100Gi
    # Storage class (leave empty for default)
    storageClass: ""
    # Access mode - ReadWriteMany required for multi-pod access
    accessMode: ReadWriteMany
    # Base path for checkpoints (mounted in pods).
    # Also exported to the agent as CHECKPOINT_DIR, whatever the storage type.
    basePath: /checkpoints
  # S3 configuration (when type=s3)
  s3:
    # S3 URI (e.g., s3://my-bucket/checkpoints)
    uri: ""
    # Credentials are expected via IRSA or mounted secrets.
    # Optional: name of a Secret whose keys are injected into the agent
    # container as environment variables (envFrom). Leave empty to skip.
    credentialsSecretRef: ""
  # OCI configuration (when type=oci)
  oci:
    # OCI URI (e.g., oci://registry.io/repo/checkpoints)
    uri: ""
    # Optional: name of a Secret mounted read-only at /root/.docker in the
    # agent container for registry auth. Leave empty to skip.
    credentialsSecretRef: ""
# Host path for signal files (inter-pod communication)
# NOTE(review): none of the chart templates visible here reads this key, and its
# nesting level (top-level vs. under `storage`) cannot be confirmed from this
# view - verify against whatever consumes it.
signalHostPath: /var/lib/chrek/signals
# Settings for the chrek (checkpoint/restore) agent DaemonSet
daemonset:
  # Agent container image, rendered as "<repository>:<tag>"
  image:
    repository: nvcr.io/nvidian/dynamo-dev/chrek-agent
    tag: latest
    pullPolicy: Always
  # Secrets used to pull the image
  imagePullSecrets:
    - name: ngc-secret
  # CPU / memory requests and limits of the agent container
  resources:
    limits:
      cpu: 2
      memory: 4Gi
    requests:
      cpu: 500m
      memory: 1Gi
  # Only schedule the agent on GPU nodes
  nodeSelector:
    nvidia.com/gpu.present: "true"
  # Extra tolerations, appended after the tolerate-everything entry
  # (`operator: Exists`) that the DaemonSet template always adds
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    - key: dedicated
      operator: Exists
      effect: NoSchedule
  # Runtime class for GPU access; the field is omitted when this is empty
  runtimeClassName: nvidia
  # Additional labels for the agent pods
  podLabels: {}
  # Annotations for the agent pods
  podAnnotations: {}
  # Pod affinity rules
  affinity: {}
  # CRIU settings; each one is exported as an env var only when non-empty
  criu:
    # Directory holding the CUDA plugin (CUDA_PLUGIN_DIR)
    cudaPluginDir: /usr/local/lib/criu
    # Timeout in seconds (CRIU_TIMEOUT); 21600 s = 6 hours
    timeout: "21600"
    # Ghost file size limit in bytes (CRIU_GHOST_LIMIT); 536870912 = 512 MiB,
    # recommended for GPU workloads with large memory allocations
    ghostLimit: "536870912"
  # Path of the container runtime socket (CONTAINERD_SOCKET)
  containerRuntimeSocket: /run/containerd/containerd.sock
# Seccomp profile settings
seccomp:
  # When true, the chart renders the io_uring-blocking profile ConfigMap and an
  # init container that copies it under /var/lib/kubelet/seccomp on each node
  # (required for CRIU)
  deploy: true
# Service account used by the agent pods
serviceAccount:
  # Whether the chart creates the service account
  create: true
  # Explicit name. When empty: the chart's full name if create is true,
  # otherwise "default"
  name: ""
  # Annotations added to the service account (e.g., for IRSA)
  annotations: {}
# RBAC settings
rbac:
  # Whether the chart creates the RBAC resources
  create: true
  # Scope of the agent's pod-watch permissions (namespace-scoped is recommended
  # and required for PVC storage, since PVCs are namespace-scoped):
  # - true (default): Role/RoleBinding; the agent watches pods in the chart's
  #   namespace only
  # - false: ClusterRole/ClusterRoleBinding; the agent watches all pods on
  #   assigned nodes
  namespaceRestricted: true
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment