Unverified Commit d381e6ff authored by Schwinn Saereesitthipitak's avatar Schwinn Saereesitthipitak Committed by GitHub
Browse files

feat(chrek): config refactor, /dev/shm support, and mount-policy rewrite (#5946)

parent b6824ae0
...@@ -10,10 +10,10 @@ import ( ...@@ -10,10 +10,10 @@ import (
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
// OpenDirForCRIU opens a directory and clears the CLOEXEC flag so the FD // OpenPathForCRIU opens a path (directory or file) and clears the CLOEXEC flag
// can be inherited by CRIU child processes. // so the FD can be inherited by CRIU child processes.
// Returns the opened file and its FD. Caller must close the file when done. // Returns the opened file and its FD. Caller must close the file when done.
func OpenDirForCRIU(path string) (*os.File, int32, error) { func OpenPathForCRIU(path string) (*os.File, int32, error) {
dir, err := os.Open(path) dir, err := os.Open(path)
if err != nil { if err != nil {
return nil, 0, fmt.Errorf("failed to open %s: %w", path, err) return nil, 0, fmt.Errorf("failed to open %s: %w", path, err)
...@@ -30,41 +30,6 @@ func OpenDirForCRIU(path string) (*os.File, int32, error) { ...@@ -30,41 +30,6 @@ func OpenDirForCRIU(path string) (*os.File, int32, error) {
return dir, int32(dir.Fd()), nil return dir, int32(dir.Fd()), nil
} }
// DefaultMaskedPaths lists the paths that are masked (made inaccessible)
// inside a container when the checkpoint metadata carries no OCI-derived
// masked paths of its own.
// Every call builds a new slice, so callers may modify the result freely.
func DefaultMaskedPaths() []string {
	masked := make([]string, 0, 15)
	masked = append(masked,
		// procfs entries
		"/proc/bus",
		"/proc/fs",
		"/proc/irq",
		"/proc/sys",
		"/proc/sysrq-trigger",
		"/proc/acpi",
		"/proc/kcore",
		"/proc/keys",
		"/proc/latency_stats",
		"/proc/timer_list",
		"/proc/scsi",
		"/proc/interrupts",
		"/proc/asound",
		// sysfs entries
		"/sys/firmware",
		"/sys/devices/virtual/powercap",
	)
	return masked
}
// DefaultReadonlyPaths lists the paths that are typically mounted read-only
// in containers (the standard OCI read-only set).
// Every call builds a new slice, so callers may modify the result freely.
func DefaultReadonlyPaths() []string {
	readonly := make([]string, 0, 5)
	readonly = append(readonly,
		"/proc/bus",
		"/proc/fs",
		"/proc/irq",
		"/proc/sys",
		"/proc/sysrq-trigger",
	)
	return readonly
}
// CRIUMountPoint represents a parsed mount point from /proc/pid/mountinfo. // CRIUMountPoint represents a parsed mount point from /proc/pid/mountinfo.
type CRIUMountPoint struct { type CRIUMountPoint struct {
MountID string // Mount ID MountID string // Mount ID
......
// metadata.go handles checkpoint metadata for cross-node restore operations.
package common
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"time"
)
const (
// MetadataFilename is the name of the JSON file, inside a checkpoint
// directory, that SaveMetadata writes and LoadMetadata reads. ListCheckpoints
// treats a subdirectory as a checkpoint only if this file exists in it.
MetadataFilename = "metadata.json"
// DescriptorsFilename is the name of the JSON file, inside a checkpoint
// directory, that SaveDescriptors writes and LoadDescriptors reads.
DescriptorsFilename = "descriptors.json"
)
// CheckpointMetadata stores information needed for cross-node restore.
// It is serialized as JSON into MetadataFilename inside the checkpoint
// directory (see SaveMetadata / LoadMetadata); the struct tags define the
// on-disk field names.
type CheckpointMetadata struct {
// Checkpoint identification
CheckpointID string `json:"checkpoint_id"`
// CreatedAt is set to the current UTC time by NewCheckpointMetadata.
CreatedAt time.Time `json:"created_at"`
// Source information
SourceNode string `json:"source_node"`
SourcePodIP string `json:"source_pod_ip,omitempty"` // For cross-node TCP detection
ContainerID string `json:"container_id"`
PodName string `json:"pod_name"`
PodNamespace string `json:"pod_namespace"`
Image string `json:"image"`
// Process information
PID int `json:"pid"`
// Filesystem information
RootfsDiffPath string `json:"rootfs_diff_path,omitempty"` // Path to rootfs-diff.tar
UpperDir string `json:"upper_dir,omitempty"` // Original overlay upperdir
HasRootfsDiff bool `json:"has_rootfs_diff"` // Whether rootfs diff was captured
HasDeletedFiles bool `json:"has_deleted_files"` // Whether deleted files were tracked
// Mount mappings from original container.
// NewCheckpointMetadata initializes this to an empty, non-nil slice.
Mounts []MountMetadata `json:"mounts"`
// OCI spec derived paths (populated from containerd, used at restore)
// These replace hardcoded values with runtime-discovered configuration
MaskedPaths []string `json:"masked_paths,omitempty"` // From OCI spec Linux.MaskedPaths
ReadonlyPaths []string `json:"readonly_paths,omitempty"` // From OCI spec Linux.ReadonlyPaths
BindMountDests []string `json:"bind_mount_dests,omitempty"` // Destinations of bind mounts (for tar exclusions)
// Namespace information.
// NewCheckpointMetadata initializes this to an empty, non-nil slice.
Namespaces []NamespaceMetadata `json:"namespaces"`
// CRIU options used during checkpoint (for restore compatibility)
CRIUOptions CRIUOptionsMetadata `json:"criu_options"`
}
// CRIUOptionsMetadata stores CRIU options used during checkpoint.
// This allows restore to use compatible options.
// Note: In our implementation, most options are hardcoded as always-on for K8s,
// but we store them for compatibility and debugging purposes.
//
// It is embedded in CheckpointMetadata under the "criu_options" JSON key.
// NOTE(review): the field names appear to mirror the CRIU options of the same
// name; confirm against the code that populates this struct.
type CRIUOptionsMetadata struct {
TcpEstablished bool `json:"tcp_established"`
TcpClose bool `json:"tcp_close"`
ShellJob bool `json:"shell_job"`
FileLocks bool `json:"file_locks"`
LeaveRunning bool `json:"leave_running"`
LinkRemap bool `json:"link_remap"`
ExtMasters bool `json:"ext_masters"`
}
// MountMetadata stores information about a mount for remapping during restore.
// One entry is kept per mount in CheckpointMetadata.Mounts. The OCI* fields are
// optional and are omitted from the JSON when empty.
type MountMetadata struct {
ContainerPath string `json:"container_path"` // Path inside container (e.g., /usr/share/nginx/html)
HostPath string `json:"host_path"` // Original host path from mountinfo
OCISource string `json:"oci_source,omitempty"` // Source path from OCI spec (may differ from HostPath)
OCIType string `json:"oci_type,omitempty"` // Mount type from OCI spec (bind, tmpfs, etc.)
OCIOptions []string `json:"oci_options,omitempty"` // Mount options from OCI spec
VolumeType string `json:"volume_type"` // emptyDir, pvc, configMap, secret, hostPath (best-effort)
VolumeName string `json:"volume_name"` // Kubernetes volume name (best-effort from path parsing)
FSType string `json:"fs_type"` // Filesystem type from mountinfo
ReadOnly bool `json:"read_only"` // Whether mount is read-only
}
// NamespaceMetadata stores namespace information.
// One entry is kept per namespace in CheckpointMetadata.Namespaces.
type NamespaceMetadata struct {
Type string `json:"type"` // net, pid, mnt, etc.
Inode uint64 `json:"inode"` // Namespace inode
IsExternal bool `json:"is_external"` // Whether namespace is external (shared)
}
// NewCheckpointMetadata returns metadata for the given checkpoint ID, stamped
// with the current UTC time. Mounts and Namespaces start as empty, non-nil
// slices so they serialize as JSON arrays rather than null.
func NewCheckpointMetadata(checkpointID string) *CheckpointMetadata {
	meta := new(CheckpointMetadata)
	meta.CheckpointID = checkpointID
	meta.CreatedAt = time.Now().UTC()
	meta.Mounts = []MountMetadata{}
	meta.Namespaces = []NamespaceMetadata{}
	return meta
}
// SaveMetadata serializes meta as indented JSON and writes it to
// MetadataFilename inside checkpointDir with mode 0644.
// It returns a wrapped error if marshaling or writing fails.
func SaveMetadata(checkpointDir string, meta *CheckpointMetadata) error {
	encoded, marshalErr := json.MarshalIndent(meta, "", " ")
	if marshalErr != nil {
		return fmt.Errorf("failed to marshal metadata: %w", marshalErr)
	}

	target := filepath.Join(checkpointDir, MetadataFilename)
	writeErr := os.WriteFile(target, encoded, 0644)
	if writeErr != nil {
		return fmt.Errorf("failed to write metadata file: %w", writeErr)
	}
	return nil
}
// LoadMetadata reads MetadataFilename from checkpointDir and decodes it.
// It returns a wrapped error if the file cannot be read or is not valid JSON
// for CheckpointMetadata.
func LoadMetadata(checkpointDir string) (*CheckpointMetadata, error) {
	raw, readErr := os.ReadFile(filepath.Join(checkpointDir, MetadataFilename))
	if readErr != nil {
		return nil, fmt.Errorf("failed to read metadata file: %w", readErr)
	}

	decoded := new(CheckpointMetadata)
	if decodeErr := json.Unmarshal(raw, decoded); decodeErr != nil {
		return nil, fmt.Errorf("failed to unmarshal metadata: %w", decodeErr)
	}
	return decoded, nil
}
// SaveDescriptors encodes descriptors as a compact JSON array and writes it to
// DescriptorsFilename inside checkpointDir with mode 0600.
// It returns a wrapped error if marshaling or writing fails.
func SaveDescriptors(checkpointDir string, descriptors []string) error {
	encoded, marshalErr := json.Marshal(descriptors)
	if marshalErr != nil {
		return fmt.Errorf("failed to marshal descriptors: %w", marshalErr)
	}

	target := filepath.Join(checkpointDir, DescriptorsFilename)
	writeErr := os.WriteFile(target, encoded, 0600)
	if writeErr != nil {
		return fmt.Errorf("failed to write descriptors file: %w", writeErr)
	}
	return nil
}
// LoadDescriptors reads DescriptorsFilename from checkpointDir and decodes the
// JSON array of descriptor strings it contains.
// It returns a wrapped error if the file cannot be read or parsed.
func LoadDescriptors(checkpointDir string) ([]string, error) {
	raw, readErr := os.ReadFile(filepath.Join(checkpointDir, DescriptorsFilename))
	if readErr != nil {
		return nil, fmt.Errorf("failed to read descriptors file: %w", readErr)
	}

	var decoded []string
	if decodeErr := json.Unmarshal(raw, &decoded); decodeErr != nil {
		return nil, fmt.Errorf("failed to unmarshal descriptors: %w", decodeErr)
	}
	return decoded, nil
}
// GetCheckpointDir builds the directory path for checkpointID under baseDir
// using filepath.Join (so the result is cleaned).
func GetCheckpointDir(baseDir, checkpointID string) string {
	dir := filepath.Join(baseDir, checkpointID)
	return dir
}
// ListCheckpoints returns the names of the immediate subdirectories of baseDir
// that contain a MetadataFilename file. Non-directories and directories
// without a stat-able metadata file are skipped.
// The result is nil when nothing qualifies; an error is returned only when
// baseDir itself cannot be read.
func ListCheckpoints(baseDir string) ([]string, error) {
	dirEntries, err := os.ReadDir(baseDir)
	if err != nil {
		return nil, fmt.Errorf("failed to read checkpoint directory: %w", err)
	}

	var ids []string
	for i := range dirEntries {
		candidate := dirEntries[i]
		if !candidate.IsDir() {
			continue
		}
		// A directory counts as a checkpoint only if its metadata file exists.
		marker := filepath.Join(baseDir, candidate.Name(), MetadataFilename)
		if _, statErr := os.Stat(marker); statErr != nil {
			continue
		}
		ids = append(ids, candidate.Name())
	}
	return ids, nil
}
// GetCheckpointInfo loads the metadata of the checkpoint named checkpointID
// stored under baseDir. Errors from LoadMetadata are returned unchanged.
func GetCheckpointInfo(baseDir, checkpointID string) (*CheckpointMetadata, error) {
	return LoadMetadata(GetCheckpointDir(baseDir, checkpointID))
}
// DeleteCheckpoint removes the directory of checkpoint checkpointID under
// baseDir, including everything inside it.
//
// checkpointID must be a single path component. An empty ID, ".", "..", or an
// ID containing a path separator is rejected with an error, because joining
// such a value with baseDir would resolve to baseDir itself or to a location
// outside it, and os.RemoveAll would then delete far more than one checkpoint.
//
// Removing a checkpoint that does not exist is not an error (os.RemoveAll
// semantics).
func DeleteCheckpoint(baseDir, checkpointID string) error {
	if checkpointID == "" || checkpointID == "." || checkpointID == ".." ||
		filepath.Base(checkpointID) != checkpointID {
		return fmt.Errorf("invalid checkpoint ID %q", checkpointID)
	}
	// Same path GetCheckpointDir(baseDir, checkpointID) produces.
	checkpointDir := filepath.Join(baseDir, checkpointID)
	return os.RemoveAll(checkpointDir)
}
// handlers.go provides HTTP handlers for the checkpoint agent server.
package httpApiServer
import (
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"path/filepath"
"time"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
)
// Handlers holds dependencies for HTTP handlers.
// Construct it with NewHandlers.
type Handlers struct {
// cfg supplies the node name reported by /health and the checkpoint spec
// (base path, CRIU settings) used by the checkpoint endpoints.
cfg ServerConfig
// checkpointer performs the actual checkpoint in HandleCheckpoint.
checkpointer *checkpoint.Checkpointer
}
// NewHandlers bundles the server configuration and the checkpointer into a
// Handlers value whose methods serve the HTTP endpoints.
func NewHandlers(cfg ServerConfig, checkpointer *checkpoint.Checkpointer) *Handlers {
	h := new(Handlers)
	h.cfg = cfg
	h.checkpointer = checkpointer
	return h
}
// HandleHealth handles GET /health requests.
// It answers 200 with a HealthResponse carrying status "healthy" and the
// configured node name; any other method gets 405.
func (h *Handlers) HandleHealth(w http.ResponseWriter, r *http.Request) {
	switch r.Method {
	case http.MethodGet:
		writeJSON(w, http.StatusOK, HealthResponse{
			Status:   "healthy",
			NodeName: h.cfg.NodeName,
		})
	default:
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
	}
}
// HandleCheckpoint handles POST /checkpoint requests.
//
// The request body is a JSON CheckpointRequest. container_id is required; if
// checkpoint_id is empty, one of the form "ckpt-<unix-nanoseconds>" is
// generated. The handler blocks until the checkpoint finishes and then writes
// the checkpoint.done marker into the checkpoint directory.
//
// Responses (all JSON CheckpointResponse, except 405 which is plain text):
//   - 400: body is not valid JSON, or container_id is missing
//   - 500: the checkpoint failed, or the done marker could not be written
//   - 200: checkpoint created and marker written
func (h *Handlers) HandleCheckpoint(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	var req CheckpointRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		writeJSON(w, http.StatusBadRequest, CheckpointResponse{
			Success: false,
			Error:   fmt.Sprintf("Invalid request body: %v", err),
		})
		return
	}

	if req.ContainerID == "" {
		writeJSON(w, http.StatusBadRequest, CheckpointResponse{
			Success: false,
			Error:   "container_id is required",
		})
		return
	}

	if req.CheckpointID == "" {
		req.CheckpointID = fmt.Sprintf("ckpt-%d", time.Now().UnixNano())
	}

	// Build checkpoint params
	params := checkpoint.CheckpointRequest{
		ContainerID:   req.ContainerID,
		ContainerName: req.ContainerName,
		CheckpointID:  req.CheckpointID,
		CheckpointDir: h.cfg.CheckpointSpec.BasePath,
		NodeName:      h.cfg.NodeName,
		PodName:       req.PodName,
		PodNamespace:  req.PodNamespace,
	}

	// Copy checkpoint spec and disable CUDA if requested.
	// NOTE(review): this is a shallow copy. If CheckpointSpec.CRIU is a pointer,
	// clearing LibDir below would also modify the shared server config; confirm
	// that CRIU is a value field.
	checkpointSpec := *h.cfg.CheckpointSpec
	if req.DisableCUDA {
		checkpointSpec.CRIU.LibDir = ""
	}

	ctx := r.Context()
	result, err := h.checkpointer.Checkpoint(ctx, params, &checkpointSpec)
	if err != nil {
		log.Printf("Checkpoint failed: %v", err)
		writeJSON(w, http.StatusInternalServerError, CheckpointResponse{
			Success: false,
			Error:   err.Error(),
		})
		return
	}

	// Write checkpoint.done marker so restore-entrypoint can detect this checkpoint.
	// The restore side parses the marker as JSON and only proceeds when it
	// contains "success": true, so the marker must be a JSON object; a bare
	// timestamp string would be rejected as unparseable. The completion time is
	// kept as an extra field, which JSON decoding into a narrower struct ignores.
	checkpointDonePath := filepath.Join(result.CheckpointDir, checkpoint.CheckpointDoneFilename)
	doneMarker, err := json.Marshal(struct {
		Success     bool   `json:"success"`
		CompletedAt string `json:"completed_at"`
	}{
		Success:     true,
		CompletedAt: time.Now().Format(time.RFC3339),
	})
	if err == nil {
		err = os.WriteFile(checkpointDonePath, doneMarker, 0644)
	}
	if err != nil {
		log.Printf("Failed to write checkpoint.done marker: %v", err)
		writeJSON(w, http.StatusInternalServerError, CheckpointResponse{
			Success: false,
			Error:   fmt.Sprintf("Checkpoint succeeded but failed to write done marker: %v", err),
		})
		return
	}
	log.Printf("Wrote checkpoint.done marker: %s", checkpointDonePath)

	log.Printf("Checkpoint successful: %s", result.CheckpointID)
	writeJSON(w, http.StatusOK, CheckpointResponse{
		Success:      true,
		CheckpointID: result.CheckpointID,
		Message:      fmt.Sprintf("Checkpoint created successfully at %s", result.CheckpointDir),
	})
}
// HandleListCheckpoints handles GET /checkpoints requests.
//
// It lists the checkpoint IDs under the configured base path, reads each
// checkpoint's manifest and returns a ListCheckpointsResponse. Checkpoints
// whose manifest cannot be read are skipped and the reason is logged.
// The "checkpoints" field is always a JSON array, "[]" when there are none.
//
// Responses: 405 for non-GET methods, 500 with {"error": ...} when the base
// path cannot be listed, 200 otherwise.
func (h *Handlers) HandleListCheckpoints(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	checkpointIDs, err := checkpoint.ListCheckpoints(h.cfg.CheckpointSpec.BasePath)
	if err != nil {
		writeJSON(w, http.StatusInternalServerError, map[string]string{
			"error": err.Error(),
		})
		return
	}

	// Start from an empty, non-nil slice: a nil slice would be encoded as JSON
	// null instead of an empty array.
	checkpoints := make([]CheckpointInfo, 0, len(checkpointIDs))
	for _, id := range checkpointIDs {
		meta, err := checkpoint.ReadCheckpointManifest(filepath.Join(h.cfg.CheckpointSpec.BasePath, id))
		if err != nil {
			// Best-effort listing: one unreadable manifest must not fail the
			// whole request, but leave a trace of why the entry is missing.
			log.Printf("Skipping checkpoint %s: failed to read manifest: %v", id, err)
			continue
		}
		checkpoints = append(checkpoints, CheckpointInfo{
			ID:           meta.CheckpointID,
			CreatedAt:    meta.CreatedAt,
			SourceNode:   meta.K8s.SourceNode,
			ContainerID:  meta.K8s.ContainerID,
			PodName:      meta.K8s.PodName,
			PodNamespace: meta.K8s.PodNamespace,
		})
	}

	writeJSON(w, http.StatusOK, ListCheckpointsResponse{
		Checkpoints: checkpoints,
	})
}
// writeJSON writes data as a JSON response with the given HTTP status code.
//
// The Content-Type header is set to application/json before the status line is
// sent. Because the header and status are already written by the time encoding
// happens, an encoding or write failure cannot be reported to the client; it is
// logged instead of being silently dropped.
func writeJSON(w http.ResponseWriter, status int, data interface{}) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	if err := json.NewEncoder(w).Encode(data); err != nil {
		log.Printf("Failed to write JSON response: %v", err)
	}
}
// middleware.go provides HTTP middleware for the server.
package httpApiServer
import (
"log"
"net/http"
"time"
)
// LoggingMiddleware returns a handler that logs the method and URL path of
// every request before delegating to next, and logs them again together with
// the elapsed time once next has returned.
func LoggingMiddleware(next http.Handler) http.Handler {
	logged := func(w http.ResponseWriter, r *http.Request) {
		begin := time.Now()
		log.Printf("Started %s %s", r.Method, r.URL.Path)

		next.ServeHTTP(w, r)

		elapsed := time.Since(begin)
		log.Printf("Completed %s %s in %v", r.Method, r.URL.Path, elapsed)
	}
	return http.HandlerFunc(logged)
}
// server.go provides the HTTP server for the checkpoint agent.
package httpApiServer
import (
"context"
"log"
"net/http"
"time"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
)
// ServerConfig holds the configuration for the HTTP API server.
type ServerConfig struct {
// ListenAddr is the address the HTTP server binds to (http.Server.Addr).
ListenAddr string
// NodeName is reported by /health and passed along with checkpoint requests.
NodeName string
// CheckpointSpec supplies the checkpoint base path and CRIU settings.
// NewServer and the handlers dereference it without a nil check, so it
// must be non-nil.
CheckpointSpec *checkpoint.CheckpointSpec
}
// Server is the HTTP API server for checkpoint operations.
// Construct it with NewServer, run it with Start and stop it with Shutdown.
type Server struct {
// cfg is the configuration the server was created with.
cfg ServerConfig
// handlers serves the /health, /checkpoint and /checkpoints routes.
handlers *Handlers
// httpServer is the underlying net/http server.
httpServer *http.Server
}
// NewServer wires the /health, /checkpoint and /checkpoints routes behind the
// logging middleware and returns a Server ready to be started.
//
// The write timeout is the CRIU timeout from the checkpoint spec plus 60
// seconds, with a floor of 300 seconds, because /checkpoint does not respond
// until the dump has completed.
func NewServer(cfg ServerConfig, checkpointer *checkpoint.Checkpointer) *Server {
	const (
		writeBuffer     = 60 * time.Second
		minWriteTimeout = 300 * time.Second
	)

	h := NewHandlers(cfg, checkpointer)

	// Route table.
	routes := http.NewServeMux()
	routes.HandleFunc("/health", h.HandleHealth)
	routes.HandleFunc("/checkpoint", h.HandleCheckpoint)
	routes.HandleFunc("/checkpoints", h.HandleListCheckpoints)

	// The response to /checkpoint is only written after the dump, so the write
	// timeout has to outlast the CRIU timeout plus pre/post work.
	responseDeadline := time.Duration(cfg.CheckpointSpec.CRIU.Timeout)*time.Second + writeBuffer
	if responseDeadline < minWriteTimeout {
		responseDeadline = minWriteTimeout
	}

	srv := &Server{cfg: cfg, handlers: h}
	srv.httpServer = &http.Server{
		Addr:         cfg.ListenAddr,
		Handler:      LoggingMiddleware(routes),
		ReadTimeout:  30 * time.Second,
		WriteTimeout: responseDeadline,
		IdleTimeout:  120 * time.Second,
	}
	return srv
}
// Start logs the listen address and serves HTTP until the server stops.
// It blocks for the lifetime of the server and returns whatever
// ListenAndServe returns.
func (s *Server) Start() error {
	log.Printf("HTTP API server listening on %s", s.cfg.ListenAddr)
	serveErr := s.httpServer.ListenAndServe()
	return serveErr
}
// Shutdown logs the shutdown and delegates to http.Server.Shutdown, passing
// ctx through to bound how long the graceful shutdown may take.
func (s *Server) Shutdown(ctx context.Context) error {
	log.Println("Shutting down HTTP server...")
	stopErr := s.httpServer.Shutdown(ctx)
	return stopErr
}
// Addr reports the listen address the server was configured with.
func (s *Server) Addr() string {
	configured := s.cfg.ListenAddr
	return configured
}
// Package httpApiServer provides HTTP server functionality for the checkpoint agent.
package httpApiServer
import "time"
// CheckpointRequest is the request body for checkpoint operations
// (POST /checkpoint).
type CheckpointRequest struct {
// ContainerID is required; the handler answers 400 when it is empty.
ContainerID string `json:"container_id"`
ContainerName string `json:"container_name,omitempty"` // K8s container name (for volume type lookup)
// CheckpointID is optional; when empty the handler generates
// "ckpt-<unix-nanoseconds>".
CheckpointID string `json:"checkpoint_id"`
PodName string `json:"pod_name,omitempty"`
PodNamespace string `json:"pod_namespace,omitempty"`
// DisableCUDA makes the handler clear CRIU.LibDir in its copy of the
// checkpoint spec before checkpointing.
DisableCUDA bool `json:"disable_cuda,omitempty"` // Disable CUDA plugin for non-GPU workloads
}
// CheckpointResponse is the response for checkpoint operations.
// On success, Success is true and CheckpointID and Message are set; on failure,
// Success is false and Error describes what went wrong.
type CheckpointResponse struct {
Success bool `json:"success"`
CheckpointID string `json:"checkpoint_id,omitempty"`
Message string `json:"message,omitempty"`
Error string `json:"error,omitempty"`
}
// CheckpointInfo represents information about a checkpoint.
// It is one entry of ListCheckpointsResponse; the list handler fills it from
// the checkpoint's manifest.
type CheckpointInfo struct {
ID string `json:"id"`
CreatedAt time.Time `json:"created_at"`
SourceNode string `json:"source_node"`
ContainerID string `json:"container_id"`
PodName string `json:"pod_name"`
PodNamespace string `json:"pod_namespace"`
}
// ListCheckpointsResponse is the response for list checkpoints
// (GET /checkpoints).
type ListCheckpointsResponse struct {
// Checkpoints holds one entry per checkpoint whose manifest could be read.
Checkpoints []CheckpointInfo `json:"checkpoints"`
}
// HealthResponse is the response for health check (GET /health).
type HealthResponse struct {
// Status is "healthy" whenever the handler answers.
Status string `json:"status"`
// NodeName is the node name from the server configuration.
NodeName string `json:"node_name"`
}
// config.go defines the RestoreRequest struct for CRIU restore operations.
// CRIU options come from the saved CheckpointManifest, not from this request.
//
// The restore-entrypoint runs in placeholder containers which do NOT mount the
// ConfigMap. Static defaults are hardcoded here; per-pod dynamic values come
// from environment variables injected by the operator.
package restore
import (
"context"
"encoding/json"
"fmt"
"os"
"strings"
"time"
"github.com/sirupsen/logrus"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
)
const (
// RestoreLogFilename is the CRIU restore log filename.
RestoreLogFilename = "restore.log"
// CRIULogDir is the directory where CRIU restore logs are copied for debugging.
CRIULogDir = "/checkpoints/restore-logs"
// RestoreTriggerPath is the default path to the trigger file for trigger-based restore.
// NewRestoreRequest uses it as RestoreRequest.RestoreTrigger; ShouldRestore
// reads the file's content as a checkpoint directory path.
RestoreTriggerPath = "/tmp/restore-trigger"
)
// RestoreRequest holds runtime request inputs for the restore entrypoint.
// CRIU options are NOT stored here - they come from the saved CheckpointManifest.
// Use NewRestoreRequest to build one from the process environment.
type RestoreRequest struct {
// === Per-pod dynamic values (from operator-injected env vars) ===
// CheckpointPath is the base directory containing checkpoints.
// Read from DYN_CHECKPOINT_PATH.
CheckpointPath string
// CheckpointHash is the ID/hash of the checkpoint to restore.
// Read from DYN_CHECKPOINT_HASH.
CheckpointHash string
// CheckpointLocation is the full resolved path to the checkpoint directory.
// Read from DYN_CHECKPOINT_LOCATION; if that is unset it is derived as
// CheckpointPath + "/" + CheckpointHash when both are non-empty.
CheckpointLocation string
// SkipWaitForCheckpoint controls the entrypoint behavior.
// True when SKIP_WAIT_FOR_CHECKPOINT is exactly "1".
SkipWaitForCheckpoint bool
// ColdStartArgs is the command+args to exec if no checkpoint is available.
ColdStartArgs []string
// Debug enables debug logging. True when DEBUG is exactly "1".
Debug bool
// === Static defaults (hardcoded) ===
// NOTE: of the fields below only RestoreTrigger is a hardcoded default
// (RestoreTriggerPath). RestoreMarkerFilePath is read from the required env
// var DYN_RESTORE_MARKER_FILE, and NewRestoreRequest leaves WaitTimeout at
// zero.
// RestoreMarkerFilePath is where restore-entrypoint writes a marker before CRIU restore.
RestoreMarkerFilePath string
// RestoreTrigger is the path to the trigger file that signals restore should start.
RestoreTrigger string
// WaitTimeout is the maximum time to wait for a checkpoint.
// Zero means wait indefinitely.
WaitTimeout time.Duration
}
// ConfigError reports an invalid or missing configuration value.
type ConfigError struct {
	// Field names the offending setting (for example an environment variable).
	Field string
	// Message explains what is wrong with it.
	Message string
}

// Error implements the error interface, formatting the error as
// "config error: <Field>: <Message>".
func (e *ConfigError) Error() string {
	field, detail := e.Field, e.Message
	return fmt.Sprintf("config error: %s: %s", field, detail)
}
// NewRestoreRequest builds a RestoreRequest from the process environment.
//
// args becomes ColdStartArgs and RestoreTrigger is set to RestoreTriggerPath.
// Environment variables read:
//   - DYN_RESTORE_MARKER_FILE (required): RestoreMarkerFilePath
//   - DYN_CHECKPOINT_PATH, DYN_CHECKPOINT_HASH: base directory and checkpoint ID
//   - DYN_CHECKPOINT_LOCATION: full checkpoint path; when unset it is derived
//     as "<path>/<hash>" if both of those are non-empty
//   - SKIP_WAIT_FOR_CHECKPOINT, DEBUG: enabled only when exactly "1"
//
// It returns a *ConfigError when DYN_RESTORE_MARKER_FILE is empty or unset.
func NewRestoreRequest(args []string) (*RestoreRequest, error) {
	markerFile := os.Getenv("DYN_RESTORE_MARKER_FILE")
	if markerFile == "" {
		return nil, &ConfigError{
			Field:   "DYN_RESTORE_MARKER_FILE",
			Message: "must be set",
		}
	}

	basePath := os.Getenv("DYN_CHECKPOINT_PATH")
	hash := os.Getenv("DYN_CHECKPOINT_HASH")

	// An explicit location wins; otherwise fall back to <path>/<hash>.
	location := os.Getenv("DYN_CHECKPOINT_LOCATION")
	if location == "" && basePath != "" && hash != "" {
		location = basePath + "/" + hash
	}

	req := &RestoreRequest{
		CheckpointPath:        basePath,
		CheckpointHash:        hash,
		CheckpointLocation:    location,
		SkipWaitForCheckpoint: os.Getenv("SKIP_WAIT_FOR_CHECKPOINT") == "1",
		ColdStartArgs:         args,
		Debug:                 os.Getenv("DEBUG") == "1",
		RestoreMarkerFilePath: markerFile,
		RestoreTrigger:        RestoreTriggerPath,
	}
	return req, nil
}
// checkpointDoneMarker is the JSON document that checkpointDoneSucceeded
// expects to find in the checkpoint.done marker file.
type checkpointDoneMarker struct {
// Success must be true for the checkpoint to be considered restorable.
Success bool `json:"success"`
// Error optionally carries the failure reason; it is logged when Success is false.
Error string `json:"error,omitempty"`
}
// checkpointDoneSucceeded reports whether the marker file at donePath can be
// read, parses as a checkpointDoneMarker, and has success set to true.
// Every other outcome is logged as a warning and yields false.
func checkpointDoneSucceeded(donePath string, log *logrus.Entry) bool {
	raw, readErr := os.ReadFile(donePath)
	if readErr != nil {
		log.WithError(readErr).WithField("path", donePath).Warn("Failed to read checkpoint.done marker")
		return false
	}

	marker := checkpointDoneMarker{}
	if parseErr := json.Unmarshal(raw, &marker); parseErr != nil {
		log.WithError(parseErr).WithField("path", donePath).Warn("Failed to parse checkpoint.done marker")
		return false
	}

	if marker.Success {
		return true
	}

	// The marker records a failed checkpoint; include its reason if present.
	details := logrus.Fields{"path": donePath}
	if marker.Error != "" {
		details["error"] = marker.Error
	}
	log.WithFields(details).Warn("checkpoint.done marker reports failed checkpoint")
	return false
}
// ShouldRestore checks if a restore should be performed.
// Returns the checkpoint path and true if restore should proceed.
//
// Two sources are consulted, in order:
//  1. cfg.CheckpointLocation, if set: restore proceeds when its checkpoint.done
//     marker exists and reports success.
//  2. cfg.RestoreTrigger, if set: the file's trimmed content is taken as a
//     checkpoint directory, accepted under the same marker condition.
//
// If neither yields a successfully completed checkpoint it returns ("", false).
func ShouldRestore(cfg *RestoreRequest, log *logrus.Entry) (string, bool) {
	// Method 1: Checkpoint location is set and checkpoint is fully complete
	if cfg.CheckpointLocation != "" {
		donePath := cfg.CheckpointLocation + "/" + checkpoint.CheckpointDoneFilename
		if _, err := os.Stat(donePath); err == nil {
			if checkpointDoneSucceeded(donePath, log) {
				log.WithField("path", cfg.CheckpointLocation).Info("Checkpoint found (checkpoint.done success=true)")
				return cfg.CheckpointLocation, true
			}
			// The marker exists but is unreadable or reports failure;
			// checkpointDoneSucceeded has already logged the reason. Do not
			// also claim below that the marker is missing.
		} else {
			// Marker could not be found: check for the manifest and warn that
			// the checkpoint may still be in progress.
			manifestPath := cfg.CheckpointLocation + "/" + checkpoint.CheckpointManifestFilename
			if _, err := os.Stat(manifestPath); err == nil {
				log.WithFields(logrus.Fields{
					"path":    cfg.CheckpointLocation,
					"warning": "checkpoint.done marker not found, checkpoint may be incomplete",
				}).Warn("Checkpoint manifest found but checkpoint.done missing - checkpoint may still be in progress")
			}
		}
	}

	// Method 2: Restore trigger file exists with checkpoint path
	if cfg.RestoreTrigger != "" {
		data, err := os.ReadFile(cfg.RestoreTrigger)
		if err == nil {
			checkpointPath := strings.TrimSpace(string(data))
			if checkpointPath != "" {
				donePath := checkpointPath + "/" + checkpoint.CheckpointDoneFilename
				if _, err := os.Stat(donePath); err == nil {
					if checkpointDoneSucceeded(donePath, log) {
						log.WithField("path", checkpointPath).Info("Restore triggered via file (checkpoint.done success=true)")
						return checkpointPath, true
					}
				}
			}
		}
	}

	return "", false
}
// WaitForCheckpoint polls ShouldRestore once per second until a checkpoint is
// ready and returns its path.
//
// It returns ctx.Err() if ctx is cancelled first. When cfg.WaitTimeout is
// positive, it returns a timeout error once that much time has elapsed without
// a checkpoint; a zero WaitTimeout means there is no deadline other than ctx.
// A progress message is logged roughly every 30 seconds while waiting.
func WaitForCheckpoint(ctx context.Context, cfg *RestoreRequest, log *logrus.Entry) (string, error) {
	const (
		pollInterval     = time.Second
		progressInterval = 30 * time.Second
	)

	if cfg.WaitTimeout <= 0 {
		log.Info("Waiting for checkpoint indefinitely")
	} else {
		log.WithField("timeout", cfg.WaitTimeout).Info("Waiting for checkpoint")
	}

	began := time.Now()
	poll := time.NewTicker(pollInterval)
	defer poll.Stop()
	lastProgress := time.Now()

	for {
		select {
		case <-ctx.Done():
			return "", ctx.Err()

		case <-poll.C:
			found, ready := ShouldRestore(cfg, log)
			if ready {
				return found, nil
			}

			// Periodic progress message.
			if time.Since(lastProgress) >= progressInterval {
				waited := time.Since(began)
				log.WithField("elapsed", waited).Info("Still waiting for checkpoint...")
				lastProgress = time.Now()
			}

			// The deadline applies only when a positive WaitTimeout is set.
			if cfg.WaitTimeout > 0 && time.Since(began) >= cfg.WaitTimeout {
				return "", fmt.Errorf("timed out waiting for checkpoint after %s", cfg.WaitTimeout)
			}
		}
	}
}
...@@ -12,28 +12,42 @@ import ( ...@@ -12,28 +12,42 @@ import (
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common" "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
) )
// CRIURestoreConfig holds configuration for CRIU restore operations. // CRIURestorePlan holds configuration for CRIU restore operations.
// Most options are always-on with safe defaults for K8s environments. // Most fields come from the saved CheckpointManifest.CRIUDump.CRIU settings.
type CRIURestoreConfig struct { type CRIURestorePlan struct {
ImageDirFD int32 // File descriptors
RootPath string ImageDirFD int32
LogLevel int32 WorkDirFD int32
LogFile string NetNsFD int32
WorkDirFD int32
NetNsFD int32 // Paths
RootPath string
LogFile string
// Options from CheckpointManifest.CRIUDump.CRIU.
LogLevel int32
Timeout uint32 // CRIU timeout in seconds (0 = no timeout, required for CUDA)
ShellJob bool // Allow session leaders (containers are often session leaders)
TcpClose bool // Close TCP connections (pod IPs change on restore)
FileLocks bool // Allow file locks
ExtUnixSk bool // Allow external Unix sockets
LinkRemap bool // Handle deleted-but-open files via CRIU link remap
ManageCgroupsMode string // Cgroup handling mode: "ignore" lets K8s manage cgroups
// External mount mappings (from CheckpointManifest.CRIUDump.ExtMnt).
ExtMountMaps []*criurpc.ExtMountMap ExtMountMaps []*criurpc.ExtMountMap
} }
// OpenImageDir opens a checkpoint directory and clears CLOEXEC for CRIU. // OpenImageDir opens a checkpoint directory and clears CLOEXEC for CRIU.
// Returns the opened file and its FD. Caller must close the file when done. // Returns the opened file and its FD. Caller must close the file when done.
func OpenImageDir(checkpointPath string) (*os.File, int32, error) { func OpenImageDir(checkpointPath string) (*os.File, int32, error) {
return common.OpenDirForCRIU(checkpointPath) return common.OpenPathForCRIU(checkpointPath)
} }
// OpenNetworkNamespace opens the target network namespace for restore. // OpenNetworkNamespace opens the target network namespace for restore.
// Returns the opened file and its FD. Caller must close the file when done. // Returns the opened file and its FD. Caller must close the file when done.
func OpenNetworkNamespace(nsPath string) (*os.File, int32, error) { func OpenNetworkNamespace(nsPath string) (*os.File, int32, error) {
return common.OpenDirForCRIU(nsPath) return common.OpenPathForCRIU(nsPath)
} }
// OpenWorkDir opens a work directory for CRIU and clears CLOEXEC. // OpenWorkDir opens a work directory for CRIU and clears CLOEXEC.
...@@ -65,62 +79,87 @@ func OpenWorkDir(workDir string, log *logrus.Entry) (*os.File, int32) { ...@@ -65,62 +79,87 @@ func OpenWorkDir(workDir string, log *logrus.Entry) (*os.File, int32) {
return workDirFile, int32(workDirFile.Fd()) return workDirFile, int32(workDirFile.Fd())
} }
// BuildRestoreCRIUOpts creates CRIU options for restore from a config struct. // BuildCRIURestoreOptions creates CRIU options for restore from a runtime plan.
// //
// Always-on options for K8s: // Options from CheckpointManifest.CRIUDump.CRIU (saved at checkpoint time):
// - ShellJob: containers are often session leaders // - ShellJob, TcpClose, FileLocks, ExtUnixSk, LinkRemap, ManageCgroupsMode
// - TcpClose: pod IPs change on restore/migration //
// - FileLocks: applications use file locks // Hardcoded restore-specific options:
// - ExtUnixSk: containers have external Unix sockets // - RstSibling: restore in detached mode
// - ManageCgroups (IGNORE): let K8s manage cgroups // - MntnsCompatMode: cross-container restore
func BuildRestoreCRIUOpts(cfg CRIURestoreConfig) *criurpc.CriuOpts { // - EvasiveDevices, ForceIrmap: device/inode handling
cgMode := criurpc.CriuCgMode_IGNORE func BuildCRIURestoreOptions(plan CRIURestorePlan) *criurpc.CriuOpts {
// Map cgroup management mode from plan.
var cgMode criurpc.CriuCgMode
switch plan.ManageCgroupsMode {
case "soft":
cgMode = criurpc.CriuCgMode_SOFT
case "full":
cgMode = criurpc.CriuCgMode_FULL
case "strict":
cgMode = criurpc.CriuCgMode_STRICT
case "ignore", "":
cgMode = criurpc.CriuCgMode_IGNORE
default:
cgMode = criurpc.CriuCgMode_IGNORE
}
criuOpts := &criurpc.CriuOpts{ criuOpts := &criurpc.CriuOpts{
ImagesDirFd: proto.Int32(cfg.ImageDirFD), ImagesDirFd: proto.Int32(plan.ImageDirFD),
LogLevel: proto.Int32(cfg.LogLevel), LogLevel: proto.Int32(plan.LogLevel),
LogFile: proto.String(cfg.LogFile), LogFile: proto.String(plan.LogFile),
// Root filesystem - use current container's root // Root filesystem - use current container's root
Root: proto.String(cfg.RootPath), Root: proto.String(plan.RootPath),
// Restore in detached mode - process runs in background // Restore in detached mode - process runs in background (restore-specific)
RstSibling: proto.Bool(true), RstSibling: proto.Bool(true),
// Mount namespace compatibility mode for cross-container restore // Mount namespace mode:
MntnsCompatMode: proto.Bool(true), // - MntnsCompatMode=false (default): Uses mount-v2 with MOVE_MOUNT_SET_GROUP (kernel 5.15+)
// This is preferred as it doesn't create temp dirs in /tmp
// Always-on for K8s environments // - MntnsCompatMode=true: Uses compat mode which creates /tmp/cr-tmpfs.XXX
ShellJob: proto.Bool(true), // This can cause "Device or resource busy" errors on cleanup
TcpClose: proto.Bool(true), // We explicitly set to false to use mount-v2 (requires kernel 5.15+)
FileLocks: proto.Bool(true), MntnsCompatMode: proto.Bool(false),
ExtUnixSk: proto.Bool(true),
// Options from saved CheckpointManifest.CRIUDump.CRIU.
// Cgroup management - ignore to avoid conflicts ShellJob: proto.Bool(plan.ShellJob),
TcpClose: proto.Bool(plan.TcpClose),
FileLocks: proto.Bool(plan.FileLocks),
ExtUnixSk: proto.Bool(plan.ExtUnixSk),
LinkRemap: proto.Bool(plan.LinkRemap),
// Cgroup management from saved settings.
ManageCgroups: proto.Bool(true), ManageCgroups: proto.Bool(true),
ManageCgroupsMode: &cgMode, ManageCgroupsMode: &cgMode,
// Device and inode handling // Device and inode handling (restore-specific)
EvasiveDevices: proto.Bool(true), EvasiveDevices: proto.Bool(true),
ForceIrmap: proto.Bool(true), ForceIrmap: proto.Bool(true),
// External mount mappings // External mount mappings
ExtMnt: cfg.ExtMountMaps, ExtMnt: plan.ExtMountMaps,
} }
// Add network namespace inheritance if provided // Add network namespace inheritance if provided
if cfg.NetNsFD >= 0 { if plan.NetNsFD >= 0 {
criuOpts.InheritFd = []*criurpc.InheritFd{ criuOpts.InheritFd = []*criurpc.InheritFd{
{ {
Key: proto.String("extNetNs"), Key: proto.String("extNetNs"),
Fd: proto.Int32(cfg.NetNsFD), Fd: proto.Int32(plan.NetNsFD),
}, },
} }
} }
// Add work directory if specified // Add work directory if specified
if cfg.WorkDirFD >= 0 { if plan.WorkDirFD >= 0 {
criuOpts.WorkDirFd = proto.Int32(cfg.WorkDirFD) criuOpts.WorkDirFd = proto.Int32(plan.WorkDirFD)
}
// Add timeout if specified (required for CUDA restores)
if plan.Timeout > 0 {
criuOpts.Timeout = proto.Uint32(plan.Timeout)
} }
return criuOpts return criuOpts
......
...@@ -8,19 +8,14 @@ import ( ...@@ -8,19 +8,14 @@ import (
"path/filepath" "path/filepath"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
)
const ( "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
// RootfsDiffFilename is the name of the rootfs diff tar file
RootfsDiffFilename = "rootfs-diff.tar"
// DeletedFilesFilename is the name of the deleted files JSON
DeletedFilesFilename = "deleted-files.json"
) )
// ApplyRootfsDiff extracts the rootfs-diff.tar from the checkpoint to the target root. // ApplyRootfsDiff extracts the rootfs-diff.tar from the checkpoint to the target root.
// This restores filesystem changes that were made in the original container. // This restores filesystem changes that were made in the original container.
func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error { func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error {
rootfsDiffPath := filepath.Join(checkpointPath, RootfsDiffFilename) rootfsDiffPath := filepath.Join(checkpointPath, checkpoint.RootfsDiffFilename)
// Check if rootfs-diff.tar exists // Check if rootfs-diff.tar exists
if _, err := os.Stat(rootfsDiffPath); os.IsNotExist(err) { if _, err := os.Stat(rootfsDiffPath); os.IsNotExist(err) {
...@@ -30,15 +25,10 @@ func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error ...@@ -30,15 +25,10 @@ func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error
log.WithField("path", rootfsDiffPath).Info("Applying rootfs diff") log.WithField("path", rootfsDiffPath).Info("Applying rootfs diff")
// Build tar command with options to handle conflicts: // Exclusions are already applied at checkpoint time (bind mounts, system dirs, etc.)
// --keep-old-files: Don't overwrite existing files (may already be mounted) // so we just extract with --keep-old-files to avoid overwriting existing files.
// Exclude paths that are typically mounted read-only by the container runtime
cmd := exec.Command("tar", cmd := exec.Command("tar",
"--keep-old-files", "--keep-old-files",
"--exclude=./run/secrets",
"--exclude=./etc/resolv.conf",
"--exclude=./etc/hostname",
"--exclude=./etc/hosts",
"-C", targetRoot, "-C", targetRoot,
"-xf", rootfsDiffPath, "-xf", rootfsDiffPath,
) )
...@@ -61,7 +51,7 @@ func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error ...@@ -61,7 +51,7 @@ func ApplyRootfsDiff(checkpointPath, targetRoot string, log *logrus.Entry) error
// ApplyDeletedFiles removes files that were deleted in the original container. // ApplyDeletedFiles removes files that were deleted in the original container.
// These are tracked via overlay whiteout markers (.wh.<filename>). // These are tracked via overlay whiteout markers (.wh.<filename>).
func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) error { func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) error {
deletedFilesPath := filepath.Join(checkpointPath, DeletedFilesFilename) deletedFilesPath := filepath.Join(checkpointPath, checkpoint.DeletedFilesFilename)
// Check if deleted-files.json exists // Check if deleted-files.json exists
data, err := os.ReadFile(deletedFilesPath) data, err := os.ReadFile(deletedFilesPath)
...@@ -109,8 +99,5 @@ func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) err ...@@ -109,8 +99,5 @@ func ApplyDeletedFiles(checkpointPath, targetRoot string, log *logrus.Entry) err
func CheckpointFilesExist(checkpointPath string) bool { func CheckpointFilesExist(checkpointPath string) bool {
// Check for CRIU image files (core-*.img is always present) // Check for CRIU image files (core-*.img is always present)
matches, err := filepath.Glob(filepath.Join(checkpointPath, "core-*.img")) matches, err := filepath.Glob(filepath.Join(checkpointPath, "core-*.img"))
if err != nil || len(matches) == 0 { return err == nil && len(matches) > 0
return false
}
return true
} }
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -63,7 +63,6 @@ See `values.yaml` for all configuration options. ...@@ -63,7 +63,6 @@ See `values.yaml` for all configuration options.
| `storage.pvc.name` | PVC name (must match operator config) | `chrek-pvc` | | `storage.pvc.name` | PVC name (must match operator config) | `chrek-pvc` |
| `storage.pvc.size` | PVC size | `100Gi` | | `storage.pvc.size` | PVC size | `100Gi` |
| `storage.pvc.storageClass` | Storage class name | `""` (default) | | `storage.pvc.storageClass` | Storage class name | `""` (default) |
| `storage.signalHostPath` | Host path for signal files | `/var/lib/chrek/signals` |
| `daemonset.image.repository` | DaemonSet image repository | `nvidia/chrek-agent` | | `daemonset.image.repository` | DaemonSet image repository | `nvidia/chrek-agent` |
| `daemonset.nodeSelector` | Node selector for GPU nodes | `nvidia.com/gpu.present: "true"` | | `daemonset.nodeSelector` | Node selector for GPU nodes | `nvidia.com/gpu.present: "true"` |
| `daemonset.runtimeClassName` | Runtime class for GPU access | `nvidia` | | `daemonset.runtimeClassName` | Runtime class for GPU access | `nvidia` |
...@@ -175,4 +174,3 @@ Ensure your storage class supports `ReadWriteMany` access mode for multi-node de ...@@ -175,4 +174,3 @@ Ensure your storage class supports `ReadWriteMany` access mode for multi-node de
## License ## License
Apache License 2.0 Apache License 2.0
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment