Unverified commit d381e6ff, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

feat(chrek): config refactor, /dev/shm support, and mount-policy rewrite (#5946)

parent b6824ae0
......@@ -48,9 +48,6 @@ storage:
# OCI URI (e.g., oci://registry.io/repo/checkpoints)
uri: ""
# Host path for signal files (inter-pod communication)
signalHostPath: /var/lib/chrek/signals
# DaemonSet configuration for chrek (checkpoint/restore) agent
daemonset:
# Container image
......@@ -97,19 +94,6 @@ daemonset:
# Affinity rules
affinity: {}
# CRIU configuration
criu:
# CUDA plugin directory
cudaPluginDir: /usr/local/lib/criu
# CRIU timeout in seconds (6 hours)
timeout: "21600"
# Ghost file size limit in bytes
# 512MB is recommended for GPU workloads with large memory allocations
ghostLimit: "536870912"
# Container runtime socket path
containerRuntimeSocket: /run/containerd/containerd.sock
# Seccomp profile configuration
seccomp:
# Deploy seccomp profile for blocking io_uring (required for CRIU)
......@@ -135,3 +119,61 @@ rbac:
# Note: PVC storage requires namespace-scoped mode (true) as PVCs are namespace-scoped
namespaceRestricted: true
# Static configuration (loaded from ConfigMap)
# Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) still come from environment variables
config:
agent:
# How checkpoints are triggered: "http" or "watcher"
signalSource: "watcher"
# HTTP server address for health checks and API
listenAddr: ":8080"
checkpoint:
criu:
# Ghost file size limit in bytes (512MB recommended for GPU workloads)
ghostLimit: 536870912
# CRIU timeout in seconds (6 hours for large GPU checkpoints)
timeout: 21600
# CRIU logging verbosity (0-4)
logLevel: 4
# CRIU work directory for temporary files
workDir: "/var/criu-work"
# K8s-specific options (recommended defaults for containers)
leaveRunning: true # Keep process running after checkpoint
shellJob: true # Containers are often session leaders
tcpClose: true # Pod IPs change on restore/migration
fileLocks: true # Applications use file locks
orphanPtsMaster: true # Containers with TTYs
extUnixSk: true # External Unix sockets
linkRemap: true # Handle deleted-but-open files
extMasters: true # External bind mount masters
manageCgroupsMode: "ignore" # Let K8s manage cgroups (ignore/soft/full/strict)
# Advanced options
autoDedup: false # Auto-deduplication of memory pages
lazyPages: false # Lazy page migration (experimental)
# Config file options (NOT available via RPC - written to criu.conf)
libDir: "/usr/local/lib/criu" # Plugin directory (required for GPU checkpoints)
allowUprobes: true # Required for CUDA
skipInFlight: true # Skip in-flight TCP connections
rootfsExclusions:
# System directories excluded from rootfs diff capture
# These are injected by NVIDIA GPU Operator and cause conflicts during restore
systemDirs:
- "./usr"
- "./etc"
- "./opt"
- "./var"
- "./run"
# Cache directories to exclude (reduces checkpoint size)
cacheDirs:
- "./.cache/huggingface"
# Additional custom exclusions (application-specific)
additionalExclusions: []
# NOTE: Restore configuration is NOT in this ConfigMap.
# Placeholder containers do not mount it. Restore defaults are hardcoded in Go.
# CRIU options for restore come from the saved checkpoint manifest (manifest.yaml).
......@@ -151,8 +151,9 @@ spec:
- --checkpoint-enabled=true
- --checkpoint-storage-type={{ .Values.checkpoint.storage.type }}
- --checkpoint-signal-host-path={{ .Values.checkpoint.storage.signalHostPath }}
- --checkpoint-criu-timeout={{ .Values.checkpoint.criu.timeout }}
- --checkpoint-init-container-image={{ .Values.checkpoint.initContainerImage }}
- --checkpoint-ready-for-checkpoint-file-path={{ .Values.checkpoint.readyForCheckpointFilePath }}
- --checkpoint-restore-marker-file-path={{ .Values.checkpoint.restoreMarkerFilePath }}
{{- if eq .Values.checkpoint.storage.type "pvc" }}
- --checkpoint-pvc-name={{ .Values.checkpoint.storage.pvc.pvcName }}
- --checkpoint-pvc-base-path={{ .Values.checkpoint.storage.pvc.basePath }}
......
......@@ -157,6 +157,14 @@ checkpoint:
# Defaults to busybox:latest if not specified
initContainerImage: "busybox:latest"
# Path written by worker when model is loaded and ready for checkpointing
# Must match the path expected by checkpoint-enabled runtime images
readyForCheckpointFilePath: "/tmp/ready-for-checkpoint"
# Path written by restore-entrypoint after successful CRIU restore
# Must match the path expected by checkpoint-enabled runtime images
restoreMarkerFilePath: "/tmp/dynamo-restored"
# Storage configuration
# These settings tell the operator where to find checkpoint storage
# Must match the configuration in the chrek chart
......@@ -196,14 +204,6 @@ checkpoint:
# Reference to a docker config secret for registry authentication
credentialsSecretRef: ""
# CRIU timeout configuration (shared across checkpoint and restore)
criu:
# CRIU operation timeout in seconds
# Maximum time to wait for checkpoint/restore to complete
# Increase for models with very large memory footprints
# 21600s (6 hours) is recommended for large LLMs (70B+)
timeout: "21600"
# Webhook configuration
webhook:
# Enable admission webhooks for validation
......@@ -280,4 +280,3 @@ webhook:
# Time before expiration to renew root CA (e.g., "720h" for 30 days)
renewBefore: "720h"
......@@ -216,6 +216,15 @@ dynamo-operator:
# -- Whether to enable checkpoint/restore functionality
enabled: false
# -- Image used for init containers in checkpoint jobs (e.g., signal file cleanup)
initContainerImage: "busybox:latest"
# -- Path written by worker when model is loaded and ready for checkpointing
readyForCheckpointFilePath: "/tmp/ready-for-checkpoint"
# -- Path written by restore-entrypoint after successful CRIU restore
restoreMarkerFilePath: "/tmp/dynamo-restored"
# Storage configuration
# These settings tell the operator where to find checkpoint storage
# Must match the configuration in the chrek chart
......@@ -247,12 +256,6 @@ dynamo-operator:
# -- Reference to a docker config secret for registry authentication
credentialsSecretRef: ""
# CRIU timeout configuration (shared across checkpoint and restore)
criu:
# -- CRIU operation timeout in seconds. Default: 21600 (6 hours)
timeout: "21600"
# Grove component - distributed inference orchestration
grove:
# -- Whether to enable Grove for multi-node inference coordination. If enabled, the Grove operator will be deployed cluster-wide
......
......@@ -161,7 +161,6 @@ func main() {
var checkpointEnabled bool
var checkpointStorageType string
var checkpointSignalHostPath string
var checkpointCRIUTimeout string
var checkpointPVCName string
var checkpointPVCBasePath string
var checkpointS3URI string
......@@ -169,6 +168,8 @@ func main() {
var checkpointOCIURI string
var checkpointOCICredentialsSecret string
var checkpointInitContainerImage string
var checkpointReadyForCheckpointFilePath string
var checkpointRestoreMarkerFilePath string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
......@@ -229,11 +230,9 @@ func main() {
"Enable checkpoint/restore functionality")
flag.StringVar(&checkpointStorageType, "checkpoint-storage-type", commonController.CheckpointStorageTypePVC,
"Checkpoint storage backend type: pvc, s3, or oci")
flag.StringVar(&checkpointSignalHostPath, "checkpoint-signal-host-path", "",
flag.StringVar(&checkpointSignalHostPath, "checkpoint-signal-host-path", "/var/lib/chrek/signals",
"Host path for signal files used for checkpoint job coordination")
flag.StringVar(&checkpointCRIUTimeout, "checkpoint-criu-timeout", "21600",
"CRIU timeout in seconds (required for CUDA checkpoints/restores, default: 21600 = 6 hours)")
flag.StringVar(&checkpointPVCName, "checkpoint-pvc-name", "checkpoint-storage",
flag.StringVar(&checkpointPVCName, "checkpoint-pvc-name", "chrek-pvc",
"Name of the PVC for checkpoint storage (used when storage-type=pvc)")
flag.StringVar(&checkpointPVCBasePath, "checkpoint-pvc-base-path", "/checkpoints",
"Base path within the PVC for storing checkpoints (used when storage-type=pvc)")
......@@ -247,6 +246,11 @@ func main() {
"Docker config secret name for OCI registry auth (used when storage-type=oci)")
flag.StringVar(&checkpointInitContainerImage, "checkpoint-init-container-image", "busybox:latest",
"Image to use for checkpoint init containers (e.g., signal file cleanup)")
flag.StringVar(&checkpointReadyForCheckpointFilePath,
"checkpoint-ready-for-checkpoint-file-path", "/tmp/ready-for-checkpoint",
"Path written by the worker container when the model is loaded and ready for checkpointing")
flag.StringVar(&checkpointRestoreMarkerFilePath, "checkpoint-restore-marker-file-path", "/tmp/dynamo-restored",
"Path written by restore-entrypoint after successful CRIU restore")
opts := zap.Options{
Development: true,
}
......@@ -325,9 +329,10 @@ func main() {
},
DiscoveryBackend: discoveryBackend,
Checkpoint: commonController.CheckpointConfig{
Enabled: checkpointEnabled,
CRIUTimeout: checkpointCRIUTimeout,
InitContainerImage: checkpointInitContainerImage,
Enabled: checkpointEnabled,
InitContainerImage: checkpointInitContainerImage,
ReadyForCheckpointFilePath: checkpointReadyForCheckpointFilePath,
RestoreMarkerFilePath: checkpointRestoreMarkerFilePath,
Storage: commonController.CheckpointStorageConfig{
Type: checkpointStorageType,
SignalHostPath: checkpointSignalHostPath,
......
......@@ -20,7 +20,6 @@ package checkpoint
import (
"context"
"fmt"
"path/filepath"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
......@@ -46,26 +45,23 @@ func getCheckpointInfoFromCheckpoint(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) *
return info
}
// DefaultCheckpointPVCName is the default PVC name for checkpoint storage
const DefaultCheckpointPVCName = "checkpoint-storage"
// getPVCBasePath returns the PVC base path from storage config, or the default
// getPVCBasePath returns the PVC base path from storage config.
// Only applicable for PVC storage type
func getPVCBasePath(storageConfig *controller_common.CheckpointStorageConfig) string {
if storageConfig != nil && storageConfig.PVC.BasePath != "" {
return storageConfig.PVC.BasePath
}
return consts.CheckpointBasePath
return ""
}
// GetPVCBasePath returns the configured PVC base path from controller config,
// or the default if not set. This is used by both CheckpointReconciler and DynamoGraphDeploymentReconciler.
// GetPVCBasePath returns the configured PVC base path from controller config.
// This is used by both CheckpointReconciler and DynamoGraphDeploymentReconciler.
// Only applicable for PVC storage type.
func GetPVCBasePath(config *controller_common.CheckpointConfig) string {
if config != nil && config.Enabled {
if config != nil {
return getPVCBasePath(&config.Storage)
}
return consts.CheckpointBasePath
return ""
}
// storageTypeToAPI converts controller_common storage type string to API enum
......@@ -161,81 +157,63 @@ func ResolveCheckpointForService(
return info, nil
}
// InjectCheckpointEnvVars adds checkpoint-related environment variables to a container
// Sets STORAGE_TYPE, LOCATION, PATH, HASH, and CRIU-related vars for unified storage backend handling.
func InjectCheckpointEnvVars(container *corev1.Container, info *CheckpointInfo, config *controller_common.CheckpointConfig) {
// InjectCheckpointEnvVars adds checkpoint-related environment variables to a restored/DGD container.
// It sets HASH (when non-empty), RESTORE_MARKER_FILE (when a marker path is configured), and
// SKIP_WAIT_FOR_CHECKPOINT. For PVC storage it also sets PATH; the restore entrypoint constructs
// the full checkpoint location from PATH + "/" + HASH.
// For other storage types (S3/OCI, future support) DYN_CHECKPOINT_LOCATION is injected instead of PATH.
func InjectCheckpointEnvVars(container *corev1.Container, info *CheckpointInfo, checkpointConfig *controller_common.CheckpointConfig) {
if !info.Enabled {
return
}
// Determine storage type (default to PVC if not set)
storageType := info.StorageType
if storageType == "" {
storageType = nvidiacomv1alpha1.DynamoCheckpointStorageType(controller_common.CheckpointStorageTypePVC)
}
var envVars []corev1.EnvVar
envVars := []corev1.EnvVar{
{
Name: consts.EnvCheckpointStorageType,
Value: string(storageType),
},
}
// Location is the source (where to fetch from)
if info.Location != "" {
envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvCheckpointLocation,
Value: info.Location,
})
// For PVC storage: inject base path so restore-entrypoint constructs location = path/hash.
// For S3/OCI (future): inject DYN_CHECKPOINT_LOCATION directly.
storageType := controller_common.CheckpointStorageTypePVC
if checkpointConfig != nil && checkpointConfig.Storage.Type != "" {
storageType = checkpointConfig.Storage.Type
}
// For PVC storage, also inject DYNAMO_CHECKPOINT_PATH (base directory)
// This is used by k8s-runc-bypass restore entrypoint
if string(storageType) == controller_common.CheckpointStorageTypePVC && info.Location != "" {
// Extract base path using filepath.Dir()
basePath := filepath.Dir(info.Location)
switch storageType {
case controller_common.CheckpointStorageTypePVC:
basePath := ""
if checkpointConfig != nil {
basePath = getPVCBasePath(&checkpointConfig.Storage)
}
envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvCheckpointPath,
Value: basePath,
})
default:
// S3/OCI: inject full location URI directly
if info.Location != "" {
envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvCheckpointLocation,
Value: info.Location,
})
}
}
// Include hash for debugging/observability and for k8s-runc-bypass
if info.Hash != "" {
envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvCheckpointHash,
Value: info.Hash,
})
}
// Add CRIU-related env vars for restore operations
criuTimeout := consts.DefaultCRIUTimeout
if config != nil && config.CRIUTimeout != "" {
criuTimeout = config.CRIUTimeout
if checkpointConfig != nil && checkpointConfig.RestoreMarkerFilePath != "" {
envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvRestoreMarkerFile,
Value: checkpointConfig.RestoreMarkerFilePath,
})
}
envVars = append(envVars,
corev1.EnvVar{
Name: consts.EnvRestoreMarkerFile,
Value: consts.RestoreMarkerFilePath,
},
corev1.EnvVar{
Name: consts.EnvCRIUWorkDir,
Value: consts.CRIUWorkDirPath,
},
corev1.EnvVar{
Name: consts.EnvCRIULogDir,
Value: consts.CRIULogDirPath,
},
corev1.EnvVar{
Name: consts.EnvCUDAPluginDir,
Value: consts.CUDAPluginDirPath,
},
corev1.EnvVar{
Name: consts.EnvCRIUTimeout,
Value: criuTimeout,
},
)
// Tell the restore entrypoint to check once and cold-start if no checkpoint is ready.
// Without this (standalone/DaemonSet path), the entrypoint polls indefinitely.
envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvSkipWaitForCheckpoint,
Value: "1",
})
// Prepend checkpoint env vars to ensure they're available
container.Env = append(envVars, container.Env...)
......@@ -270,10 +248,6 @@ func InjectCheckpointVolumeMount(container *corev1.Container, basePath string) {
}
}
if basePath == "" {
basePath = consts.CheckpointBasePath
}
container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{
Name: consts.CheckpointVolumeName,
MountPath: basePath,
......@@ -292,8 +266,8 @@ func InjectCheckpointSignalVolume(podSpec *corev1.PodSpec, checkpointConfig *con
}
// Get signal host path from config; it stays empty if no config is provided (no in-code fallback)
signalHostPath := consts.CheckpointSignalHostPath
if checkpointConfig != nil && checkpointConfig.Storage.SignalHostPath != "" {
signalHostPath := ""
if checkpointConfig != nil {
signalHostPath = checkpointConfig.Storage.SignalHostPath
}
......@@ -465,10 +439,9 @@ func InjectCheckpointIntoPodSpec(
}
// 1. Handle command/args for checkpoint-enabled images
// When checkpoint is enabled, the image has a smart ENTRYPOINT (e.g., /smart-entrypoint.sh)
// that detects checkpoints and decides between restore and cold start.
// We need to pass the user's command as arguments to this ENTRYPOINT rather than
// overriding it with Command.
// When checkpoint is enabled, the image ENTRYPOINT is /restore-entrypoint which
// decides between restore and cold start. We pass the user's command as arguments
// to this ENTRYPOINT (used as cold-start fallback if no checkpoint is ready).
if len(mainContainer.Command) > 0 {
// Combine Command + Args into a single Args array
// This allows the image's ENTRYPOINT to receive the full command as arguments
......@@ -489,7 +462,7 @@ func InjectCheckpointIntoPodSpec(
}
podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeLocalhost,
LocalhostProfile: ptr.To("profiles/block-iouring.json"),
LocalhostProfile: ptr.To(consts.SeccompProfilePath),
}
// Apply container-level security context for CRIU restore
......@@ -514,32 +487,31 @@ func InjectCheckpointIntoPodSpec(
// S3 storage: location is s3:// URI
// URI format: s3://[endpoint/]bucket/prefix
info.StorageType = storageTypeToAPI(storageType)
s3URI := "s3://checkpoint-storage/checkpoints" // default
if storageConfig != nil && storageConfig.S3.URI != "" {
s3URI = storageConfig.S3.URI
if storageConfig == nil || storageConfig.S3.URI == "" {
return fmt.Errorf("S3 storage type selected but no S3 URI configured (set checkpoint.storage.s3.uri)")
}
// Append hash to the URI
info.Location = fmt.Sprintf("%s/%s.tar", s3URI, info.Hash)
info.Location = fmt.Sprintf("%s/%s.tar", storageConfig.S3.URI, info.Hash)
case controller_common.CheckpointStorageTypeOCI:
// OCI storage: location is oci:// URI
// URI format: oci://registry/repository
info.StorageType = storageTypeToAPI(storageType)
ociURI := "oci://localhost/checkpoints" // default
if storageConfig != nil && storageConfig.OCI.URI != "" {
ociURI = storageConfig.OCI.URI
if storageConfig == nil || storageConfig.OCI.URI == "" {
return fmt.Errorf("OCI storage type selected but no OCI URI configured (set checkpoint.storage.oci.uri)")
}
// Append hash as tag
info.Location = fmt.Sprintf("%s:%s", ociURI, info.Hash)
info.Location = fmt.Sprintf("%s:%s", storageConfig.OCI.URI, info.Hash)
default: // controller_common.CheckpointStorageTypePVC
// PVC storage: location is the checkpoint directory
// k8s-runc-bypass expects: /checkpoints/{hash}/ (directory with checkpoint data)
info.StorageType = storageTypeToAPI(storageType)
basePath := getPVCBasePath(storageConfig)
pvcName := DefaultCheckpointPVCName
if storageConfig != nil && storageConfig.PVC.PVCName != "" {
pvcName = storageConfig.PVC.PVCName
if storageConfig == nil || storageConfig.PVC.PVCName == "" {
return fmt.Errorf("PVC storage type selected but no PVC name configured (set checkpoint.storage.pvc.pvcName)")
}
pvcName := storageConfig.PVC.PVCName
if basePath == "" {
return fmt.Errorf("PVC storage type selected but no PVC base path configured (set checkpoint.storage.pvc.basePath)")
}
info.Location = fmt.Sprintf("%s/%s", basePath, info.Hash)
......
......@@ -121,68 +121,38 @@ const (
ResourceStateReady = "ready"
ResourceStateNotReady = "not_ready"
ResourceStateUnknown = "unknown"
// Checkpoint related constants
KubeLabelCheckpointSource = "nvidia.com/checkpoint-source"
KubeLabelCheckpointHash = "nvidia.com/checkpoint-hash"
KubeLabelCheckpointName = "nvidia.com/checkpoint-name"
// EnvCheckpointStorageType indicates the storage backend type (pvc, s3, oci)
EnvCheckpointStorageType = "DYN_CHECKPOINT_STORAGE_TYPE"
// EnvCheckpointLocation is the source location of the checkpoint
// For PVC: same as path (e.g., /checkpoints/{hash}.tar)
// For S3: s3://bucket/prefix/{hash}.tar
// For OCI: oci://registry/repo:{hash}
EnvCheckpointLocation = "DYN_CHECKPOINT_LOCATION"
// EnvCheckpointPath is the local path to the checkpoint tar file
// For PVC: same as location
// For S3/OCI: download destination (e.g., /tmp/{hash}.tar)
EnvCheckpointPath = "DYN_CHECKPOINT_PATH"
// EnvCheckpointHash is the identity hash (for debugging/observability)
EnvCheckpointHash = "DYN_CHECKPOINT_HASH"
// EnvCheckpointSignalFile is the full path to the signal file
// The DaemonSet writes this file after checkpoint is complete
// The checkpoint job pod waits for this file, then exits successfully
EnvCheckpointSignalFile = "DYN_CHECKPOINT_SIGNAL_FILE"
// EnvCheckpointReadyFile is the full path to a file the worker creates
// when the model is loaded and ready for checkpointing.
// The readiness probe watches this file to trigger DaemonSet checkpoint.
EnvCheckpointReadyFile = "DYN_CHECKPOINT_READY_FILE"
// CRIU-related environment variables for restore operations
// EnvRestoreMarkerFile is the file created by CRIU after successful restore
EnvRestoreMarkerFile = "DYN_RESTORE_MARKER_FILE"
// EnvCRIUWorkDir is the working directory for CRIU operations
EnvCRIUWorkDir = "CRIU_WORK_DIR"
// EnvCRIULogDir is the directory where CRIU writes logs
EnvCRIULogDir = "CRIU_LOG_DIR"
// EnvCUDAPluginDir is the directory containing CRIU CUDA plugins
EnvCUDAPluginDir = "CUDA_PLUGIN_DIR"
// EnvCRIUTimeout is the timeout for CRIU operations
EnvCRIUTimeout = "CRIU_TIMEOUT"
// CheckpointReadyFilePath is the default path for the ready file
CheckpointReadyFilePath = "/tmp/checkpoint-ready"
// RestoreMarkerFilePath is the default path for the restore marker
RestoreMarkerFilePath = "/tmp/dynamo-restored"
// CRIUWorkDirPath is the default CRIU work directory
CRIUWorkDirPath = "/var/criu-work"
// CRIULogDirPath is the default CRIU log directory
CRIULogDirPath = "/checkpoints/restore-logs"
// CUDAPluginDirPath is the default CUDA plugin directory
CUDAPluginDirPath = "/usr/local/lib/criu"
// DefaultCRIUTimeout is the default CRIU timeout in seconds (6 hours)
DefaultCRIUTimeout = "21600"
CheckpointVolumeName = "checkpoint-storage"
CheckpointSignalVolumeName = "checkpoint-signal"
CheckpointBasePath = "/checkpoints"
CheckpointSignalHostPath = "/var/lib/dynamo-checkpoint/signals"
CheckpointSignalMountPath = "/checkpoint-signal"
// PodInfo volume for Downward API (critical for CRIU restore)
// After CRIU restore, environment variables contain stale values from checkpoint pod.
// The Downward API files at /etc/podinfo always have current pod identity.
// Checkpoint/restore constants
// CROSS-REFERENCE: Some constants below are duplicated in the chrek package at
// deploy/chrek/pkg/config/constants.go. If you change a value here, update there too.
// Kubernetes labels
KubeLabelCheckpointSource = "nvidia.com/checkpoint-source" // Pod label that triggers DaemonSet auto-checkpoint
KubeLabelCheckpointHash = "nvidia.com/checkpoint-hash" // Checkpoint identity hash for deduplication
KubeLabelCheckpointName = "nvidia.com/checkpoint-name" // DynamoCheckpoint CR name reference
// Environment variables injected into pods
EnvCheckpointStorageType = "DYN_CHECKPOINT_STORAGE_TYPE" // Storage backend (pvc, s3, oci) — checkpoint job pods only
EnvCheckpointLocation = "DYN_CHECKPOINT_LOCATION" // Full checkpoint URI — future S3/OCI; for PVC, use PATH+HASH instead
EnvCheckpointPath = "DYN_CHECKPOINT_PATH" // Base checkpoint directory (e.g., /checkpoints) — PVC restored pods
EnvCheckpointHash = "DYN_CHECKPOINT_HASH" // Identity hash — all checkpoint-related pods
EnvCheckpointSignalFile = "DYN_CHECKPOINT_SIGNAL_FILE" // Signal file path — checkpoint job pods
EnvReadyForCheckpointFile = "DYN_READY_FOR_CHECKPOINT_FILE" // Ready-for-checkpoint file path — checkpoint job pods
EnvRestoreMarkerFile = "DYN_RESTORE_MARKER_FILE" // Restore marker path — injected into restore and checkpoint job pods
EnvSkipWaitForCheckpoint = "SKIP_WAIT_FOR_CHECKPOINT" // Skip polling, check once — restored/DGD pods
// Checkpoint pod-internal constants
CheckpointVolumeName = "checkpoint-storage" // Pod-internal volume name for checkpoint PVC
CheckpointSignalVolumeName = "checkpoint-signal" // Pod-internal volume name for signal hostPath
CheckpointSignalMountPath = "/checkpoint-signal" // Mount path for signal volume inside pods
SignalFileCleanupInitContainerName = "cleanup-signal-file" // Init container that removes stale signal files before job starts
// SeccompProfilePath is the localhost seccomp profile that blocks io_uring syscalls.
// Deployed to nodes by the chrek DaemonSet init container.
SeccompProfilePath = "profiles/block-iouring.json"
// Pod identity (Downward API)
// After CRIU restore, env vars contain stale values from the checkpoint pod.
// The Downward API files at /etc/podinfo always reflect the current pod.
PodInfoVolumeName = "podinfo"
PodInfoMountPath = "/etc/podinfo"
......
......@@ -64,14 +64,6 @@ func (r *CheckpointReconciler) GetRecorder() record.EventRecorder {
return r.Recorder
}
// getSignalHostPath returns the configured signal host path, or the default if not set
func (r *CheckpointReconciler) getSignalHostPath() string {
if r.Config.Checkpoint.Enabled && r.Config.Checkpoint.Storage.SignalHostPath != "" {
return r.Config.Checkpoint.Storage.SignalHostPath
}
return consts.CheckpointSignalHostPath
}
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/finalizers,verbs=update
......@@ -120,8 +112,11 @@ func (r *CheckpointReconciler) Reconcile(ctx context.Context, req ctrl.Request)
// Nothing to do, checkpoint is ready
return ctrl.Result{}, nil
case nvidiacomv1alpha1.DynamoCheckpointPhaseFailed:
// Could implement retry logic here
return ctrl.Result{}, nil
// Re-evaluate the Job in case retries succeeded after a transient failure.
if ckpt.Status.JobName == "" {
return ctrl.Result{}, nil
}
return r.handleCreating(ctx, ckpt)
default:
// Unknown phase, reset to Pending
ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhasePending
......@@ -229,8 +224,15 @@ func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiac
return ctrl.Result{}, nil
}
// Check if job failed
if job.Status.Failed > 0 {
// Check if job reached terminal Failed condition.
jobFailed := false
for _, condition := range job.Status.Conditions {
if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue {
jobFailed = true
break
}
}
if jobFailed {
logger.Info("Checkpoint Job failed", "job", job.Name)
r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", "Checkpoint creation failed")
......@@ -273,14 +275,14 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
Name: consts.CheckpointSignalVolumeName,
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: r.getSignalHostPath(),
Path: r.Config.Checkpoint.Storage.SignalHostPath,
Type: &hostPathType,
},
},
})
// Compute the signal file path - unique per checkpoint hash
signalFilePath := consts.CheckpointSignalMountPath + "/" + ckpt.Status.IdentityHash + ".done"
signalFilePath := consts.CheckpointSignalMountPath + "/" + ckpt.Status.IdentityHash
// Add initContainer to clean up any leftover signal file from previous runs
// This ensures a fresh start for each checkpoint job without affecting the checkpoint itself
......@@ -288,7 +290,7 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
initContainerImage := r.Config.Checkpoint.InitContainerImage
podTemplate.Spec.InitContainers = append(podTemplate.Spec.InitContainers, corev1.Container{
Name: "cleanup-signal-file",
Name: consts.SignalFileCleanupInitContainerName,
Image: initContainerImage,
Command: []string{
"sh",
......@@ -320,8 +322,8 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
},
// Ready file: Worker creates this when model is loaded
corev1.EnvVar{
Name: consts.EnvCheckpointReadyFile,
Value: consts.CheckpointReadyFilePath,
Name: consts.EnvReadyForCheckpointFile,
Value: r.Config.Checkpoint.ReadyForCheckpointFilePath,
},
// Checkpoint hash: For idempotency check
corev1.EnvVar{
......@@ -338,6 +340,11 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
Name: consts.EnvCheckpointStorageType,
Value: storageType,
},
// Restore marker: Written by restore-entrypoint after CRIU restore
corev1.EnvVar{
Name: consts.EnvRestoreMarkerFile,
Value: r.Config.Checkpoint.RestoreMarkerFilePath,
},
)
// Add signal volume mount (required for DaemonSet communication)
......@@ -353,9 +360,6 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
if r.Config.Checkpoint.Storage.PVC.PVCName != "" {
pvcName := r.Config.Checkpoint.Storage.PVC.PVCName
basePath := r.Config.Checkpoint.Storage.PVC.BasePath
if basePath == "" {
basePath = consts.CheckpointBasePath
}
checkpoint.InjectCheckpointVolume(&podTemplate.Spec, pvcName)
checkpoint.InjectCheckpointVolumeMount(mainContainer, basePath)
}
......@@ -371,7 +375,7 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
mainContainer.ReadinessProbe = &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
Exec: &corev1.ExecAction{
Command: []string{"cat", consts.CheckpointReadyFilePath},
Command: []string{"cat", r.Config.Checkpoint.ReadyForCheckpointFilePath},
},
},
InitialDelaySeconds: 15,
......@@ -391,14 +395,14 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
podTemplate.Spec.SecurityContext = &corev1.PodSecurityContext{
SeccompProfile: &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeLocalhost,
LocalhostProfile: ptr.To("profiles/block-iouring.json"),
LocalhostProfile: ptr.To(consts.SeccompProfilePath),
},
}
// Build the Job
activeDeadlineSeconds := ckpt.Spec.Job.ActiveDeadlineSeconds
if activeDeadlineSeconds == nil {
defaultDeadline := int64(3600)
defaultDeadline := int64(3600) // 1 hour
activeDeadlineSeconds = &defaultDeadline
}
......@@ -410,7 +414,7 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
ttlSeconds := ckpt.Spec.Job.TTLSecondsAfterFinished
if ttlSeconds == nil {
defaultTTL := int32(300)
defaultTTL := int32(300) // 5 minutes
ttlSeconds = &defaultTTL
}
......
......@@ -105,11 +105,13 @@ type CheckpointConfig struct {
Enabled bool
// Storage holds storage backend configuration
Storage CheckpointStorageConfig
// CRIUTimeout is the CRIU timeout in seconds (required for CUDA checkpoints/restores)
CRIUTimeout string
// InitContainerImage is the image used for init containers (e.g., signal file cleanup)
// Defaults to "busybox:latest" if not specified
InitContainerImage string
// ReadyForCheckpointFilePath is the file path used to signal model readiness for checkpoint jobs
ReadyForCheckpointFilePath string
// RestoreMarkerFilePath is the marker file path written after successful restore
RestoreMarkerFilePath string
}
// Checkpoint storage type constants
......
......@@ -198,7 +198,7 @@ Your checkpoint job MUST set these environment variables:
| Variable | Description | Example |
|----------|-------------|---------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Path where DaemonSet writes completion signal | `/checkpoint-signal/my-checkpoint.done` |
| `DYN_CHECKPOINT_READY_FILE` | Path where your app signals it's ready | `/tmp/checkpoint-ready` |
| `DYN_READY_FOR_CHECKPOINT_FILE` | Path where your app signals it's ready | `/tmp/ready-for-checkpoint` |
| `DYN_CHECKPOINT_HASH` | Unique identifier for this checkpoint | `abc123def456` |
| `DYN_CHECKPOINT_LOCATION` | Directory where checkpoint is stored | `/checkpoints/abc123def456` |
| `DYN_CHECKPOINT_STORAGE_TYPE` | Storage backend type | `pvc` |
......@@ -256,7 +256,7 @@ spec:
# This is what triggers the DaemonSet to start checkpointing
readinessProbe:
exec:
command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"]
command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
initialDelaySeconds: 15
periodSeconds: 2
......@@ -269,8 +269,8 @@ spec:
env:
- name: DYN_CHECKPOINT_SIGNAL_FILE
value: "/checkpoint-signal/my-checkpoint.done"
- name: DYN_CHECKPOINT_READY_FILE
value: "/tmp/checkpoint-ready"
- name: DYN_READY_FOR_CHECKPOINT_FILE
value: "/tmp/ready-for-checkpoint"
- name: DYN_CHECKPOINT_HASH
value: "abc123def456"
- name: DYN_CHECKPOINT_LOCATION
......@@ -315,8 +315,8 @@ import time
def main():
# 1. Check for checkpoint mode
signal_file = os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE")
ready_file = os.environ.get("DYN_CHECKPOINT_READY_FILE")
restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored")
ready_file = os.environ.get("DYN_READY_FOR_CHECKPOINT_FILE")
restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE")
is_checkpoint_mode = signal_file is not None
......@@ -366,7 +366,7 @@ def main():
```yaml
readinessProbe:
exec:
command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"]
command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
initialDelaySeconds: 15
periodSeconds: 2
```
......@@ -415,9 +415,8 @@ spec:
- name: DYN_CHECKPOINT_PATH
value: "/checkpoints" # Base path (hash appended automatically)
# Required: Restore marker file path (written after a successful restore)
# - name: DYN_RESTORE_MARKER_FILE
# value: "/tmp/dynamo-restored"
- name: DYN_RESTORE_MARKER_FILE
value: "/tmp/dynamo-restored"
# GPU request
resources:
......@@ -458,7 +457,7 @@ spec:
| Variable | Required | Description |
|----------|----------|-------------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Yes | Full path to signal file (e.g., `/checkpoint-signal/my-checkpoint.done`) |
| `DYN_CHECKPOINT_READY_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/checkpoint-ready`) |
| `DYN_READY_FOR_CHECKPOINT_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/ready-for-checkpoint`) |
| `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (alphanumeric string) |
| `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123`) |
| `DYN_CHECKPOINT_STORAGE_TYPE` | Yes | Storage backend: `pvc`, `s3`, or `oci` |
......@@ -469,7 +468,7 @@ spec:
|----------|----------|-------------|
| `DYN_CHECKPOINT_HASH` | Yes | Checkpoint identifier (must match checkpoint job) |
| `DYN_CHECKPOINT_PATH` | Yes | Base checkpoint directory (hash appended automatically) |
| `DYN_RESTORE_MARKER_FILE` | No | Path for restore marker file (default: `/tmp/dynamo-restored`) |
| `DYN_RESTORE_MARKER_FILE` | Yes | Path for restore marker file |
### Optional CRIU Tuning (Advanced)
......@@ -500,7 +499,7 @@ spec:
┌─────────────────────────────────────────────────────────────┐
│ 2. Application loads model and creates ready file │
│ /tmp/checkpoint-ready
│ /tmp/ready-for-checkpoint │
└──────────────────────┬──────────────────────────────────────┘
......@@ -513,7 +512,7 @@ spec:
│ 4. ChReK DaemonSet detects: │
│ - Pod is Ready │
│ - Has checkpoint-source label │
│ - Ready file exists: /tmp/checkpoint-ready
│ - Ready file exists: /tmp/ready-for-checkpoint │
└──────────────────────┬──────────────────────────────────────┘
......@@ -603,7 +602,7 @@ spec:
3. Check ready file was created:
```bash
kubectl exec <pod-name> -- ls -la /tmp/checkpoint-ready
kubectl exec <pod-name> -- ls -la /tmp/ready-for-checkpoint
```
4. Check DaemonSet logs:
......
......@@ -201,7 +201,7 @@ Your checkpoint job MUST set these environment variables:
| Variable | Description | Example |
|----------|-------------|---------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Path where DaemonSet writes completion signal | `/checkpoint-signal/my-checkpoint.done` |
| `DYN_CHECKPOINT_READY_FILE` | Path where your app signals it's ready | `/tmp/checkpoint-ready` |
| `DYN_READY_FOR_CHECKPOINT_FILE` | Path where your app signals it's ready | `/tmp/ready-for-checkpoint` |
| `DYN_CHECKPOINT_HASH` | Unique identifier for this checkpoint | `abc123def456` |
| `DYN_CHECKPOINT_LOCATION` | Directory where checkpoint is stored | `/checkpoints/abc123def456` |
| `DYN_CHECKPOINT_STORAGE_TYPE` | Storage backend type | `pvc` |
......@@ -259,7 +259,7 @@ spec:
# This is what triggers the DaemonSet to start checkpointing
readinessProbe:
exec:
command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"]
command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
initialDelaySeconds: 15
periodSeconds: 2
......@@ -272,8 +272,8 @@ spec:
env:
- name: DYN_CHECKPOINT_SIGNAL_FILE
value: "/checkpoint-signal/my-checkpoint.done"
- name: DYN_CHECKPOINT_READY_FILE
value: "/tmp/checkpoint-ready"
- name: DYN_READY_FOR_CHECKPOINT_FILE
value: "/tmp/ready-for-checkpoint"
- name: DYN_CHECKPOINT_HASH
value: "abc123def456"
- name: DYN_CHECKPOINT_LOCATION
......@@ -318,8 +318,8 @@ import time
def main():
# 1. Check for checkpoint mode
signal_file = os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE")
ready_file = os.environ.get("DYN_CHECKPOINT_READY_FILE")
restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored")
ready_file = os.environ.get("DYN_READY_FOR_CHECKPOINT_FILE")
restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE")
is_checkpoint_mode = signal_file is not None
......@@ -369,7 +369,7 @@ def main():
```yaml
readinessProbe:
exec:
command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"]
command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
initialDelaySeconds: 15
periodSeconds: 2
```
......@@ -418,9 +418,8 @@ spec:
- name: DYN_CHECKPOINT_PATH
value: "/checkpoints" # Base path (hash appended automatically)
# Required: Restore marker file path (written after a successful restore)
# - name: DYN_RESTORE_MARKER_FILE
# value: "/tmp/dynamo-restored"
- name: DYN_RESTORE_MARKER_FILE
value: "/tmp/dynamo-restored"
# GPU request
resources:
......@@ -461,7 +460,7 @@ spec:
| Variable | Required | Description |
|----------|----------|-------------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Yes | Full path to signal file (e.g., `/checkpoint-signal/my-checkpoint.done`) |
| `DYN_CHECKPOINT_READY_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/checkpoint-ready`) |
| `DYN_READY_FOR_CHECKPOINT_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/ready-for-checkpoint`) |
| `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (alphanumeric string) |
| `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123`) |
| `DYN_CHECKPOINT_STORAGE_TYPE` | Yes | Storage backend: `pvc`, `s3`, or `oci` |
......@@ -472,7 +471,7 @@ spec:
|----------|----------|-------------|
| `DYN_CHECKPOINT_HASH` | Yes | Checkpoint identifier (must match checkpoint job) |
| `DYN_CHECKPOINT_PATH` | Yes | Base checkpoint directory (hash appended automatically) |
| `DYN_RESTORE_MARKER_FILE` | No | Path for restore marker file (default: `/tmp/dynamo-restored`) |
| `DYN_RESTORE_MARKER_FILE` | Yes | Path for restore marker file |
### Optional CRIU Tuning (Advanced)
......@@ -503,7 +502,7 @@ spec:
┌─────────────────────────────────────────────────────────────┐
│ 2. Application loads model and creates ready file │
│ /tmp/checkpoint-ready
│ /tmp/ready-for-checkpoint │
└──────────────────────┬──────────────────────────────────────┘
......@@ -516,7 +515,7 @@ spec:
│ 4. ChReK DaemonSet detects: │
│ - Pod is Ready │
│ - Has checkpoint-source label │
│ - Ready file exists: /tmp/checkpoint-ready
│ - Ready file exists: /tmp/ready-for-checkpoint │
└──────────────────────┬──────────────────────────────────────┘
......@@ -606,7 +605,7 @@ spec:
3. Check ready file was created:
```bash
kubectl exec <pod-name> -- ls -la /tmp/checkpoint-ready
kubectl exec <pod-name> -- ls -la /tmp/ready-for-checkpoint
```
4. Check DaemonSet logs:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment