Unverified commit d381e6ff, authored by Schwinn Saereesitthipitak and committed by GitHub
Browse files

feat(chrek): config refactor, /dev/shm support, and mount-policy rewrite (#5946)

parent b6824ae0
...@@ -48,9 +48,6 @@ storage: ...@@ -48,9 +48,6 @@ storage:
# OCI URI (e.g., oci://registry.io/repo/checkpoints) # OCI URI (e.g., oci://registry.io/repo/checkpoints)
uri: "" uri: ""
# Host path for signal files (inter-pod communication)
signalHostPath: /var/lib/chrek/signals
# DaemonSet configuration for chrek (checkpoint/restore) agent # DaemonSet configuration for chrek (checkpoint/restore) agent
daemonset: daemonset:
# Container image # Container image
...@@ -97,19 +94,6 @@ daemonset: ...@@ -97,19 +94,6 @@ daemonset:
# Affinity rules # Affinity rules
affinity: {} affinity: {}
# CRIU configuration
criu:
# CUDA plugin directory
cudaPluginDir: /usr/local/lib/criu
# CRIU timeout in seconds (6 hours)
timeout: "21600"
# Ghost file size limit in bytes
# 512MB is recommended for GPU workloads with large memory allocations
ghostLimit: "536870912"
# Container runtime socket path
containerRuntimeSocket: /run/containerd/containerd.sock
# Seccomp profile configuration # Seccomp profile configuration
seccomp: seccomp:
# Deploy seccomp profile for blocking io_uring (required for CRIU) # Deploy seccomp profile for blocking io_uring (required for CRIU)
...@@ -135,3 +119,61 @@ rbac: ...@@ -135,3 +119,61 @@ rbac:
# Note: PVC storage requires namespace-scoped mode (true) as PVCs are namespace-scoped # Note: PVC storage requires namespace-scoped mode (true) as PVCs are namespace-scoped
namespaceRestricted: true namespaceRestricted: true
# Static configuration (loaded from ConfigMap)
# Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) still come from environment variables
config:
agent:
# How checkpoints are triggered: "http" or "watcher"
signalSource: "watcher"
# HTTP server address for health checks and API
listenAddr: ":8080"
checkpoint:
criu:
# Ghost file size limit in bytes (512MB recommended for GPU workloads)
ghostLimit: 536870912
# CRIU timeout in seconds (6 hours for large GPU checkpoints)
timeout: 21600
# CRIU logging verbosity (0-4)
logLevel: 4
# CRIU work directory for temporary files
workDir: "/var/criu-work"
# K8s-specific options (recommended defaults for containers)
leaveRunning: true # Keep process running after checkpoint
shellJob: true # Containers are often session leaders
tcpClose: true # Pod IPs change on restore/migration
fileLocks: true # Applications use file locks
orphanPtsMaster: true # Containers with TTYs
extUnixSk: true # External Unix sockets
linkRemap: true # Handle deleted-but-open files
extMasters: true # External bind mount masters
manageCgroupsMode: "ignore" # Let K8s manage cgroups (ignore/soft/full/strict)
# Advanced options
autoDedup: false # Auto-deduplication of memory pages
lazyPages: false # Lazy page migration (experimental)
# Config file options (NOT available via RPC - written to criu.conf)
libDir: "/usr/local/lib/criu" # Plugin directory (required for GPU checkpoints)
allowUprobes: true # Required for CUDA
skipInFlight: true # Skip in-flight TCP connections
rootfsExclusions:
# System directories excluded from rootfs diff capture
# The NVIDIA GPU Operator injects files under these paths, which causes conflicts during restore
systemDirs:
- "./usr"
- "./etc"
- "./opt"
- "./var"
- "./run"
# Cache directories to exclude (reduces checkpoint size)
cacheDirs:
- "./.cache/huggingface"
# Additional custom exclusions (application-specific)
additionalExclusions: []
# NOTE: Restore configuration is NOT in this ConfigMap.
# Placeholder containers do not mount it. Restore defaults are hardcoded in Go.
# CRIU options for restore come from the saved checkpoint manifest (manifest.yaml).
...@@ -151,8 +151,9 @@ spec: ...@@ -151,8 +151,9 @@ spec:
- --checkpoint-enabled=true - --checkpoint-enabled=true
- --checkpoint-storage-type={{ .Values.checkpoint.storage.type }} - --checkpoint-storage-type={{ .Values.checkpoint.storage.type }}
- --checkpoint-signal-host-path={{ .Values.checkpoint.storage.signalHostPath }} - --checkpoint-signal-host-path={{ .Values.checkpoint.storage.signalHostPath }}
- --checkpoint-criu-timeout={{ .Values.checkpoint.criu.timeout }}
- --checkpoint-init-container-image={{ .Values.checkpoint.initContainerImage }} - --checkpoint-init-container-image={{ .Values.checkpoint.initContainerImage }}
- --checkpoint-ready-for-checkpoint-file-path={{ .Values.checkpoint.readyForCheckpointFilePath }}
- --checkpoint-restore-marker-file-path={{ .Values.checkpoint.restoreMarkerFilePath }}
{{- if eq .Values.checkpoint.storage.type "pvc" }} {{- if eq .Values.checkpoint.storage.type "pvc" }}
- --checkpoint-pvc-name={{ .Values.checkpoint.storage.pvc.pvcName }} - --checkpoint-pvc-name={{ .Values.checkpoint.storage.pvc.pvcName }}
- --checkpoint-pvc-base-path={{ .Values.checkpoint.storage.pvc.basePath }} - --checkpoint-pvc-base-path={{ .Values.checkpoint.storage.pvc.basePath }}
......
...@@ -157,6 +157,14 @@ checkpoint: ...@@ -157,6 +157,14 @@ checkpoint:
# Defaults to busybox:latest if not specified # Defaults to busybox:latest if not specified
initContainerImage: "busybox:latest" initContainerImage: "busybox:latest"
# Path written by worker when model is loaded and ready for checkpointing
# Must match the path expected by checkpoint-enabled runtime images
readyForCheckpointFilePath: "/tmp/ready-for-checkpoint"
# Path written by restore-entrypoint after successful CRIU restore
# Must match the path expected by checkpoint-enabled runtime images
restoreMarkerFilePath: "/tmp/dynamo-restored"
# Storage configuration # Storage configuration
# These settings tell the operator where to find checkpoint storage # These settings tell the operator where to find checkpoint storage
# Must match the configuration in the chrek chart # Must match the configuration in the chrek chart
...@@ -196,14 +204,6 @@ checkpoint: ...@@ -196,14 +204,6 @@ checkpoint:
# Reference to a docker config secret for registry authentication # Reference to a docker config secret for registry authentication
credentialsSecretRef: "" credentialsSecretRef: ""
# CRIU timeout configuration (shared across checkpoint and restore)
criu:
# CRIU operation timeout in seconds
# Maximum time to wait for checkpoint/restore to complete
# Increase for models with very large memory footprints
# 21600s (6 hours) is recommended for large LLMs (70B+)
timeout: "21600"
# Webhook configuration # Webhook configuration
webhook: webhook:
# Enable admission webhooks for validation # Enable admission webhooks for validation
...@@ -280,4 +280,3 @@ webhook: ...@@ -280,4 +280,3 @@ webhook:
# Time before expiration to renew root CA (e.g., "720h" for 30 days) # Time before expiration to renew root CA (e.g., "720h" for 30 days)
renewBefore: "720h" renewBefore: "720h"
...@@ -216,6 +216,15 @@ dynamo-operator: ...@@ -216,6 +216,15 @@ dynamo-operator:
# -- Whether to enable checkpoint/restore functionality # -- Whether to enable checkpoint/restore functionality
enabled: false enabled: false
# -- Image used for init containers in checkpoint jobs (e.g., signal file cleanup)
initContainerImage: "busybox:latest"
# -- Path written by worker when model is loaded and ready for checkpointing
readyForCheckpointFilePath: "/tmp/ready-for-checkpoint"
# -- Path written by restore-entrypoint after successful CRIU restore
restoreMarkerFilePath: "/tmp/dynamo-restored"
# Storage configuration # Storage configuration
# These settings tell the operator where to find checkpoint storage # These settings tell the operator where to find checkpoint storage
# Must match the configuration in the chrek chart # Must match the configuration in the chrek chart
...@@ -247,12 +256,6 @@ dynamo-operator: ...@@ -247,12 +256,6 @@ dynamo-operator:
# -- Reference to a docker config secret for registry authentication # -- Reference to a docker config secret for registry authentication
credentialsSecretRef: "" credentialsSecretRef: ""
# CRIU timeout configuration (shared across checkpoint and restore)
criu:
# -- CRIU operation timeout in seconds. Default: 21600 (6 hours)
timeout: "21600"
# Grove component - distributed inference orchestration # Grove component - distributed inference orchestration
grove: grove:
# -- Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide # -- Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide
......
...@@ -161,7 +161,6 @@ func main() { ...@@ -161,7 +161,6 @@ func main() {
var checkpointEnabled bool var checkpointEnabled bool
var checkpointStorageType string var checkpointStorageType string
var checkpointSignalHostPath string var checkpointSignalHostPath string
var checkpointCRIUTimeout string
var checkpointPVCName string var checkpointPVCName string
var checkpointPVCBasePath string var checkpointPVCBasePath string
var checkpointS3URI string var checkpointS3URI string
...@@ -169,6 +168,8 @@ func main() { ...@@ -169,6 +168,8 @@ func main() {
var checkpointOCIURI string var checkpointOCIURI string
var checkpointOCICredentialsSecret string var checkpointOCICredentialsSecret string
var checkpointInitContainerImage string var checkpointInitContainerImage string
var checkpointReadyForCheckpointFilePath string
var checkpointRestoreMarkerFilePath string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false, flag.BoolVar(&enableLeaderElection, "leader-elect", false,
...@@ -229,11 +230,9 @@ func main() { ...@@ -229,11 +230,9 @@ func main() {
"Enable checkpoint/restore functionality") "Enable checkpoint/restore functionality")
flag.StringVar(&checkpointStorageType, "checkpoint-storage-type", commonController.CheckpointStorageTypePVC, flag.StringVar(&checkpointStorageType, "checkpoint-storage-type", commonController.CheckpointStorageTypePVC,
"Checkpoint storage backend type: pvc, s3, or oci") "Checkpoint storage backend type: pvc, s3, or oci")
flag.StringVar(&checkpointSignalHostPath, "checkpoint-signal-host-path", "", flag.StringVar(&checkpointSignalHostPath, "checkpoint-signal-host-path", "/var/lib/chrek/signals",
"Host path for signal files used for checkpoint job coordination") "Host path for signal files used for checkpoint job coordination")
flag.StringVar(&checkpointCRIUTimeout, "checkpoint-criu-timeout", "21600", flag.StringVar(&checkpointPVCName, "checkpoint-pvc-name", "chrek-pvc",
"CRIU timeout in seconds (required for CUDA checkpoints/restores, default: 21600 = 6 hours)")
flag.StringVar(&checkpointPVCName, "checkpoint-pvc-name", "checkpoint-storage",
"Name of the PVC for checkpoint storage (used when storage-type=pvc)") "Name of the PVC for checkpoint storage (used when storage-type=pvc)")
flag.StringVar(&checkpointPVCBasePath, "checkpoint-pvc-base-path", "/checkpoints", flag.StringVar(&checkpointPVCBasePath, "checkpoint-pvc-base-path", "/checkpoints",
"Base path within the PVC for storing checkpoints (used when storage-type=pvc)") "Base path within the PVC for storing checkpoints (used when storage-type=pvc)")
...@@ -247,6 +246,11 @@ func main() { ...@@ -247,6 +246,11 @@ func main() {
"Docker config secret name for OCI registry auth (used when storage-type=oci)") "Docker config secret name for OCI registry auth (used when storage-type=oci)")
flag.StringVar(&checkpointInitContainerImage, "checkpoint-init-container-image", "busybox:latest", flag.StringVar(&checkpointInitContainerImage, "checkpoint-init-container-image", "busybox:latest",
"Image to use for checkpoint init containers (e.g., signal file cleanup)") "Image to use for checkpoint init containers (e.g., signal file cleanup)")
flag.StringVar(&checkpointReadyForCheckpointFilePath,
"checkpoint-ready-for-checkpoint-file-path", "/tmp/ready-for-checkpoint",
"Path written by the worker container when the model is loaded and ready for checkpointing")
flag.StringVar(&checkpointRestoreMarkerFilePath, "checkpoint-restore-marker-file-path", "/tmp/dynamo-restored",
"Path written by restore-entrypoint after successful CRIU restore")
opts := zap.Options{ opts := zap.Options{
Development: true, Development: true,
} }
...@@ -325,9 +329,10 @@ func main() { ...@@ -325,9 +329,10 @@ func main() {
}, },
DiscoveryBackend: discoveryBackend, DiscoveryBackend: discoveryBackend,
Checkpoint: commonController.CheckpointConfig{ Checkpoint: commonController.CheckpointConfig{
Enabled: checkpointEnabled, Enabled: checkpointEnabled,
CRIUTimeout: checkpointCRIUTimeout, InitContainerImage: checkpointInitContainerImage,
InitContainerImage: checkpointInitContainerImage, ReadyForCheckpointFilePath: checkpointReadyForCheckpointFilePath,
RestoreMarkerFilePath: checkpointRestoreMarkerFilePath,
Storage: commonController.CheckpointStorageConfig{ Storage: commonController.CheckpointStorageConfig{
Type: checkpointStorageType, Type: checkpointStorageType,
SignalHostPath: checkpointSignalHostPath, SignalHostPath: checkpointSignalHostPath,
......
...@@ -20,7 +20,6 @@ package checkpoint ...@@ -20,7 +20,6 @@ package checkpoint
import ( import (
"context" "context"
"fmt" "fmt"
"path/filepath"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
...@@ -46,26 +45,23 @@ func getCheckpointInfoFromCheckpoint(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) * ...@@ -46,26 +45,23 @@ func getCheckpointInfoFromCheckpoint(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) *
return info return info
} }
// DefaultCheckpointPVCName is the default PVC name for checkpoint storage // getPVCBasePath returns the PVC base path from storage config.
const DefaultCheckpointPVCName = "checkpoint-storage"
// getPVCBasePath returns the PVC base path from storage config, or the default
// Only applicable for PVC storage type // Only applicable for PVC storage type
func getPVCBasePath(storageConfig *controller_common.CheckpointStorageConfig) string { func getPVCBasePath(storageConfig *controller_common.CheckpointStorageConfig) string {
if storageConfig != nil && storageConfig.PVC.BasePath != "" { if storageConfig != nil && storageConfig.PVC.BasePath != "" {
return storageConfig.PVC.BasePath return storageConfig.PVC.BasePath
} }
return consts.CheckpointBasePath return ""
} }
// GetPVCBasePath returns the configured PVC base path from controller config, // GetPVCBasePath returns the configured PVC base path from controller config.
// or the default if not set. This is used by both CheckpointReconciler and DynamoGraphDeploymentReconciler. // This is used by both CheckpointReconciler and DynamoGraphDeploymentReconciler.
// Only applicable for PVC storage type. // Only applicable for PVC storage type.
func GetPVCBasePath(config *controller_common.CheckpointConfig) string { func GetPVCBasePath(config *controller_common.CheckpointConfig) string {
if config != nil && config.Enabled { if config != nil {
return getPVCBasePath(&config.Storage) return getPVCBasePath(&config.Storage)
} }
return consts.CheckpointBasePath return ""
} }
// storageTypeToAPI converts controller_common storage type string to API enum // storageTypeToAPI converts controller_common storage type string to API enum
...@@ -161,81 +157,63 @@ func ResolveCheckpointForService( ...@@ -161,81 +157,63 @@ func ResolveCheckpointForService(
return info, nil return info, nil
} }
// InjectCheckpointEnvVars adds checkpoint-related environment variables to a container // InjectCheckpointEnvVars adds checkpoint-related environment variables to a restored/DGD container.
// Sets STORAGE_TYPE, LOCATION, PATH, HASH, and CRIU-related vars for unified storage backend handling. // Sets PATH, HASH, RESTORE_MARKER_FILE, and SKIP_WAIT_FOR_CHECKPOINT. The restore entrypoint constructs
func InjectCheckpointEnvVars(container *corev1.Container, info *CheckpointInfo, config *controller_common.CheckpointConfig) { // the full checkpoint location from PATH + "/" + HASH.
// DYN_CHECKPOINT_LOCATION is reserved for future S3/OCI support.
func InjectCheckpointEnvVars(container *corev1.Container, info *CheckpointInfo, checkpointConfig *controller_common.CheckpointConfig) {
if !info.Enabled { if !info.Enabled {
return return
} }
// Determine storage type (default to PVC if not set) var envVars []corev1.EnvVar
storageType := info.StorageType
if storageType == "" {
storageType = nvidiacomv1alpha1.DynamoCheckpointStorageType(controller_common.CheckpointStorageTypePVC)
}
envVars := []corev1.EnvVar{ // For PVC storage: inject base path so restore-entrypoint constructs location = path/hash.
{ // For S3/OCI (future): inject DYN_CHECKPOINT_LOCATION directly.
Name: consts.EnvCheckpointStorageType, storageType := controller_common.CheckpointStorageTypePVC
Value: string(storageType), if checkpointConfig != nil && checkpointConfig.Storage.Type != "" {
}, storageType = checkpointConfig.Storage.Type
}
// Location is the source (where to fetch from)
if info.Location != "" {
envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvCheckpointLocation,
Value: info.Location,
})
} }
// For PVC storage, also inject DYNAMO_CHECKPOINT_PATH (base directory) switch storageType {
// This is used by k8s-runc-bypass restore entrypoint case controller_common.CheckpointStorageTypePVC:
if string(storageType) == controller_common.CheckpointStorageTypePVC && info.Location != "" { basePath := ""
// Extract base path using filepath.Dir() if checkpointConfig != nil {
basePath := filepath.Dir(info.Location) basePath = getPVCBasePath(&checkpointConfig.Storage)
}
envVars = append(envVars, corev1.EnvVar{ envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvCheckpointPath, Name: consts.EnvCheckpointPath,
Value: basePath, Value: basePath,
}) })
default:
// S3/OCI: inject full location URI directly
if info.Location != "" {
envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvCheckpointLocation,
Value: info.Location,
})
}
} }
// Include hash for debugging/observability and for k8s-runc-bypass
if info.Hash != "" { if info.Hash != "" {
envVars = append(envVars, corev1.EnvVar{ envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvCheckpointHash, Name: consts.EnvCheckpointHash,
Value: info.Hash, Value: info.Hash,
}) })
} }
if checkpointConfig != nil && checkpointConfig.RestoreMarkerFilePath != "" {
// Add CRIU-related env vars for restore operations envVars = append(envVars, corev1.EnvVar{
criuTimeout := consts.DefaultCRIUTimeout Name: consts.EnvRestoreMarkerFile,
if config != nil && config.CRIUTimeout != "" { Value: checkpointConfig.RestoreMarkerFilePath,
criuTimeout = config.CRIUTimeout })
} }
envVars = append(envVars, // Tell the restore entrypoint to check once and cold-start if no checkpoint is ready.
corev1.EnvVar{ // Without this (standalone/DaemonSet path), the entrypoint polls indefinitely.
Name: consts.EnvRestoreMarkerFile, envVars = append(envVars, corev1.EnvVar{
Value: consts.RestoreMarkerFilePath, Name: consts.EnvSkipWaitForCheckpoint,
}, Value: "1",
corev1.EnvVar{ })
Name: consts.EnvCRIUWorkDir,
Value: consts.CRIUWorkDirPath,
},
corev1.EnvVar{
Name: consts.EnvCRIULogDir,
Value: consts.CRIULogDirPath,
},
corev1.EnvVar{
Name: consts.EnvCUDAPluginDir,
Value: consts.CUDAPluginDirPath,
},
corev1.EnvVar{
Name: consts.EnvCRIUTimeout,
Value: criuTimeout,
},
)
// Prepend checkpoint env vars to ensure they're available // Prepend checkpoint env vars to ensure they're available
container.Env = append(envVars, container.Env...) container.Env = append(envVars, container.Env...)
...@@ -270,10 +248,6 @@ func InjectCheckpointVolumeMount(container *corev1.Container, basePath string) { ...@@ -270,10 +248,6 @@ func InjectCheckpointVolumeMount(container *corev1.Container, basePath string) {
} }
} }
if basePath == "" {
basePath = consts.CheckpointBasePath
}
container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{ container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{
Name: consts.CheckpointVolumeName, Name: consts.CheckpointVolumeName,
MountPath: basePath, MountPath: basePath,
...@@ -292,8 +266,8 @@ func InjectCheckpointSignalVolume(podSpec *corev1.PodSpec, checkpointConfig *con ...@@ -292,8 +266,8 @@ func InjectCheckpointSignalVolume(podSpec *corev1.PodSpec, checkpointConfig *con
} }
// Get signal host path from config or use default // Get signal host path from config (no fallback here; the --checkpoint-signal-host-path flag supplies the default)
signalHostPath := consts.CheckpointSignalHostPath signalHostPath := ""
if checkpointConfig != nil && checkpointConfig.Storage.SignalHostPath != "" { if checkpointConfig != nil {
signalHostPath = checkpointConfig.Storage.SignalHostPath signalHostPath = checkpointConfig.Storage.SignalHostPath
} }
...@@ -465,10 +439,9 @@ func InjectCheckpointIntoPodSpec( ...@@ -465,10 +439,9 @@ func InjectCheckpointIntoPodSpec(
} }
// 1. Handle command/args for checkpoint-enabled images // 1. Handle command/args for checkpoint-enabled images
// When checkpoint is enabled, the image has a smart ENTRYPOINT (e.g., /smart-entrypoint.sh) // When checkpoint is enabled, the image ENTRYPOINT is /restore-entrypoint which
// that detects checkpoints and decides between restore and cold start. // decides between restore and cold start. We pass the user's command as arguments
// We need to pass the user's command as arguments to this ENTRYPOINT rather than // to this ENTRYPOINT (used as cold-start fallback if no checkpoint is ready).
// overriding it with Command.
if len(mainContainer.Command) > 0 { if len(mainContainer.Command) > 0 {
// Combine Command + Args into a single Args array // Combine Command + Args into a single Args array
// This allows the image's ENTRYPOINT to receive the full command as arguments // This allows the image's ENTRYPOINT to receive the full command as arguments
...@@ -489,7 +462,7 @@ func InjectCheckpointIntoPodSpec( ...@@ -489,7 +462,7 @@ func InjectCheckpointIntoPodSpec(
} }
podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{ podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeLocalhost, Type: corev1.SeccompProfileTypeLocalhost,
LocalhostProfile: ptr.To("profiles/block-iouring.json"), LocalhostProfile: ptr.To(consts.SeccompProfilePath),
} }
// Apply container-level security context for CRIU restore // Apply container-level security context for CRIU restore
...@@ -514,32 +487,31 @@ func InjectCheckpointIntoPodSpec( ...@@ -514,32 +487,31 @@ func InjectCheckpointIntoPodSpec(
// S3 storage: location is s3:// URI // S3 storage: location is s3:// URI
// URI format: s3://[endpoint/]bucket/prefix // URI format: s3://[endpoint/]bucket/prefix
info.StorageType = storageTypeToAPI(storageType) info.StorageType = storageTypeToAPI(storageType)
s3URI := "s3://checkpoint-storage/checkpoints" // default if storageConfig == nil || storageConfig.S3.URI == "" {
if storageConfig != nil && storageConfig.S3.URI != "" { return fmt.Errorf("S3 storage type selected but no S3 URI configured (set checkpoint.storage.s3.uri)")
s3URI = storageConfig.S3.URI
} }
// Append hash to the URI info.Location = fmt.Sprintf("%s/%s.tar", storageConfig.S3.URI, info.Hash)
info.Location = fmt.Sprintf("%s/%s.tar", s3URI, info.Hash)
case controller_common.CheckpointStorageTypeOCI: case controller_common.CheckpointStorageTypeOCI:
// OCI storage: location is oci:// URI // OCI storage: location is oci:// URI
// URI format: oci://registry/repository // URI format: oci://registry/repository
info.StorageType = storageTypeToAPI(storageType) info.StorageType = storageTypeToAPI(storageType)
ociURI := "oci://localhost/checkpoints" // default if storageConfig == nil || storageConfig.OCI.URI == "" {
if storageConfig != nil && storageConfig.OCI.URI != "" { return fmt.Errorf("OCI storage type selected but no OCI URI configured (set checkpoint.storage.oci.uri)")
ociURI = storageConfig.OCI.URI
} }
// Append hash as tag info.Location = fmt.Sprintf("%s:%s", storageConfig.OCI.URI, info.Hash)
info.Location = fmt.Sprintf("%s:%s", ociURI, info.Hash)
default: // controller_common.CheckpointStorageTypePVC default: // controller_common.CheckpointStorageTypePVC
// PVC storage: location is the checkpoint directory // PVC storage: location is the checkpoint directory
// k8s-runc-bypass expects: /checkpoints/{hash}/ (directory with checkpoint data) // k8s-runc-bypass expects: /checkpoints/{hash}/ (directory with checkpoint data)
info.StorageType = storageTypeToAPI(storageType) info.StorageType = storageTypeToAPI(storageType)
basePath := getPVCBasePath(storageConfig) basePath := getPVCBasePath(storageConfig)
pvcName := DefaultCheckpointPVCName if storageConfig == nil || storageConfig.PVC.PVCName == "" {
if storageConfig != nil && storageConfig.PVC.PVCName != "" { return fmt.Errorf("PVC storage type selected but no PVC name configured (set checkpoint.storage.pvc.pvcName)")
pvcName = storageConfig.PVC.PVCName }
pvcName := storageConfig.PVC.PVCName
if basePath == "" {
return fmt.Errorf("PVC storage type selected but no PVC base path configured (set checkpoint.storage.pvc.basePath)")
} }
info.Location = fmt.Sprintf("%s/%s", basePath, info.Hash) info.Location = fmt.Sprintf("%s/%s", basePath, info.Hash)
......
...@@ -121,68 +121,38 @@ const ( ...@@ -121,68 +121,38 @@ const (
ResourceStateReady = "ready" ResourceStateReady = "ready"
ResourceStateNotReady = "not_ready" ResourceStateNotReady = "not_ready"
ResourceStateUnknown = "unknown" ResourceStateUnknown = "unknown"
// Checkpoint related constants
KubeLabelCheckpointSource = "nvidia.com/checkpoint-source" // Checkpoint/restore constants
KubeLabelCheckpointHash = "nvidia.com/checkpoint-hash" // CROSS-REFERENCE: Some constants below are duplicated in the chrek package at
KubeLabelCheckpointName = "nvidia.com/checkpoint-name" // deploy/chrek/pkg/config/constants.go. If you change a value here, update there too.
// EnvCheckpointStorageType indicates the storage backend type (pvc, s3, oci) // Kubernetes labels
EnvCheckpointStorageType = "DYN_CHECKPOINT_STORAGE_TYPE" KubeLabelCheckpointSource = "nvidia.com/checkpoint-source" // Pod label that triggers DaemonSet auto-checkpoint
// EnvCheckpointLocation is the source location of the checkpoint KubeLabelCheckpointHash = "nvidia.com/checkpoint-hash" // Checkpoint identity hash for deduplication
// For PVC: same as path (e.g., /checkpoints/{hash}.tar) KubeLabelCheckpointName = "nvidia.com/checkpoint-name" // DynamoCheckpoint CR name reference
// For S3: s3://bucket/prefix/{hash}.tar
// For OCI: oci://registry/repo:{hash} // Environment variables injected into pods
EnvCheckpointLocation = "DYN_CHECKPOINT_LOCATION" EnvCheckpointStorageType = "DYN_CHECKPOINT_STORAGE_TYPE" // Storage backend (pvc, s3, oci) — checkpoint job pods only
// EnvCheckpointPath is the local path to the checkpoint tar file EnvCheckpointLocation = "DYN_CHECKPOINT_LOCATION" // Full checkpoint URI — future S3/OCI; for PVC, use PATH+HASH instead
// For PVC: same as location EnvCheckpointPath = "DYN_CHECKPOINT_PATH" // Base checkpoint directory (e.g., /checkpoints) — PVC restored pods
// For S3/OCI: download destination (e.g., /tmp/{hash}.tar) EnvCheckpointHash = "DYN_CHECKPOINT_HASH" // Identity hash — all checkpoint-related pods
EnvCheckpointPath = "DYN_CHECKPOINT_PATH" EnvCheckpointSignalFile = "DYN_CHECKPOINT_SIGNAL_FILE" // Signal file path — checkpoint job pods
// EnvCheckpointHash is the identity hash (for debugging/observability) EnvReadyForCheckpointFile = "DYN_READY_FOR_CHECKPOINT_FILE" // Ready-for-checkpoint file path — checkpoint job pods
EnvCheckpointHash = "DYN_CHECKPOINT_HASH" EnvRestoreMarkerFile = "DYN_RESTORE_MARKER_FILE" // Restore marker path — injected into restore and checkpoint job pods
// EnvCheckpointSignalFile is the full path to the signal file EnvSkipWaitForCheckpoint = "SKIP_WAIT_FOR_CHECKPOINT" // Skip polling, check once — restored/DGD pods
// The DaemonSet writes this file after checkpoint is complete // Checkpoint pod-internal constants
// The checkpoint job pod waits for this file, then exits successfully CheckpointVolumeName = "checkpoint-storage" // Pod-internal volume name for checkpoint PVC
EnvCheckpointSignalFile = "DYN_CHECKPOINT_SIGNAL_FILE" CheckpointSignalVolumeName = "checkpoint-signal" // Pod-internal volume name for signal hostPath
CheckpointSignalMountPath = "/checkpoint-signal" // Mount path for signal volume inside pods
// EnvCheckpointReadyFile is the full path to a file the worker creates SignalFileCleanupInitContainerName = "cleanup-signal-file" // Init container that removes stale signal files before job starts
// when the model is loaded and ready for checkpointing.
// The readiness probe watches this file to trigger DaemonSet checkpoint. // SeccompProfilePath is the localhost seccomp profile that blocks io_uring syscalls.
EnvCheckpointReadyFile = "DYN_CHECKPOINT_READY_FILE" // Deployed to nodes by the chrek DaemonSet init container.
SeccompProfilePath = "profiles/block-iouring.json"
// CRIU-related environment variables for restore operations
// EnvRestoreMarkerFile is the file created by CRIU after successful restore // Pod identity (Downward API) ---
EnvRestoreMarkerFile = "DYN_RESTORE_MARKER_FILE" // After CRIU restore, env vars contain stale values from the checkpoint pod.
// EnvCRIUWorkDir is the working directory for CRIU operations // The Downward API files at /etc/podinfo always reflect the current pod.
EnvCRIUWorkDir = "CRIU_WORK_DIR"
// EnvCRIULogDir is the directory where CRIU writes logs
EnvCRIULogDir = "CRIU_LOG_DIR"
// EnvCUDAPluginDir is the directory containing CRIU CUDA plugins
EnvCUDAPluginDir = "CUDA_PLUGIN_DIR"
// EnvCRIUTimeout is the timeout for CRIU operations
EnvCRIUTimeout = "CRIU_TIMEOUT"
// CheckpointReadyFilePath is the default path for the ready file
CheckpointReadyFilePath = "/tmp/checkpoint-ready"
// RestoreMarkerFilePath is the default path for the restore marker
RestoreMarkerFilePath = "/tmp/dynamo-restored"
// CRIUWorkDirPath is the default CRIU work directory
CRIUWorkDirPath = "/var/criu-work"
// CRIULogDirPath is the default CRIU log directory
CRIULogDirPath = "/checkpoints/restore-logs"
// CUDAPluginDirPath is the default CUDA plugin directory
CUDAPluginDirPath = "/usr/local/lib/criu"
// DefaultCRIUTimeout is the default CRIU timeout in seconds (6 hours)
DefaultCRIUTimeout = "21600"
CheckpointVolumeName = "checkpoint-storage"
CheckpointSignalVolumeName = "checkpoint-signal"
CheckpointBasePath = "/checkpoints"
CheckpointSignalHostPath = "/var/lib/dynamo-checkpoint/signals"
CheckpointSignalMountPath = "/checkpoint-signal"
// PodInfo volume for Downward API (critical for CRIU restore)
// After CRIU restore, environment variables contain stale values from checkpoint pod.
// The Downward API files at /etc/podinfo always have current pod identity.
PodInfoVolumeName = "podinfo" PodInfoVolumeName = "podinfo"
PodInfoMountPath = "/etc/podinfo" PodInfoMountPath = "/etc/podinfo"
......
...@@ -64,14 +64,6 @@ func (r *CheckpointReconciler) GetRecorder() record.EventRecorder { ...@@ -64,14 +64,6 @@ func (r *CheckpointReconciler) GetRecorder() record.EventRecorder {
return r.Recorder return r.Recorder
} }
// getSignalHostPath returns the configured signal host path, or the default if not set
func (r *CheckpointReconciler) getSignalHostPath() string {
if r.Config.Checkpoint.Enabled && r.Config.Checkpoint.Storage.SignalHostPath != "" {
return r.Config.Checkpoint.Storage.SignalHostPath
}
return consts.CheckpointSignalHostPath
}
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/status,verbs=get;update;patch // +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/finalizers,verbs=update // +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/finalizers,verbs=update
...@@ -120,8 +112,11 @@ func (r *CheckpointReconciler) Reconcile(ctx context.Context, req ctrl.Request) ...@@ -120,8 +112,11 @@ func (r *CheckpointReconciler) Reconcile(ctx context.Context, req ctrl.Request)
// Nothing to do, checkpoint is ready // Nothing to do, checkpoint is ready
return ctrl.Result{}, nil return ctrl.Result{}, nil
case nvidiacomv1alpha1.DynamoCheckpointPhaseFailed: case nvidiacomv1alpha1.DynamoCheckpointPhaseFailed:
// Could implement retry logic here // Re-evaluate the Job in case retries succeeded after a transient failure.
return ctrl.Result{}, nil if ckpt.Status.JobName == "" {
return ctrl.Result{}, nil
}
return r.handleCreating(ctx, ckpt)
default: default:
// Unknown phase, reset to Pending // Unknown phase, reset to Pending
ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhasePending ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhasePending
...@@ -229,8 +224,15 @@ func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiac ...@@ -229,8 +224,15 @@ func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiac
return ctrl.Result{}, nil return ctrl.Result{}, nil
} }
// Check if job failed // Check if job reached terminal Failed condition.
if job.Status.Failed > 0 { jobFailed := false
for _, condition := range job.Status.Conditions {
if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue {
jobFailed = true
break
}
}
if jobFailed {
logger.Info("Checkpoint Job failed", "job", job.Name) logger.Info("Checkpoint Job failed", "job", job.Name)
r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", "Checkpoint creation failed") r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", "Checkpoint creation failed")
...@@ -273,14 +275,14 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo ...@@ -273,14 +275,14 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
Name: consts.CheckpointSignalVolumeName, Name: consts.CheckpointSignalVolumeName,
VolumeSource: corev1.VolumeSource{ VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{ HostPath: &corev1.HostPathVolumeSource{
Path: r.getSignalHostPath(), Path: r.Config.Checkpoint.Storage.SignalHostPath,
Type: &hostPathType, Type: &hostPathType,
}, },
}, },
}) })
// Compute the signal file path - unique per checkpoint hash // Compute the signal file path - unique per checkpoint hash
signalFilePath := consts.CheckpointSignalMountPath + "/" + ckpt.Status.IdentityHash + ".done" signalFilePath := consts.CheckpointSignalMountPath + "/" + ckpt.Status.IdentityHash
// Add initContainer to clean up any leftover signal file from previous runs // Add initContainer to clean up any leftover signal file from previous runs
// This ensures a fresh start for each checkpoint job without affecting the checkpoint itself // This ensures a fresh start for each checkpoint job without affecting the checkpoint itself
...@@ -288,7 +290,7 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo ...@@ -288,7 +290,7 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
initContainerImage := r.Config.Checkpoint.InitContainerImage initContainerImage := r.Config.Checkpoint.InitContainerImage
podTemplate.Spec.InitContainers = append(podTemplate.Spec.InitContainers, corev1.Container{ podTemplate.Spec.InitContainers = append(podTemplate.Spec.InitContainers, corev1.Container{
Name: "cleanup-signal-file", Name: consts.SignalFileCleanupInitContainerName,
Image: initContainerImage, Image: initContainerImage,
Command: []string{ Command: []string{
"sh", "sh",
...@@ -320,8 +322,8 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo ...@@ -320,8 +322,8 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
}, },
// Ready file: Worker creates this when model is loaded // Ready file: Worker creates this when model is loaded
corev1.EnvVar{ corev1.EnvVar{
Name: consts.EnvCheckpointReadyFile, Name: consts.EnvReadyForCheckpointFile,
Value: consts.CheckpointReadyFilePath, Value: r.Config.Checkpoint.ReadyForCheckpointFilePath,
}, },
// Checkpoint hash: For idempotency check // Checkpoint hash: For idempotency check
corev1.EnvVar{ corev1.EnvVar{
...@@ -338,6 +340,11 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo ...@@ -338,6 +340,11 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
Name: consts.EnvCheckpointStorageType, Name: consts.EnvCheckpointStorageType,
Value: storageType, Value: storageType,
}, },
// Restore marker: Written by restore-entrypoint after CRIU restore
corev1.EnvVar{
Name: consts.EnvRestoreMarkerFile,
Value: r.Config.Checkpoint.RestoreMarkerFilePath,
},
) )
// Add signal volume mount (required for DaemonSet communication) // Add signal volume mount (required for DaemonSet communication)
...@@ -353,9 +360,6 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo ...@@ -353,9 +360,6 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
if r.Config.Checkpoint.Storage.PVC.PVCName != "" { if r.Config.Checkpoint.Storage.PVC.PVCName != "" {
pvcName := r.Config.Checkpoint.Storage.PVC.PVCName pvcName := r.Config.Checkpoint.Storage.PVC.PVCName
basePath := r.Config.Checkpoint.Storage.PVC.BasePath basePath := r.Config.Checkpoint.Storage.PVC.BasePath
if basePath == "" {
basePath = consts.CheckpointBasePath
}
checkpoint.InjectCheckpointVolume(&podTemplate.Spec, pvcName) checkpoint.InjectCheckpointVolume(&podTemplate.Spec, pvcName)
checkpoint.InjectCheckpointVolumeMount(mainContainer, basePath) checkpoint.InjectCheckpointVolumeMount(mainContainer, basePath)
} }
...@@ -371,7 +375,7 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo ...@@ -371,7 +375,7 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
mainContainer.ReadinessProbe = &corev1.Probe{ mainContainer.ReadinessProbe = &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{ ProbeHandler: corev1.ProbeHandler{
Exec: &corev1.ExecAction{ Exec: &corev1.ExecAction{
Command: []string{"cat", consts.CheckpointReadyFilePath}, Command: []string{"cat", r.Config.Checkpoint.ReadyForCheckpointFilePath},
}, },
}, },
InitialDelaySeconds: 15, InitialDelaySeconds: 15,
...@@ -391,14 +395,14 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo ...@@ -391,14 +395,14 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
podTemplate.Spec.SecurityContext = &corev1.PodSecurityContext{ podTemplate.Spec.SecurityContext = &corev1.PodSecurityContext{
SeccompProfile: &corev1.SeccompProfile{ SeccompProfile: &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeLocalhost, Type: corev1.SeccompProfileTypeLocalhost,
LocalhostProfile: ptr.To("profiles/block-iouring.json"), LocalhostProfile: ptr.To(consts.SeccompProfilePath),
}, },
} }
// Build the Job // Build the Job
activeDeadlineSeconds := ckpt.Spec.Job.ActiveDeadlineSeconds activeDeadlineSeconds := ckpt.Spec.Job.ActiveDeadlineSeconds
if activeDeadlineSeconds == nil { if activeDeadlineSeconds == nil {
defaultDeadline := int64(3600) defaultDeadline := int64(3600) // 1 hour
activeDeadlineSeconds = &defaultDeadline activeDeadlineSeconds = &defaultDeadline
} }
...@@ -410,7 +414,7 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo ...@@ -410,7 +414,7 @@ func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.Dynamo
ttlSeconds := ckpt.Spec.Job.TTLSecondsAfterFinished ttlSeconds := ckpt.Spec.Job.TTLSecondsAfterFinished
if ttlSeconds == nil { if ttlSeconds == nil {
defaultTTL := int32(300) defaultTTL := int32(300) // 5 minutes
ttlSeconds = &defaultTTL ttlSeconds = &defaultTTL
} }
......
...@@ -105,11 +105,13 @@ type CheckpointConfig struct { ...@@ -105,11 +105,13 @@ type CheckpointConfig struct {
Enabled bool Enabled bool
// Storage holds storage backend configuration // Storage holds storage backend configuration
Storage CheckpointStorageConfig Storage CheckpointStorageConfig
// CRIUTimeout is the CRIU timeout in seconds (required for CUDA checkpoints/restores)
CRIUTimeout string
// InitContainerImage is the image used for init containers (e.g., signal file cleanup) // InitContainerImage is the image used for init containers (e.g., signal file cleanup)
// Defaults to "busybox:latest" if not specified // Defaults to "busybox:latest" if not specified
InitContainerImage string InitContainerImage string
// ReadyForCheckpointFilePath is the file path used to signal model readiness for checkpoint jobs
ReadyForCheckpointFilePath string
// RestoreMarkerFilePath is the marker file path written after successful restore
RestoreMarkerFilePath string
} }
// Checkpoint storage type constants // Checkpoint storage type constants
......
...@@ -198,7 +198,7 @@ Your checkpoint job MUST set these environment variables: ...@@ -198,7 +198,7 @@ Your checkpoint job MUST set these environment variables:
| Variable | Description | Example | | Variable | Description | Example |
|----------|-------------|---------| |----------|-------------|---------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Path where DaemonSet writes completion signal | `/checkpoint-signal/my-checkpoint.done` | | `DYN_CHECKPOINT_SIGNAL_FILE` | Path where DaemonSet writes completion signal | `/checkpoint-signal/my-checkpoint.done` |
| `DYN_CHECKPOINT_READY_FILE` | Path where your app signals it's ready | `/tmp/checkpoint-ready` | | `DYN_READY_FOR_CHECKPOINT_FILE` | Path where your app signals it's ready | `/tmp/ready-for-checkpoint` |
| `DYN_CHECKPOINT_HASH` | Unique identifier for this checkpoint | `abc123def456` | | `DYN_CHECKPOINT_HASH` | Unique identifier for this checkpoint | `abc123def456` |
| `DYN_CHECKPOINT_LOCATION` | Directory where checkpoint is stored | `/checkpoints/abc123def456` | | `DYN_CHECKPOINT_LOCATION` | Directory where checkpoint is stored | `/checkpoints/abc123def456` |
| `DYN_CHECKPOINT_STORAGE_TYPE` | Storage backend type | `pvc` | | `DYN_CHECKPOINT_STORAGE_TYPE` | Storage backend type | `pvc` |
...@@ -256,7 +256,7 @@ spec: ...@@ -256,7 +256,7 @@ spec:
# This is what triggers the DaemonSet to start checkpointing # This is what triggers the DaemonSet to start checkpointing
readinessProbe: readinessProbe:
exec: exec:
command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"] command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
initialDelaySeconds: 15 initialDelaySeconds: 15
periodSeconds: 2 periodSeconds: 2
...@@ -269,8 +269,8 @@ spec: ...@@ -269,8 +269,8 @@ spec:
env: env:
- name: DYN_CHECKPOINT_SIGNAL_FILE - name: DYN_CHECKPOINT_SIGNAL_FILE
value: "/checkpoint-signal/my-checkpoint.done" value: "/checkpoint-signal/my-checkpoint.done"
- name: DYN_CHECKPOINT_READY_FILE - name: DYN_READY_FOR_CHECKPOINT_FILE
value: "/tmp/checkpoint-ready" value: "/tmp/ready-for-checkpoint"
- name: DYN_CHECKPOINT_HASH - name: DYN_CHECKPOINT_HASH
value: "abc123def456" value: "abc123def456"
- name: DYN_CHECKPOINT_LOCATION - name: DYN_CHECKPOINT_LOCATION
...@@ -315,8 +315,8 @@ import time ...@@ -315,8 +315,8 @@ import time
def main(): def main():
# 1. Check for checkpoint mode # 1. Check for checkpoint mode
signal_file = os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE") signal_file = os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE")
ready_file = os.environ.get("DYN_CHECKPOINT_READY_FILE") ready_file = os.environ.get("DYN_READY_FOR_CHECKPOINT_FILE")
restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored") restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE")
is_checkpoint_mode = signal_file is not None is_checkpoint_mode = signal_file is not None
...@@ -366,7 +366,7 @@ def main(): ...@@ -366,7 +366,7 @@ def main():
```yaml ```yaml
readinessProbe: readinessProbe:
exec: exec:
command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"] command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
initialDelaySeconds: 15 initialDelaySeconds: 15
periodSeconds: 2 periodSeconds: 2
``` ```
...@@ -415,9 +415,8 @@ spec: ...@@ -415,9 +415,8 @@ spec:
- name: DYN_CHECKPOINT_PATH - name: DYN_CHECKPOINT_PATH
value: "/checkpoints" # Base path (hash appended automatically) value: "/checkpoints" # Base path (hash appended automatically)
# Optional: Customize restore marker file path - name: DYN_RESTORE_MARKER_FILE
# - name: DYN_RESTORE_MARKER_FILE value: "/tmp/dynamo-restored"
# value: "/tmp/dynamo-restored"
# GPU request # GPU request
resources: resources:
...@@ -458,7 +457,7 @@ spec: ...@@ -458,7 +457,7 @@ spec:
| Variable | Required | Description | | Variable | Required | Description |
|----------|----------|-------------| |----------|----------|-------------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Yes | Full path to signal file (e.g., `/checkpoint-signal/my-checkpoint.done`) | | `DYN_CHECKPOINT_SIGNAL_FILE` | Yes | Full path to signal file (e.g., `/checkpoint-signal/my-checkpoint.done`) |
| `DYN_CHECKPOINT_READY_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/checkpoint-ready`) | | `DYN_READY_FOR_CHECKPOINT_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/ready-for-checkpoint`) |
| `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (alphanumeric string) | | `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (alphanumeric string) |
| `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123`) | | `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123`) |
| `DYN_CHECKPOINT_STORAGE_TYPE` | Yes | Storage backend: `pvc`, `s3`, or `oci` | | `DYN_CHECKPOINT_STORAGE_TYPE` | Yes | Storage backend: `pvc`, `s3`, or `oci` |
...@@ -469,7 +468,7 @@ spec: ...@@ -469,7 +468,7 @@ spec:
|----------|----------|-------------| |----------|----------|-------------|
| `DYN_CHECKPOINT_HASH` | Yes | Checkpoint identifier (must match checkpoint job) | | `DYN_CHECKPOINT_HASH` | Yes | Checkpoint identifier (must match checkpoint job) |
| `DYN_CHECKPOINT_PATH` | Yes | Base checkpoint directory (hash appended automatically) | | `DYN_CHECKPOINT_PATH` | Yes | Base checkpoint directory (hash appended automatically) |
| `DYN_RESTORE_MARKER_FILE` | No | Path for restore marker file (default: `/tmp/dynamo-restored`) | | `DYN_RESTORE_MARKER_FILE` | Yes | Path for restore marker file |
### Optional CRIU Tuning (Advanced) ### Optional CRIU Tuning (Advanced)
...@@ -500,7 +499,7 @@ spec: ...@@ -500,7 +499,7 @@ spec:
┌─────────────────────────────────────────────────────────────┐ ┌─────────────────────────────────────────────────────────────┐
│ 2. Application loads model and creates ready file │ │ 2. Application loads model and creates ready file │
│ /tmp/checkpoint-ready │ /tmp/ready-for-checkpoint │
└──────────────────────┬──────────────────────────────────────┘ └──────────────────────┬──────────────────────────────────────┘
...@@ -513,7 +512,7 @@ spec: ...@@ -513,7 +512,7 @@ spec:
│ 4. ChReK DaemonSet detects: │ │ 4. ChReK DaemonSet detects: │
│ - Pod is Ready │ │ - Pod is Ready │
│ - Has checkpoint-source label │ │ - Has checkpoint-source label │
│ - Ready file exists: /tmp/checkpoint-ready │ - Ready file exists: /tmp/ready-for-checkpoint │
└──────────────────────┬──────────────────────────────────────┘ └──────────────────────┬──────────────────────────────────────┘
...@@ -603,7 +602,7 @@ spec: ...@@ -603,7 +602,7 @@ spec:
3. Check ready file was created: 3. Check ready file was created:
```bash ```bash
kubectl exec <pod-name> -- ls -la /tmp/checkpoint-ready kubectl exec <pod-name> -- ls -la /tmp/ready-for-checkpoint
``` ```
4. Check DaemonSet logs: 4. Check DaemonSet logs:
......
...@@ -201,7 +201,7 @@ Your checkpoint job MUST set these environment variables: ...@@ -201,7 +201,7 @@ Your checkpoint job MUST set these environment variables:
| Variable | Description | Example | | Variable | Description | Example |
|----------|-------------|---------| |----------|-------------|---------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Path where DaemonSet writes completion signal | `/checkpoint-signal/my-checkpoint.done` | | `DYN_CHECKPOINT_SIGNAL_FILE` | Path where DaemonSet writes completion signal | `/checkpoint-signal/my-checkpoint.done` |
| `DYN_CHECKPOINT_READY_FILE` | Path where your app signals it's ready | `/tmp/checkpoint-ready` | | `DYN_READY_FOR_CHECKPOINT_FILE` | Path where your app signals it's ready | `/tmp/ready-for-checkpoint` |
| `DYN_CHECKPOINT_HASH` | Unique identifier for this checkpoint | `abc123def456` | | `DYN_CHECKPOINT_HASH` | Unique identifier for this checkpoint | `abc123def456` |
| `DYN_CHECKPOINT_LOCATION` | Directory where checkpoint is stored | `/checkpoints/abc123def456` | | `DYN_CHECKPOINT_LOCATION` | Directory where checkpoint is stored | `/checkpoints/abc123def456` |
| `DYN_CHECKPOINT_STORAGE_TYPE` | Storage backend type | `pvc` | | `DYN_CHECKPOINT_STORAGE_TYPE` | Storage backend type | `pvc` |
...@@ -259,7 +259,7 @@ spec: ...@@ -259,7 +259,7 @@ spec:
# This is what triggers the DaemonSet to start checkpointing # This is what triggers the DaemonSet to start checkpointing
readinessProbe: readinessProbe:
exec: exec:
command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"] command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
initialDelaySeconds: 15 initialDelaySeconds: 15
periodSeconds: 2 periodSeconds: 2
...@@ -272,8 +272,8 @@ spec: ...@@ -272,8 +272,8 @@ spec:
env: env:
- name: DYN_CHECKPOINT_SIGNAL_FILE - name: DYN_CHECKPOINT_SIGNAL_FILE
value: "/checkpoint-signal/my-checkpoint.done" value: "/checkpoint-signal/my-checkpoint.done"
- name: DYN_CHECKPOINT_READY_FILE - name: DYN_READY_FOR_CHECKPOINT_FILE
value: "/tmp/checkpoint-ready" value: "/tmp/ready-for-checkpoint"
- name: DYN_CHECKPOINT_HASH - name: DYN_CHECKPOINT_HASH
value: "abc123def456" value: "abc123def456"
- name: DYN_CHECKPOINT_LOCATION - name: DYN_CHECKPOINT_LOCATION
...@@ -318,8 +318,8 @@ import time ...@@ -318,8 +318,8 @@ import time
def main(): def main():
# 1. Check for checkpoint mode # 1. Check for checkpoint mode
signal_file = os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE") signal_file = os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE")
ready_file = os.environ.get("DYN_CHECKPOINT_READY_FILE") ready_file = os.environ.get("DYN_READY_FOR_CHECKPOINT_FILE")
restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored") restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE")
is_checkpoint_mode = signal_file is not None is_checkpoint_mode = signal_file is not None
...@@ -369,7 +369,7 @@ def main(): ...@@ -369,7 +369,7 @@ def main():
```yaml ```yaml
readinessProbe: readinessProbe:
exec: exec:
command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"] command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
initialDelaySeconds: 15 initialDelaySeconds: 15
periodSeconds: 2 periodSeconds: 2
``` ```
...@@ -418,9 +418,8 @@ spec: ...@@ -418,9 +418,8 @@ spec:
- name: DYN_CHECKPOINT_PATH - name: DYN_CHECKPOINT_PATH
value: "/checkpoints" # Base path (hash appended automatically) value: "/checkpoints" # Base path (hash appended automatically)
# Optional: Customize restore marker file path - name: DYN_RESTORE_MARKER_FILE
# - name: DYN_RESTORE_MARKER_FILE value: "/tmp/dynamo-restored"
# value: "/tmp/dynamo-restored"
# GPU request # GPU request
resources: resources:
...@@ -461,7 +460,7 @@ spec: ...@@ -461,7 +460,7 @@ spec:
| Variable | Required | Description | | Variable | Required | Description |
|----------|----------|-------------| |----------|----------|-------------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Yes | Full path to signal file (e.g., `/checkpoint-signal/my-checkpoint.done`) | | `DYN_CHECKPOINT_SIGNAL_FILE` | Yes | Full path to signal file (e.g., `/checkpoint-signal/my-checkpoint.done`) |
| `DYN_CHECKPOINT_READY_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/checkpoint-ready`) | | `DYN_READY_FOR_CHECKPOINT_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/ready-for-checkpoint`) |
| `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (alphanumeric string) | | `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (alphanumeric string) |
| `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123`) | | `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123`) |
| `DYN_CHECKPOINT_STORAGE_TYPE` | Yes | Storage backend: `pvc`, `s3`, or `oci` | | `DYN_CHECKPOINT_STORAGE_TYPE` | Yes | Storage backend: `pvc`, `s3`, or `oci` |
...@@ -472,7 +471,7 @@ spec: ...@@ -472,7 +471,7 @@ spec:
|----------|----------|-------------| |----------|----------|-------------|
| `DYN_CHECKPOINT_HASH` | Yes | Checkpoint identifier (must match checkpoint job) | | `DYN_CHECKPOINT_HASH` | Yes | Checkpoint identifier (must match checkpoint job) |
| `DYN_CHECKPOINT_PATH` | Yes | Base checkpoint directory (hash appended automatically) | | `DYN_CHECKPOINT_PATH` | Yes | Base checkpoint directory (hash appended automatically) |
| `DYN_RESTORE_MARKER_FILE` | No | Path for restore marker file (default: `/tmp/dynamo-restored`) | | `DYN_RESTORE_MARKER_FILE` | Yes | Path for restore marker file |
### Optional CRIU Tuning (Advanced) ### Optional CRIU Tuning (Advanced)
...@@ -503,7 +502,7 @@ spec: ...@@ -503,7 +502,7 @@ spec:
┌─────────────────────────────────────────────────────────────┐ ┌─────────────────────────────────────────────────────────────┐
│ 2. Application loads model and creates ready file │ │ 2. Application loads model and creates ready file │
│ /tmp/checkpoint-ready │ /tmp/ready-for-checkpoint │
└──────────────────────┬──────────────────────────────────────┘ └──────────────────────┬──────────────────────────────────────┘
...@@ -516,7 +515,7 @@ spec: ...@@ -516,7 +515,7 @@ spec:
│ 4. ChReK DaemonSet detects: │ │ 4. ChReK DaemonSet detects: │
│ - Pod is Ready │ │ - Pod is Ready │
│ - Has checkpoint-source label │ │ - Has checkpoint-source label │
│ - Ready file exists: /tmp/checkpoint-ready │ - Ready file exists: /tmp/ready-for-checkpoint │
└──────────────────────┬──────────────────────────────────────┘ └──────────────────────┬──────────────────────────────────────┘
...@@ -606,7 +605,7 @@ spec: ...@@ -606,7 +605,7 @@ spec:
3. Check ready file was created: 3. Check ready file was created:
```bash ```bash
kubectl exec <pod-name> -- ls -la /tmp/checkpoint-ready kubectl exec <pod-name> -- ls -la /tmp/ready-for-checkpoint
``` ```
4. Check DaemonSet logs: 4. Check DaemonSet logs:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment