w.log.V(1).Info("Checkpoint not ready on disk, skipping restore","pod",podKey,"checkpoint_hash",checkpointHash)
return
}
if!w.tryAcquire(podKey){
return
}
returnfalse
}
w.log.Info("Restore pod running, triggering external restore","pod",podKey,"checkpoint_hash",checkpointHash)
emitPodEvent(ctx,w.clientset,w.log,pod,"chrek",corev1.EventTypeNormal,"RestoreRequested",fmt.Sprintf("Restore requested from checkpoint %s",checkpointHash))
// doCheckpoint performs the checkpoint and writes the signal file
log.WithError(err).WithField("path",markerPath).Error("Failed to write checkpoint.done marker")
return
emitPodEvent(ctx,w.clientset,log,pod,"chrek",corev1.EventTypeNormal,"RestoreSucceeded",fmt.Sprintf("Restore completed from checkpoint %s",checkpointHash))
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. See [Prerequisites](#prerequisites) for security considerations.
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The DaemonSet runs in privileged mode to perform CRIU operations. See [Prerequisites](#prerequisites) for security considerations.
This Helm chart deploys the checkpoint/restore infrastructure for NVIDIA Dynamo, including:
- Persistent Volume Claim (PVC) for checkpoint storage
...
...
@@ -14,14 +14,14 @@ This Helm chart deploys the checkpoint/restore infrastructure for NVIDIA Dynamo,
## Prerequisites
⚠️ **Security Warning**: ChReK restore operations require **privileged mode**, which grants containers elevated host access. This may violate security policies in production environments. Only deploy in environments where privileged containers are acceptable.
⚠️ **Security Warning**: The ChReK DaemonSet runs in **privileged mode** with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU checkpoint/restore operations. Workload pods do not need privileged mode. Only deploy in environments where a privileged DaemonSet is acceptable.
- Kubernetes 1.21+
- GPU nodes with NVIDIA runtime (`nvidia` runtime class)
- CRIU support in the container runtime (containerd with CRIU plugin)
- NVIDIA Dynamo operator installed (cluster-wide or namespace-scoped)
- containerd runtime (for container inspection; CRIU is bundled in ChReK images)
- NVIDIA Dynamo operator installed (cluster-wide or namespace-scoped), **or** manual pod configuration — see [Standalone Usage](../../../../docs/pages/kubernetes/chrek/standalone.md#using-chrek-without-the-dynamo-operator) for required labels, seccomp profiles, command overrides, and deployment strategy when running without the operator
- RWX (ReadWriteMany) storage class for multi-node deployments
- **Security clearance for privileged pods** (required for restore operations)
- **Security clearance for privileged DaemonSet** (the ChReK agent runs privileged with hostPID/hostIPC/hostNetwork)
## Installation
...
...
@@ -63,11 +63,10 @@ See `values.yaml` for all configuration options.
| `storage.pvc.name` | PVC name (must match operator config) | `chrek-pvc` |
| `storage.pvc.size` | PVC size | `100Gi` |
| `storage.pvc.storageClass` | Storage class name | `""` (default) |
@@ -165,7 +165,6 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.checkpoint.readyForCheckpointFilePath | string | `"/tmp/ready-for-checkpoint"` | Path written by worker when model is loaded and ready for checkpointing |
| dynamo-operator.checkpoint.restoreMarkerFilePath | string | `"/tmp/dynamo-restored"` | Path written by restore-entrypoint after successful CRIU restore |
| dynamo-operator.checkpoint.storage.signalHostPath | string | `"/var/lib/chrek/signals"` | Host path for signal files (communication between checkpoint pod and DaemonSet) |
| dynamo-operator.checkpoint.storage.pvc.pvcName | string | `"chrek-pvc"` | Name of the PVC created by the chrek chart |
| dynamo-operator.checkpoint.storage.pvc.basePath | string | `"/checkpoints"` | Base path within the PVC for storing checkpoints |
| dynamo-operator.checkpoint.storage.s3.uri | string | `""` | S3 URI in format: s3://[endpoint/]bucket/prefix |