Unverified commit 6831020f, authored by Schwinn Saereesitthipitak and committed by GitHub
Browse files

chore: rename chrek to Dynamo Snapshot (#7028)


Signed-off-by: Schwinn Saereesitthipitak <17022745+galletas1712@users.noreply.github.com>
parent 7dbebf3c
// Package watcher provides Kubernetes pod watching for automatic checkpoint/restore. // Package watcher provides Kubernetes pod watching for automatic checkpoint/restore.
// The watcher is the sole entry point for chrek operations — it detects pods with // The watcher is the sole entry point for snapshot operations — it detects pods with
// checkpoint/restore labels and calls the orchestrators directly. // checkpoint/restore labels and calls the orchestrators directly.
package watcher package watcher
...@@ -23,17 +23,17 @@ import ( ...@@ -23,17 +23,17 @@ import (
"k8s.io/client-go/rest" "k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/cache"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common" "github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/orchestrate" "github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/orchestrate"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
) )
const ( const (
kubeLabelIsCheckpointSource = "nvidia.com/chrek-is-checkpoint-source" kubeLabelIsCheckpointSource = "nvidia.com/snapshot-is-checkpoint-source"
kubeLabelCheckpointHash = "nvidia.com/chrek-checkpoint-hash" kubeLabelCheckpointHash = "nvidia.com/snapshot-checkpoint-hash"
kubeLabelIsRestoreTarget = "nvidia.com/chrek-is-restore-target" kubeLabelIsRestoreTarget = "nvidia.com/snapshot-is-restore-target"
kubeAnnotationCheckpointStatus = "nvidia.com/chrek-checkpoint-status" kubeAnnotationCheckpointStatus = "nvidia.com/snapshot-checkpoint-status"
kubeAnnotationRestoreStatus = "nvidia.com/chrek-restore-status" kubeAnnotationRestoreStatus = "nvidia.com/snapshot-restore-status"
) )
// Watcher watches for pods with checkpoint/restore labels and triggers operations. // Watcher watches for pods with checkpoint/restore labels and triggers operations.
...@@ -199,7 +199,7 @@ func (w *Watcher) handleCheckpointPodEvent(ctx context.Context, pod *corev1.Pod) ...@@ -199,7 +199,7 @@ func (w *Watcher) handleCheckpointPodEvent(ctx context.Context, pod *corev1.Pod)
} }
w.log.Info("Pod ready, triggering checkpoint", "pod", podKey, "checkpoint_hash", checkpointHash) w.log.Info("Pod ready, triggering checkpoint", "pod", podKey, "checkpoint_hash", checkpointHash)
emitPodEvent(ctx, w.clientset, w.log, pod, "chrek", corev1.EventTypeNormal, "CheckpointRequested", fmt.Sprintf("Checkpoint requested: %s", checkpointHash)) emitPodEvent(ctx, w.clientset, w.log, pod, "snapshot", corev1.EventTypeNormal, "CheckpointRequested", fmt.Sprintf("Checkpoint requested: %s", checkpointHash))
go w.doCheckpoint(ctx, pod, checkpointHash, podKey) go w.doCheckpoint(ctx, pod, checkpointHash, podKey)
} }
...@@ -249,7 +249,7 @@ func (w *Watcher) handleRestorePodEvent(ctx context.Context, pod *corev1.Pod) { ...@@ -249,7 +249,7 @@ func (w *Watcher) handleRestorePodEvent(ctx context.Context, pod *corev1.Pod) {
} }
w.log.Info("Restore pod running, triggering external restore", "pod", podKey, "checkpoint_hash", checkpointHash) w.log.Info("Restore pod running, triggering external restore", "pod", podKey, "checkpoint_hash", checkpointHash)
emitPodEvent(ctx, w.clientset, w.log, pod, "chrek", corev1.EventTypeNormal, "RestoreRequested", fmt.Sprintf("Restore requested from checkpoint %s", checkpointHash)) emitPodEvent(ctx, w.clientset, w.log, pod, "snapshot", corev1.EventTypeNormal, "RestoreRequested", fmt.Sprintf("Restore requested from checkpoint %s", checkpointHash))
go w.doRestore(ctx, pod, checkpointHash, podKey) go w.doRestore(ctx, pod, checkpointHash, podKey)
} }
...@@ -276,7 +276,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH ...@@ -276,7 +276,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
if containerName == "" { if containerName == "" {
err := fmt.Errorf("no containers found in pod spec") err := fmt.Errorf("no containers found in pod spec")
log.Error(err, "Checkpoint failed") log.Error(err, "Checkpoint failed")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"}) annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"})
return return
} }
...@@ -288,7 +288,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH ...@@ -288,7 +288,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
} }
} }
if containerID == "" { if containerID == "" {
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "CheckpointFailed", "Could not resolve target container ID") emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", "Could not resolve target container ID")
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"}) annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"})
return return
} }
...@@ -297,7 +297,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH ...@@ -297,7 +297,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
containerPID, _, err := common.ResolveContainer(ctx, w.containerd, containerID) containerPID, _, err := common.ResolveContainer(ctx, w.containerd, containerID)
if err != nil { if err != nil {
log.Error(err, "Failed to resolve container") log.Error(err, "Failed to resolve container")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "CheckpointFailed", fmt.Sprintf("Container resolve failed: %v", err)) emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", fmt.Sprintf("Container resolve failed: %v", err))
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"}) annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"})
return return
} }
...@@ -314,7 +314,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH ...@@ -314,7 +314,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
} }
if err := orchestrate.Checkpoint(ctx, w.containerd, log, req, w.config); err != nil { if err := orchestrate.Checkpoint(ctx, w.containerd, log, req, w.config); err != nil {
log.Error(err, "Checkpoint failed") log.Error(err, "Checkpoint failed")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error())
// SIGKILL on failure: process is unrecoverable (CUDA locked), terminate immediately // SIGKILL on failure: process is unrecoverable (CUDA locked), terminate immediately
if signalErr := common.SendSignalToPID(log, containerPID, syscall.SIGKILL, "checkpoint failed"); signalErr != nil { if signalErr := common.SendSignalToPID(log, containerPID, syscall.SIGKILL, "checkpoint failed"); signalErr != nil {
log.Error(signalErr, "Failed to signal checkpoint failure to runtime process") log.Error(signalErr, "Failed to signal checkpoint failure to runtime process")
...@@ -324,10 +324,10 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH ...@@ -324,10 +324,10 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
} }
// Step 2: SIGUSR1 on success: notify the workload that checkpoint completed // Step 2: SIGUSR1 on success: notify the workload that checkpoint completed
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeNormal, "CheckpointSucceeded", fmt.Sprintf("Checkpoint completed: %s", checkpointHash)) emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeNormal, "CheckpointSucceeded", fmt.Sprintf("Checkpoint completed: %s", checkpointHash))
if err := common.SendSignalToPID(log, containerPID, syscall.SIGUSR1, "checkpoint complete"); err != nil { if err := common.SendSignalToPID(log, containerPID, syscall.SIGUSR1, "checkpoint complete"); err != nil {
log.Error(err, "Failed to signal checkpoint completion to runtime process") log.Error(err, "Failed to signal checkpoint completion to runtime process")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"}) annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"})
return return
} }
...@@ -356,7 +356,7 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash ...@@ -356,7 +356,7 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
if containerName == "" { if containerName == "" {
err := fmt.Errorf("no containers found in pod spec") err := fmt.Errorf("no containers found in pod spec")
log.Error(err, "Restore failed") log.Error(err, "Restore failed")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "RestoreFailed", err.Error()) emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"}) annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"})
return return
} }
...@@ -373,7 +373,7 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash ...@@ -373,7 +373,7 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
restoredPID, err := orchestrate.Restore(ctx, w.containerd, log, req) restoredPID, err := orchestrate.Restore(ctx, w.containerd, log, req)
if err != nil { if err != nil {
log.Error(err, "External restore failed") log.Error(err, "External restore failed")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "RestoreFailed", err.Error()) emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"}) annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"})
return return
} }
...@@ -382,13 +382,13 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash ...@@ -382,13 +382,13 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
placeholderHostPID, _, err := common.ResolveContainerByPod(ctx, w.containerd, pod.Name, pod.Namespace, containerName) placeholderHostPID, _, err := common.ResolveContainerByPod(ctx, w.containerd, pod.Name, pod.Namespace, containerName)
if err != nil { if err != nil {
log.Error(err, "Failed to resolve placeholder host PID for signaling") log.Error(err, "Failed to resolve placeholder host PID for signaling")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "RestoreFailed", err.Error()) emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"}) annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"})
return return
} }
if err := common.SendSignalViaPIDNamespace(ctx, log, placeholderHostPID, restoredPID, syscall.SIGCONT, "restore complete"); err != nil { if err := common.SendSignalViaPIDNamespace(ctx, log, placeholderHostPID, restoredPID, syscall.SIGCONT, "restore complete"); err != nil {
log.Error(err, "Failed to signal restored runtime process") log.Error(err, "Failed to signal restored runtime process")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "RestoreFailed", err.Error()) emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"}) annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"})
return return
} }
...@@ -402,12 +402,12 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash ...@@ -402,12 +402,12 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
} }
if err := waitForPodReady(readyCtx, w.clientset, pod.Namespace, pod.Name, containerName); err != nil { if err := waitForPodReady(readyCtx, w.clientset, pod.Namespace, pod.Name, containerName); err != nil {
log.Error(err, "Restore post-signal readiness check failed") log.Error(err, "Restore post-signal readiness check failed")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "RestoreFailed", err.Error()) emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"}) annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"})
return return
} }
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeNormal, "RestoreSucceeded", fmt.Sprintf("Restore completed from checkpoint %s", checkpointHash)) emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeNormal, "RestoreSucceeded", fmt.Sprintf("Restore completed from checkpoint %s", checkpointHash))
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "completed"}) annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "completed"})
} }
......
...@@ -12,7 +12,7 @@ import ( ...@@ -12,7 +12,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/kubernetes/fake"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
) )
const testNodeName = "test-node" const testNodeName = "test-node"
......
...@@ -54,10 +54,10 @@ navigation: ...@@ -54,10 +54,10 @@ navigation:
- page: Inference Gateway (GAIE) - page: Inference Gateway (GAIE)
path: kubernetes/inference-gateway.md path: kubernetes/inference-gateway.md
- section: Checkpointing - section: Checkpointing
path: kubernetes/chrek/README.md path: kubernetes/snapshot/README.md
contents: contents:
- page: Integration with Dynamo - page: Integration with Dynamo
path: kubernetes/chrek/dynamo.md path: kubernetes/snapshot/dynamo.md
- section: Observability (K8s) - section: Observability (K8s)
contents: contents:
- page: Metrics - page: Metrics
......
...@@ -230,7 +230,7 @@ Key customization points include: ...@@ -230,7 +230,7 @@ Key customization points include:
- **[Operator Documentation](dynamo-operator.md)** - How the platform works - **[Operator Documentation](dynamo-operator.md)** - How the platform works
- **[Service Discovery](service-discovery.md)** - Discovery backends and configuration - **[Service Discovery](service-discovery.md)** - Discovery backends and configuration
- **[Helm Charts](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/README.md)** - For advanced users - **[Helm Charts](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/README.md)** - For advanced users
- **[Checkpointing](chrek/README.md)** - Fast pod startup with checkpoint/restore - **[Checkpointing](snapshot/README.md)** - Fast pod startup with checkpoint/restore
- **[GitOps Deployment with FluxCD](fluxcd.md)** - For advanced users - **[GitOps Deployment with FluxCD](fluxcd.md)** - For advanced users
- **[Logging](observability/logging.md)** - For logging setup - **[Logging](observability/logging.md)** - For logging setup
- **[Multinode Deployment](deployment/multinode-deployment.md)** - For multinode deployment - **[Multinode Deployment](deployment/multinode-deployment.md)** - For multinode deployment
......
...@@ -1683,7 +1683,7 @@ _Appears in:_ ...@@ -1683,7 +1683,7 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `pvcName` _string_ | PVCName is the name of the PVC | chrek-pvc | | | `pvcName` _string_ | PVCName is the name of the PVC | snapshot-pvc | |
| `basePath` _string_ | BasePath is the base directory within the PVC | /checkpoints | | | `basePath` _string_ | BasePath is the base directory within the PVC | /checkpoints | |
......
...@@ -4,13 +4,13 @@ ...@@ -4,13 +4,13 @@
title: Checkpointing title: Checkpointing
--- ---
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The ChReK DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details. > ⚠️ **Experimental Feature**: Dynamo Snapshot is currently in **beta/preview**. The Dynamo Snapshot DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.
**ChReK** (Checkpoint/Restore in Kubernetes) is an experimental infrastructure for fast-starting GPU applications using CRIU (Checkpoint/Restore in User-space). ChReK dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on-demand. **Dynamo Snapshot** (Checkpoint/Restore in Kubernetes) is an experimental infrastructure for fast-starting GPU applications using CRIU (Checkpoint/Restore in User-space). Dynamo Snapshot dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on-demand.
## What is ChReK? ## What is Dynamo Snapshot?
ChReK provides: Dynamo Snapshot provides:
- **Fast cold starts**: Restore GPU-accelerated applications in seconds instead of minutes - **Fast cold starts**: Restore GPU-accelerated applications in seconds instead of minutes
- **CUDA state preservation**: Checkpoint and restore GPU memory and CUDA contexts - **CUDA state preservation**: Checkpoint and restore GPU memory and CUDA contexts
- **Kubernetes-native**: Integrates seamlessly with Kubernetes primitives - **Kubernetes-native**: Integrates seamlessly with Kubernetes primitives
...@@ -21,7 +21,7 @@ ChReK provides: ...@@ -21,7 +21,7 @@ ChReK provides:
### 1. With NVIDIA Dynamo Platform (Recommended) ### 1. With NVIDIA Dynamo Platform (Recommended)
Use ChReK as part of the Dynamo platform for automatic checkpoint management: Use Dynamo Snapshot as part of the Dynamo platform for automatic checkpoint management:
- Automatic checkpoint creation and lifecycle management - Automatic checkpoint creation and lifecycle management
- Seamless integration with DynamoGraphDeployment CRDs - Seamless integration with DynamoGraphDeployment CRDs
- Built-in autoscaling with fast restore - Built-in autoscaling with fast restore
...@@ -30,9 +30,9 @@ Use ChReK as part of the Dynamo platform for automatic checkpoint management: ...@@ -30,9 +30,9 @@ Use ChReK as part of the Dynamo platform for automatic checkpoint management:
## Architecture ## Architecture
ChReK consists of two main components: Dynamo Snapshot consists of two main components:
### 1. ChReK Helm Chart ### 1. Dynamo Snapshot Helm Chart
Deploys the checkpoint/restore infrastructure: Deploys the checkpoint/restore infrastructure:
- **DaemonSet**: Runs on GPU nodes to perform CRIU checkpoint operations - **DaemonSet**: Runs on GPU nodes to perform CRIU checkpoint operations
- **PVC**: Stores checkpoint data (rootfs diffs, CUDA memory state) - **PVC**: Stores checkpoint data (rootfs diffs, CUDA memory state)
...@@ -46,10 +46,10 @@ The DaemonSet performs checkpoint/restore externally using `nsenter` to enter po ...@@ -46,10 +46,10 @@ The DaemonSet performs checkpoint/restore externally using `nsenter` to enter po
## Quick Start ## Quick Start
To install the ChReK DaemonSet in your cluster, run the following: To install the Dynamo Snapshot DaemonSet in your cluster, run the following:
```bash ```bash
helm install chrek nvidia/chrek \ helm install snapshot nvidia/snapshot \
--namespace my-team \ --namespace my-team \
--create-namespace \ --create-namespace \
--set storage.pvc.size=100Gi --set storage.pvc.size=100Gi
...@@ -77,10 +77,10 @@ helm install chrek nvidia/chrek \ ...@@ -77,10 +77,10 @@ helm install chrek nvidia/chrek \
## Limitations ## Limitations
⚠️ **Important**: ChReK has significant limitations that may impact production readiness: ⚠️ **Important**: Dynamo Snapshot has significant limitations that may impact production readiness:
### Security Considerations ### Security Considerations
- **🔴 Privileged DaemonSet**: The ChReK DaemonSet runs in privileged mode with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU operations. Workload pods do **not** need privileged mode — all CRIU privilege lives in the DaemonSet. - **🔴 Privileged DaemonSet**: The Dynamo Snapshot DaemonSet runs in privileged mode with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU operations. Workload pods do **not** need privileged mode — all CRIU privilege lives in the DaemonSet.
- **Security Impact**: The privileged DaemonSet can: - **Security Impact**: The privileged DaemonSet can:
- Access all host devices and processes - Access all host devices and processes
- Bypass most security restrictions - Bypass most security restrictions
...@@ -95,7 +95,7 @@ helm install chrek nvidia/chrek \ ...@@ -95,7 +95,7 @@ helm install chrek nvidia/chrek \
- **Storage**: Only PVC storage is currently implemented (S3/OCI planned) - **Storage**: Only PVC storage is currently implemented (S3/OCI planned)
### Recommendation ### Recommendation
ChReK is best suited for: Dynamo Snapshot is best suited for:
- ✅ Development and testing environments - ✅ Development and testing environments
- ✅ Research and experimentation - ✅ Research and experimentation
- ✅ Controlled production environments with appropriate security controls - ✅ Controlled production environments with appropriate security controls
...@@ -104,8 +104,8 @@ ChReK is best suited for: ...@@ -104,8 +104,8 @@ ChReK is best suited for:
## Documentation ## Documentation
### Getting Started ### Getting Started
- [Dynamo Integration Guide](dynamo.md) - Using ChReK with Dynamo Platform - [Dynamo Integration Guide](dynamo.md) - Using Dynamo Snapshot with Dynamo Platform
- [ChReK Helm Chart README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/chrek/README.md) - Helm chart configuration - [Dynamo Snapshot Helm Chart README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/snapshot/README.md) - Helm chart configuration
### Related Documentation ### Related Documentation
- [CRIU Documentation](https://criu.org/Main_Page) - Upstream CRIU docs - [CRIU Documentation](https://criu.org/Main_Page) - Upstream CRIU docs
...@@ -114,13 +114,13 @@ ChReK is best suited for: ...@@ -114,13 +114,13 @@ ChReK is best suited for:
- Kubernetes 1.21+ - Kubernetes 1.21+
- GPU nodes with NVIDIA runtime (`nvidia` runtime class) - GPU nodes with NVIDIA runtime (`nvidia` runtime class)
- containerd runtime (for container inspection; CRIU is bundled in ChReK images) - containerd runtime (for container inspection; CRIU is bundled in Dynamo Snapshot images)
- RWX storage class (for multi-node deployments) - RWX storage class (for multi-node deployments)
- **Security clearance for privileged DaemonSet** (the ChReK agent runs privileged with hostPID/hostIPC/hostNetwork) - **Security clearance for privileged DaemonSet** (the Dynamo Snapshot agent runs privileged with hostPID/hostIPC/hostNetwork)
## Contributing ## Contributing
ChReK is part of the NVIDIA Dynamo project. Contributions are welcome! Dynamo Snapshot is part of the NVIDIA Dynamo project. Contributions are welcome!
## License ## License
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
title: Integration with Dynamo title: Integration with Dynamo
--- ---
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The ChReK DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details. > ⚠️ **Experimental Feature**: Dynamo Snapshot is currently in **beta/preview**. The Dynamo Snapshot DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.
Checkpointing captures the complete state of a running worker pod (including GPU memory) and saves it to storage. New pods can restore from this checkpoint instead of performing a full cold start. Checkpointing captures the complete state of a running worker pod (including GPU memory) and saves it to storage. New pods can restore from this checkpoint instead of performing a full cold start.
...@@ -16,30 +16,30 @@ Checkpointing captures the complete state of a running worker pod (including GPU ...@@ -16,30 +16,30 @@ Checkpointing captures the complete state of a running worker pod (including GPU
## Prerequisites ## Prerequisites
- Dynamo Platform installed (v0.4.0+) on k8s cluster with GPU nodes - Dynamo Platform installed (v0.4.0+) on k8s cluster with GPU nodes
- ChReK Helm chart installed (separate from platform) - Dynamo Snapshot Helm chart installed (separate from platform)
- RWX PVC storage (PVC is currently the only supported backend) - RWX PVC storage (PVC is currently the only supported backend)
## Quick Start ## Quick Start
### 1. Install ChReK Infrastructure ### 1. Install Dynamo Snapshot Infrastructure
First, install the ChReK Helm chart in each namespace where you need checkpointing: First, install the Dynamo Snapshot Helm chart in each namespace where you need checkpointing:
```bash ```bash
# Install ChReK infrastructure # Install Dynamo Snapshot infrastructure
helm install chrek nvidia/chrek \ helm install snapshot nvidia/snapshot \
--namespace my-team \ --namespace my-team \
--create-namespace \ --create-namespace \
--set storage.pvc.size=100Gi --set storage.pvc.size=100Gi
``` ```
This creates: This creates:
- A PVC for checkpoint storage (`chrek-pvc`) - A PVC for checkpoint storage (`snapshot-pvc`)
- A DaemonSet for CRIU operations (`chrek-agent`) - A DaemonSet for CRIU operations (`snapshot-agent`)
### 2. Configure Operator Values ### 2. Configure Operator Values
Update your Helm values to point to the ChReK infrastructure: Update your Helm values to point to the Dynamo Snapshot infrastructure:
```yaml ```yaml
# values.yaml # values.yaml
...@@ -49,9 +49,9 @@ dynamo-operator: ...@@ -49,9 +49,9 @@ dynamo-operator:
storage: storage:
type: pvc # Only PVC is currently supported (S3/OCI planned) type: pvc # Only PVC is currently supported (S3/OCI planned)
pvc: pvc:
pvcName: "chrek-pvc" # Must match ChReK chart pvcName: "snapshot-pvc" # Must match Dynamo Snapshot chart
basePath: "/checkpoints" basePath: "/checkpoints"
signalHostPath: "/var/lib/chrek/signals" # Must match ChReK chart signalHostPath: "/var/lib/snapshot/signals" # Must match Dynamo Snapshot chart
``` ```
### 3. Configure Your DGD ### 3. Configure Your DGD
...@@ -361,7 +361,7 @@ Or use `auto` mode and the operator will find/create it automatically. ...@@ -361,7 +361,7 @@ Or use `auto` mode and the operator will find/create it automatically.
- **Single-GPU only**: Multi-GPU configurations are not yet supported (planned) - **Single-GPU only**: Multi-GPU configurations are not yet supported (planned)
- **Network state**: Active TCP connections are closed during restore (handled with `tcp-close` CRIU option) - **Network state**: Active TCP connections are closed during restore (handled with `tcp-close` CRIU option)
- **Storage**: Only PVC backend currently implemented (S3/OCI planned) - **Storage**: Only PVC backend currently implemented (S3/OCI planned)
- **Security**: ChReK runs as a **privileged DaemonSet** which is required to run CRIU - **Security**: Dynamo Snapshot runs as a **privileged DaemonSet** which is required to run CRIU
## Troubleshooting ## Troubleshooting
...@@ -369,13 +369,13 @@ Or use `auto` mode and the operator will find/create it automatically. ...@@ -369,13 +369,13 @@ Or use `auto` mode and the operator will find/create it automatically.
1. Check the checkpoint job: 1. Check the checkpoint job:
```bash ```bash
kubectl get jobs -l nvidia.com/chrek-is-checkpoint-source=true -n dynamo-system kubectl get jobs -l nvidia.com/snapshot-is-checkpoint-source=true -n dynamo-system
kubectl logs job/checkpoint-<name> -n dynamo-system kubectl logs job/checkpoint-<name> -n dynamo-system
``` ```
2. Check the DaemonSet: 2. Check the DaemonSet:
```bash ```bash
kubectl logs daemonset/chrek-agent -n dynamo-system kubectl logs daemonset/snapshot-agent -n dynamo-system
``` ```
3. Verify storage access: 3. Verify storage access:
...@@ -510,8 +510,8 @@ spec: ...@@ -510,8 +510,8 @@ spec:
## Related Documentation ## Related Documentation
- [ChReK Overview](README.md) - ChReK architecture and use cases - [Dynamo Snapshot Overview](README.md) - Dynamo Snapshot architecture and use cases
- [ChReK Helm Chart README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/chrek/README.md) - Chart configuration - [Dynamo Snapshot Helm Chart README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/snapshot/README.md) - Chart configuration
- [Installation Guide](../installation-guide.md) - Platform installation - [Installation Guide](../installation-guide.md) - Platform installation
- [API Reference](../api-reference.md) - Complete CRD specifications - [API Reference](../api-reference.md) - Complete CRD specifications
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment