Commit 6831020f (unverified), authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

chore: rename chrek to Dynamo Snapshot (#7028)


Signed-off-by: Schwinn Saereesitthipitak <17022745+galletas1712@users.noreply.github.com>
parent 7dbebf3c
// Package watcher provides Kubernetes pod watching for automatic checkpoint/restore.
// The watcher is the sole entry point for chrek operations — it detects pods with
// The watcher is the sole entry point for snapshot operations — it detects pods with
// checkpoint/restore labels and calls the orchestrators directly.
package watcher
......@@ -23,17 +23,17 @@ import (
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/orchestrate"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/orchestrate"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
)
// Pod label and annotation keys used by the watcher, all under the
// "nvidia.com/" prefix.
//
// NOTE(review): each name is listed twice in this view — first with its old
// "chrek-" value, then with its "snapshot-" replacement. This looks like the
// before/after sides of a rename diff rather than two live declarations; confirm
// which side is current before relying on a value.
const (
// Old ("chrek-") values.
kubeLabelIsCheckpointSource = "nvidia.com/chrek-is-checkpoint-source"
kubeLabelCheckpointHash = "nvidia.com/chrek-checkpoint-hash"
kubeLabelIsRestoreTarget = "nvidia.com/chrek-is-restore-target"
kubeAnnotationCheckpointStatus = "nvidia.com/chrek-checkpoint-status"
kubeAnnotationRestoreStatus = "nvidia.com/chrek-restore-status"
// New ("snapshot-") values.
// kubeLabelIsCheckpointSource: label key; presumably marks the pod to be checkpointed — verify against the handlers.
kubeLabelIsCheckpointSource = "nvidia.com/snapshot-is-checkpoint-source"
// kubeLabelCheckpointHash: label key; presumably carries the checkpoint hash the handlers log and pass on — TODO confirm.
kubeLabelCheckpointHash = "nvidia.com/snapshot-checkpoint-hash"
// kubeLabelIsRestoreTarget: label key; presumably marks the pod to be restored into — verify against the handlers.
kubeLabelIsRestoreTarget = "nvidia.com/snapshot-is-restore-target"
// kubeAnnotationCheckpointStatus: annotation key that doCheckpoint sets to "failed" on its error paths.
kubeAnnotationCheckpointStatus = "nvidia.com/snapshot-checkpoint-status"
// kubeAnnotationRestoreStatus: annotation key that doRestore sets to "failed" on error and "completed" on success.
kubeAnnotationRestoreStatus = "nvidia.com/snapshot-restore-status"
)
// Watcher watches for pods with checkpoint/restore labels and triggers operations.
......@@ -199,7 +199,7 @@ func (w *Watcher) handleCheckpointPodEvent(ctx context.Context, pod *corev1.Pod)
}
w.log.Info("Pod ready, triggering checkpoint", "pod", podKey, "checkpoint_hash", checkpointHash)
emitPodEvent(ctx, w.clientset, w.log, pod, "chrek", corev1.EventTypeNormal, "CheckpointRequested", fmt.Sprintf("Checkpoint requested: %s", checkpointHash))
emitPodEvent(ctx, w.clientset, w.log, pod, "snapshot", corev1.EventTypeNormal, "CheckpointRequested", fmt.Sprintf("Checkpoint requested: %s", checkpointHash))
go w.doCheckpoint(ctx, pod, checkpointHash, podKey)
}
......@@ -249,7 +249,7 @@ func (w *Watcher) handleRestorePodEvent(ctx context.Context, pod *corev1.Pod) {
}
w.log.Info("Restore pod running, triggering external restore", "pod", podKey, "checkpoint_hash", checkpointHash)
emitPodEvent(ctx, w.clientset, w.log, pod, "chrek", corev1.EventTypeNormal, "RestoreRequested", fmt.Sprintf("Restore requested from checkpoint %s", checkpointHash))
emitPodEvent(ctx, w.clientset, w.log, pod, "snapshot", corev1.EventTypeNormal, "RestoreRequested", fmt.Sprintf("Restore requested from checkpoint %s", checkpointHash))
go w.doRestore(ctx, pod, checkpointHash, podKey)
}
......@@ -276,7 +276,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
if containerName == "" {
err := fmt.Errorf("no containers found in pod spec")
log.Error(err, "Checkpoint failed")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "CheckpointFailed", err.Error())
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"})
return
}
......@@ -288,7 +288,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
}
}
if containerID == "" {
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "CheckpointFailed", "Could not resolve target container ID")
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", "Could not resolve target container ID")
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"})
return
}
......@@ -297,7 +297,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
containerPID, _, err := common.ResolveContainer(ctx, w.containerd, containerID)
if err != nil {
log.Error(err, "Failed to resolve container")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "CheckpointFailed", fmt.Sprintf("Container resolve failed: %v", err))
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", fmt.Sprintf("Container resolve failed: %v", err))
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"})
return
}
......@@ -314,7 +314,7 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
}
if err := orchestrate.Checkpoint(ctx, w.containerd, log, req, w.config); err != nil {
log.Error(err, "Checkpoint failed")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "CheckpointFailed", err.Error())
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error())
// SIGKILL on failure: process is unrecoverable (CUDA locked), terminate immediately
if signalErr := common.SendSignalToPID(log, containerPID, syscall.SIGKILL, "checkpoint failed"); signalErr != nil {
log.Error(signalErr, "Failed to signal checkpoint failure to runtime process")
......@@ -324,10 +324,10 @@ func (w *Watcher) doCheckpoint(ctx context.Context, pod *corev1.Pod, checkpointH
}
// Step 2: SIGUSR1 on success: notify the workload that checkpoint completed
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeNormal, "CheckpointSucceeded", fmt.Sprintf("Checkpoint completed: %s", checkpointHash))
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeNormal, "CheckpointSucceeded", fmt.Sprintf("Checkpoint completed: %s", checkpointHash))
if err := common.SendSignalToPID(log, containerPID, syscall.SIGUSR1, "checkpoint complete"); err != nil {
log.Error(err, "Failed to signal checkpoint completion to runtime process")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "CheckpointFailed", err.Error())
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationCheckpointStatus: "failed"})
return
}
......@@ -356,7 +356,7 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
if containerName == "" {
err := fmt.Errorf("no containers found in pod spec")
log.Error(err, "Restore failed")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "RestoreFailed", err.Error())
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"})
return
}
......@@ -373,7 +373,7 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
restoredPID, err := orchestrate.Restore(ctx, w.containerd, log, req)
if err != nil {
log.Error(err, "External restore failed")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "RestoreFailed", err.Error())
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"})
return
}
......@@ -382,13 +382,13 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
placeholderHostPID, _, err := common.ResolveContainerByPod(ctx, w.containerd, pod.Name, pod.Namespace, containerName)
if err != nil {
log.Error(err, "Failed to resolve placeholder host PID for signaling")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "RestoreFailed", err.Error())
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"})
return
}
if err := common.SendSignalViaPIDNamespace(ctx, log, placeholderHostPID, restoredPID, syscall.SIGCONT, "restore complete"); err != nil {
log.Error(err, "Failed to signal restored runtime process")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "RestoreFailed", err.Error())
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"})
return
}
......@@ -402,12 +402,12 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
}
if err := waitForPodReady(readyCtx, w.clientset, pod.Namespace, pod.Name, containerName); err != nil {
log.Error(err, "Restore post-signal readiness check failed")
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeWarning, "RestoreFailed", err.Error())
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "RestoreFailed", err.Error())
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "failed"})
return
}
emitPodEvent(ctx, w.clientset, log, pod, "chrek", corev1.EventTypeNormal, "RestoreSucceeded", fmt.Sprintf("Restore completed from checkpoint %s", checkpointHash))
emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeNormal, "RestoreSucceeded", fmt.Sprintf("Restore completed from checkpoint %s", checkpointHash))
annotatePod(ctx, w.clientset, log, pod, map[string]string{kubeAnnotationRestoreStatus: "completed"})
}
......
......@@ -12,7 +12,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
)
const testNodeName = "test-node"
......
......@@ -54,10 +54,10 @@ navigation:
- page: Inference Gateway (GAIE)
path: kubernetes/inference-gateway.md
- section: Checkpointing
path: kubernetes/chrek/README.md
path: kubernetes/snapshot/README.md
contents:
- page: Integration with Dynamo
path: kubernetes/chrek/dynamo.md
path: kubernetes/snapshot/dynamo.md
- section: Observability (K8s)
contents:
- page: Metrics
......
......@@ -230,7 +230,7 @@ Key customization points include:
- **[Operator Documentation](dynamo-operator.md)** - How the platform works
- **[Service Discovery](service-discovery.md)** - Discovery backends and configuration
- **[Helm Charts](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/README.md)** - For advanced users
- **[Checkpointing](chrek/README.md)** - Fast pod startup with checkpoint/restore
- **[Checkpointing](snapshot/README.md)** - Fast pod startup with checkpoint/restore
- **[GitOps Deployment with FluxCD](fluxcd.md)** - For advanced users
- **[Logging](observability/logging.md)** - For logging setup
- **[Multinode Deployment](deployment/multinode-deployment.md)** - For multinode deployment
......
......@@ -1683,7 +1683,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `pvcName` _string_ | PVCName is the name of the PVC | chrek-pvc | |
| `pvcName` _string_ | PVCName is the name of the PVC | snapshot-pvc | |
| `basePath` _string_ | BasePath is the base directory within the PVC | /checkpoints | |
......
......@@ -4,13 +4,13 @@
title: Checkpointing
---
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The ChReK DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.
> ⚠️ **Experimental Feature**: Dynamo Snapshot is currently in **beta/preview**. The Dynamo Snapshot DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.
**ChReK** (Checkpoint/Restore in Kubernetes) is an experimental infrastructure for fast-starting GPU applications using CRIU (Checkpoint/Restore in User-space). ChReK dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on-demand.
**Dynamo Snapshot** is an experimental checkpoint/restore infrastructure for Kubernetes that fast-starts GPU applications using CRIU (Checkpoint/Restore in User-space). Dynamo Snapshot dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on-demand.
## What is ChReK?
## What is Dynamo Snapshot?
ChReK provides:
Dynamo Snapshot provides:
- **Fast cold starts**: Restore GPU-accelerated applications in seconds instead of minutes
- **CUDA state preservation**: Checkpoint and restore GPU memory and CUDA contexts
- **Kubernetes-native**: Integrates seamlessly with Kubernetes primitives
......@@ -21,7 +21,7 @@ ChReK provides:
### 1. With NVIDIA Dynamo Platform (Recommended)
Use ChReK as part of the Dynamo platform for automatic checkpoint management:
Use Dynamo Snapshot as part of the Dynamo platform for automatic checkpoint management:
- Automatic checkpoint creation and lifecycle management
- Seamless integration with DynamoGraphDeployment CRDs
- Built-in autoscaling with fast restore
......@@ -30,9 +30,9 @@ Use ChReK as part of the Dynamo platform for automatic checkpoint management:
## Architecture
ChReK consists of two main components:
Dynamo Snapshot consists of two main components:
### 1. ChReK Helm Chart
### 1. Dynamo Snapshot Helm Chart
Deploys the checkpoint/restore infrastructure:
- **DaemonSet**: Runs on GPU nodes to perform CRIU checkpoint operations
- **PVC**: Stores checkpoint data (rootfs diffs, CUDA memory state)
......@@ -46,10 +46,10 @@ The DaemonSet performs checkpoint/restore externally using `nsenter` to enter po
## Quick Start
To install the ChReK DaemonSet in your cluster, run the following:
To install the Dynamo Snapshot DaemonSet in your cluster, run the following:
```bash
helm install chrek nvidia/chrek \
helm install snapshot nvidia/snapshot \
--namespace my-team \
--create-namespace \
--set storage.pvc.size=100Gi
......@@ -77,10 +77,10 @@ helm install chrek nvidia/chrek \
## Limitations
⚠️ **Important**: ChReK has significant limitations that may impact production readiness:
⚠️ **Important**: Dynamo Snapshot has significant limitations that may impact production readiness:
### Security Considerations
- **🔴 Privileged DaemonSet**: The ChReK DaemonSet runs in privileged mode with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU operations. Workload pods do **not** need privileged mode — all CRIU privilege lives in the DaemonSet.
- **🔴 Privileged DaemonSet**: The Dynamo Snapshot DaemonSet runs in privileged mode with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU operations. Workload pods do **not** need privileged mode — all CRIU privilege lives in the DaemonSet.
- **Security Impact**: The privileged DaemonSet can:
- Access all host devices and processes
- Bypass most security restrictions
......@@ -95,7 +95,7 @@ helm install chrek nvidia/chrek \
- **Storage**: Only PVC storage is currently implemented (S3/OCI planned)
### Recommendation
ChReK is best suited for:
Dynamo Snapshot is best suited for:
- ✅ Development and testing environments
- ✅ Research and experimentation
- ✅ Controlled production environments with appropriate security controls
......@@ -104,8 +104,8 @@ ChReK is best suited for:
## Documentation
### Getting Started
- [Dynamo Integration Guide](dynamo.md) - Using ChReK with Dynamo Platform
- [ChReK Helm Chart README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/chrek/README.md) - Helm chart configuration
- [Dynamo Integration Guide](dynamo.md) - Using Dynamo Snapshot with Dynamo Platform
- [Dynamo Snapshot Helm Chart README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/snapshot/README.md) - Helm chart configuration
### Related Documentation
- [CRIU Documentation](https://criu.org/Main_Page) - Upstream CRIU docs
......@@ -114,13 +114,13 @@ ChReK is best suited for:
- Kubernetes 1.21+
- GPU nodes with NVIDIA runtime (`nvidia` runtime class)
- containerd runtime (for container inspection; CRIU is bundled in ChReK images)
- containerd runtime (for container inspection; CRIU is bundled in Dynamo Snapshot images)
- RWX storage class (for multi-node deployments)
- **Security clearance for privileged DaemonSet** (the ChReK agent runs privileged with hostPID/hostIPC/hostNetwork)
- **Security clearance for privileged DaemonSet** (the Dynamo Snapshot agent runs privileged with hostPID/hostIPC/hostNetwork)
## Contributing
ChReK is part of the NVIDIA Dynamo project. Contributions are welcome!
Dynamo Snapshot is part of the NVIDIA Dynamo project. Contributions are welcome!
## License
......
......@@ -4,7 +4,7 @@
title: Integration with Dynamo
---
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The ChReK DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.
> ⚠️ **Experimental Feature**: Dynamo Snapshot is currently in **beta/preview**. The Dynamo Snapshot DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.
Checkpointing captures the complete state of a running worker pod (including GPU memory) and saves it to storage. New pods can restore from this checkpoint instead of performing a full cold start.
......@@ -16,30 +16,30 @@ Checkpointing captures the complete state of a running worker pod (including GPU
## Prerequisites
- Dynamo Platform installed (v0.4.0+) on k8s cluster with GPU nodes
- ChReK Helm chart installed (separate from platform)
- Dynamo Snapshot Helm chart installed (separate from platform)
- RWX PVC storage (PVC is currently the only supported backend)
## Quick Start
### 1. Install ChReK Infrastructure
### 1. Install Dynamo Snapshot Infrastructure
First, install the ChReK Helm chart in each namespace where you need checkpointing:
First, install the Dynamo Snapshot Helm chart in each namespace where you need checkpointing:
```bash
# Install ChReK infrastructure
helm install chrek nvidia/chrek \
# Install Dynamo Snapshot infrastructure
helm install snapshot nvidia/snapshot \
--namespace my-team \
--create-namespace \
--set storage.pvc.size=100Gi
```
This creates:
- A PVC for checkpoint storage (`chrek-pvc`)
- A DaemonSet for CRIU operations (`chrek-agent`)
- A PVC for checkpoint storage (`snapshot-pvc`)
- A DaemonSet for CRIU operations (`snapshot-agent`)
### 2. Configure Operator Values
Update your Helm values to point to the ChReK infrastructure:
Update your Helm values to point to the Dynamo Snapshot infrastructure:
```yaml
# values.yaml
......@@ -49,9 +49,9 @@ dynamo-operator:
storage:
type: pvc # Only PVC is currently supported (S3/OCI planned)
pvc:
pvcName: "chrek-pvc" # Must match ChReK chart
pvcName: "snapshot-pvc" # Must match Dynamo Snapshot chart
basePath: "/checkpoints"
signalHostPath: "/var/lib/chrek/signals" # Must match ChReK chart
signalHostPath: "/var/lib/snapshot/signals" # Must match Dynamo Snapshot chart
```
### 3. Configure Your DGD
......@@ -361,7 +361,7 @@ Or use `auto` mode and the operator will find/create it automatically.
- **Single-GPU only**: Multi-GPU configurations are not yet supported (planned)
- **Network state**: Active TCP connections are closed during restore (handled with `tcp-close` CRIU option)
- **Storage**: Only PVC backend currently implemented (S3/OCI planned)
- **Security**: ChReK runs as a **privileged DaemonSet** which is required to run CRIU
- **Security**: Dynamo Snapshot runs as a **privileged DaemonSet**, which is required to run CRIU
## Troubleshooting
......@@ -369,13 +369,13 @@ Or use `auto` mode and the operator will find/create it automatically.
1. Check the checkpoint job:
```bash
kubectl get jobs -l nvidia.com/chrek-is-checkpoint-source=true -n dynamo-system
kubectl get jobs -l nvidia.com/snapshot-is-checkpoint-source=true -n dynamo-system
kubectl logs job/checkpoint-<name> -n dynamo-system
```
2. Check the DaemonSet:
```bash
kubectl logs daemonset/chrek-agent -n dynamo-system
kubectl logs daemonset/snapshot-agent -n dynamo-system
```
3. Verify storage access:
......@@ -510,8 +510,8 @@ spec:
## Related Documentation
- [ChReK Overview](README.md) - ChReK architecture and use cases
- [ChReK Helm Chart README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/chrek/README.md) - Chart configuration
- [Dynamo Snapshot Overview](README.md) - Dynamo Snapshot architecture and use cases
- [Dynamo Snapshot Helm Chart README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/snapshot/README.md) - Chart configuration
- [Installation Guide](../installation-guide.md) - Platform installation
- [API Reference](../api-reference.md) - Complete CRD specifications
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment