Unverified commit bb8fc8a4, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

feat(chrek): external restore, signal-based IPC, and package refactor (#6286)


Co-authored-by: Dan Feigin <dfeigin@nvidia.com>
parent c8423b57
......@@ -1225,6 +1225,260 @@ func TestDynamoComponentDeploymentReconciler_createOrUpdateOrDeleteDeployments_R
g.Expect(deployment3).NotTo(gomega.BeNil())
}
// TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabels checks the
// restore-related labels on the pod template returned by generatePodTemplateSpec for a
// component that references a DynamoCheckpoint. A checkpoint in the Ready phase must yield
// both the restore-target label and the checkpoint hash label; a checkpoint still in the
// Creating phase must yield neither, even though the component's own labels ask for
// restore-target.
func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabels(t *testing.T) {
	testScheme := scheme.Scheme
	if err := v1alpha1.AddToScheme(testScheme); err != nil {
		t.Fatalf("Failed to add v1alpha1 to scheme: %v", err)
	}
	if err := corev1.AddToScheme(testScheme); err != nil {
		t.Fatalf("Failed to add corev1 to scheme: %v", err)
	}

	// newComponent builds a vLLM worker component that points at the named checkpoint and
	// carries a user-supplied restore-target label.
	newComponent := func(checkpointRef string) *v1alpha1.DynamoComponentDeployment {
		shared := v1alpha1.DynamoComponentDeploymentSharedSpec{
			ServiceName:     "worker",
			ComponentType:   commonconsts.ComponentTypeWorker,
			DynamoNamespace: ptr.To("default"),
			Labels: map[string]string{
				commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
				commonconsts.KubeLabelIsRestoreTarget:           commonconsts.KubeLabelValueTrue,
			},
			Checkpoint: &v1alpha1.ServiceCheckpointConfig{
				Enabled:       true,
				CheckpointRef: &checkpointRef,
			},
			ExtraPodSpec: &v1alpha1.ExtraPodSpec{
				MainContainer: &corev1.Container{
					Name:    commonconsts.MainContainerName,
					Image:   "test-image:latest",
					Command: []string{"python3"},
					Args:    []string{"-m", "dynamo.vllm"},
				},
			},
		}
		return &v1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-worker",
				Namespace: "default",
			},
			Spec: v1alpha1.DynamoComponentDeploymentSpec{
				BackendFramework:                    string(dynamo.BackendFrameworkVLLM),
				DynamoComponentDeploymentSharedSpec: shared,
			},
		}
	}

	// newReconciler returns a reconciler backed by a fake client seeded with objs and with
	// PVC-based checkpointing enabled.
	newReconciler := func(objs ...client.Object) *DynamoComponentDeploymentReconciler {
		checkpointCfg := controller_common.CheckpointConfig{
			Enabled: true,
			Storage: controller_common.CheckpointStorageConfig{
				Type: controller_common.CheckpointStorageTypePVC,
				PVC: controller_common.CheckpointPVCConfig{
					PVCName:  "chrek-pvc",
					BasePath: "/checkpoints",
				},
			},
		}
		fakeClient := fake.NewClientBuilder().
			WithScheme(testScheme).
			WithObjects(objs...).
			Build()
		return &DynamoComponentDeploymentReconciler{
			Client: fakeClient,
			Config: controller_common.Config{Checkpoint: checkpointCfg},
		}
	}

	cases := []struct {
		name           string
		checkpointName string
		status         v1alpha1.DynamoCheckpointStatus
		verify         func(t *testing.T, labels map[string]string)
	}{
		{
			name:           "ready checkpoint adds explicit restore labels",
			checkpointName: "ckpt-ready",
			status: v1alpha1.DynamoCheckpointStatus{
				Phase:        v1alpha1.DynamoCheckpointPhaseReady,
				IdentityHash: "hash-ready-1",
			},
			verify: func(t *testing.T, labels map[string]string) {
				if got := labels[commonconsts.KubeLabelIsRestoreTarget]; got != commonconsts.KubeLabelValueTrue {
					t.Fatalf("expected %s label to be true, got %q", commonconsts.KubeLabelIsRestoreTarget, got)
				}
				if got := labels[commonconsts.KubeLabelCheckpointHash]; got != "hash-ready-1" {
					t.Fatalf("expected %s to be checkpoint hash, got %q", commonconsts.KubeLabelCheckpointHash, got)
				}
			},
		},
		{
			name:           "non-ready checkpoint clears stale restore labels",
			checkpointName: "ckpt-pending",
			status: v1alpha1.DynamoCheckpointStatus{
				Phase:        v1alpha1.DynamoCheckpointPhaseCreating,
				IdentityHash: "hash-pending-1",
			},
			verify: func(t *testing.T, labels map[string]string) {
				if _, found := labels[commonconsts.KubeLabelIsRestoreTarget]; found {
					t.Fatalf("did not expect %s label when checkpoint is not ready", commonconsts.KubeLabelIsRestoreTarget)
				}
				if _, found := labels[commonconsts.KubeLabelCheckpointHash]; found {
					t.Fatalf("did not expect %s label when checkpoint is not ready", commonconsts.KubeLabelCheckpointHash)
				}
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			component := newComponent(tc.checkpointName)
			checkpointObj := &v1alpha1.DynamoCheckpoint{
				ObjectMeta: metav1.ObjectMeta{
					Name:      tc.checkpointName,
					Namespace: "default",
				},
				Status: tc.status,
			}
			reconciler := newReconciler(component, checkpointObj)

			podTemplateSpec, err := reconciler.generatePodTemplateSpec(
				context.Background(),
				generateResourceOption{dynamoComponentDeployment: component},
				dynamo.RoleMain,
			)
			if err != nil {
				t.Fatalf("generatePodTemplateSpec failed: %v", err)
			}
			tc.verify(t, podTemplateSpec.Labels)
		})
	}
}
// TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy checks the
// Deployment strategy chosen by generateDeployment for a component that references a
// DynamoCheckpoint. A checkpoint in the Ready phase must produce the Recreate strategy; a
// checkpoint still in the Creating phase must produce RollingUpdate. In both cases the
// Deployment must not be flagged for deletion.
func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy(t *testing.T) {
	testScheme := scheme.Scheme
	if err := v1alpha1.AddToScheme(testScheme); err != nil {
		t.Fatalf("Failed to add v1alpha1 to scheme: %v", err)
	}
	if err := corev1.AddToScheme(testScheme); err != nil {
		t.Fatalf("Failed to add corev1 to scheme: %v", err)
	}
	if err := appsv1.AddToScheme(testScheme); err != nil {
		t.Fatalf("Failed to add appsv1 to scheme: %v", err)
	}

	var replicas int32 = 1

	// newComponent builds a single-replica vLLM worker component that points at the named
	// checkpoint.
	newComponent := func(checkpointRef string) *v1alpha1.DynamoComponentDeployment {
		shared := v1alpha1.DynamoComponentDeploymentSharedSpec{
			ServiceName:     "worker",
			ComponentType:   commonconsts.ComponentTypeWorker,
			DynamoNamespace: ptr.To("default"),
			Replicas:        &replicas,
			Labels: map[string]string{
				commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
			},
			Checkpoint: &v1alpha1.ServiceCheckpointConfig{
				Enabled:       true,
				CheckpointRef: &checkpointRef,
			},
			ExtraPodSpec: &v1alpha1.ExtraPodSpec{
				MainContainer: &corev1.Container{
					Name:    commonconsts.MainContainerName,
					Image:   "test-image:latest",
					Command: []string{"python3"},
					Args:    []string{"-m", "dynamo.vllm"},
				},
			},
		}
		return &v1alpha1.DynamoComponentDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-worker",
				Namespace: "default",
			},
			Spec: v1alpha1.DynamoComponentDeploymentSpec{
				BackendFramework:                    string(dynamo.BackendFrameworkVLLM),
				DynamoComponentDeploymentSharedSpec: shared,
			},
		}
	}

	// newReconciler returns a reconciler backed by a fake client seeded with objs and with
	// PVC-based checkpointing enabled.
	newReconciler := func(objs ...client.Object) *DynamoComponentDeploymentReconciler {
		checkpointCfg := controller_common.CheckpointConfig{
			Enabled: true,
			Storage: controller_common.CheckpointStorageConfig{
				Type: controller_common.CheckpointStorageTypePVC,
				PVC: controller_common.CheckpointPVCConfig{
					PVCName:  "chrek-pvc",
					BasePath: "/checkpoints",
				},
			},
		}
		fakeClient := fake.NewClientBuilder().
			WithScheme(testScheme).
			WithObjects(objs...).
			Build()
		return &DynamoComponentDeploymentReconciler{
			Client: fakeClient,
			Config: controller_common.Config{Checkpoint: checkpointCfg},
		}
	}

	cases := []struct {
		name           string
		checkpointName string
		status         v1alpha1.DynamoCheckpointStatus
		wantStrategy   appsv1.DeploymentStrategyType
		mismatchFormat string // failure message used when the strategy differs from wantStrategy
	}{
		{
			name:           "ready checkpoint forces Recreate strategy",
			checkpointName: "ckpt-ready",
			status: v1alpha1.DynamoCheckpointStatus{
				Phase:        v1alpha1.DynamoCheckpointPhaseReady,
				IdentityHash: "hash-ready-1",
			},
			wantStrategy:   appsv1.RecreateDeploymentStrategyType,
			mismatchFormat: "expected Recreate strategy, got %s",
		},
		{
			name:           "non-ready checkpoint keeps RollingUpdate strategy",
			checkpointName: "ckpt-creating",
			status: v1alpha1.DynamoCheckpointStatus{
				Phase:        v1alpha1.DynamoCheckpointPhaseCreating,
				IdentityHash: "hash-creating-1",
			},
			wantStrategy:   appsv1.RollingUpdateDeploymentStrategyType,
			mismatchFormat: "expected RollingUpdate strategy, got %s",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			component := newComponent(tc.checkpointName)
			checkpointObj := &v1alpha1.DynamoCheckpoint{
				ObjectMeta: metav1.ObjectMeta{
					Name:      tc.checkpointName,
					Namespace: "default",
				},
				Status: tc.status,
			}
			reconciler := newReconciler(component, checkpointObj)

			deploy, toDelete, err := reconciler.generateDeployment(
				context.Background(),
				generateResourceOption{dynamoComponentDeployment: component},
			)
			if err != nil {
				t.Fatalf("generateDeployment failed: %v", err)
			}
			if toDelete {
				t.Fatalf("expected deployment to be retained")
			}
			if got := deploy.Spec.Strategy.Type; got != tc.wantStrategy {
				t.Fatalf(tc.mismatchFormat, got)
			}
		})
	}
}
func Test_createOrUpdateOrDeleteDeployments_K8sAPIDefaults(t *testing.T) {
g := gomega.NewGomegaWithT(t)
ctx := context.Background()
......
......@@ -104,13 +104,8 @@ type CheckpointConfig struct {
Enabled bool
// Storage holds storage backend configuration
Storage CheckpointStorageConfig
// InitContainerImage is the image used for init containers (e.g., signal file cleanup)
// Defaults to "busybox:latest" if not specified
InitContainerImage string
// ReadyForCheckpointFilePath is the file path used to signal model readiness for checkpoint jobs
ReadyForCheckpointFilePath string
// RestoreMarkerFilePath is the marker file path written after successful restore
RestoreMarkerFilePath string
}
// Checkpoint storage type constants
......@@ -124,8 +119,6 @@ const (
type CheckpointStorageConfig struct {
// Type is the storage backend type: pvc, s3, or oci
Type string
// SignalHostPath is the host path for signal files (used for checkpoint job coordination)
SignalHostPath string
// PVC configuration (used when Type=pvc)
PVC CheckpointPVCConfig
// S3 configuration (used when Type=s3)
......
......@@ -1338,7 +1338,7 @@ func GenerateGrovePodCliqueSet(
PodSpec: *podSpec,
},
}
labels, err := generateLabels(component, dynamoDeployment, serviceName)
labels, err := generateLabels(component, dynamoDeployment, serviceName, checkpointInfo)
if err != nil {
return nil, fmt.Errorf("failed to generate labels: %w", err)
}
......@@ -1392,7 +1392,12 @@ func GenerateGrovePodCliqueSet(
return gangSet, nil
}
func generateLabels(component *v1alpha1.DynamoComponentDeploymentSharedSpec, dynamoDeployment *v1alpha1.DynamoGraphDeployment, componentName string) (map[string]string, error) {
func generateLabels(
component *v1alpha1.DynamoComponentDeploymentSharedSpec,
dynamoDeployment *v1alpha1.DynamoGraphDeployment,
componentName string,
checkpointInfo *checkpoint.CheckpointInfo,
) (map[string]string, error) {
labels := make(map[string]string)
labels[commonconsts.KubeLabelDynamoSelector] = GetDCDResourceName(dynamoDeployment, componentName, "")
labels[commonconsts.KubeLabelDynamoGraphDeploymentName] = dynamoDeployment.Name
......@@ -1408,25 +1413,31 @@ func generateLabels(component *v1alpha1.DynamoComponentDeploymentSharedSpec, dyn
}
// Add base model label if modelRef is specified
AddBaseModelLabel(labels, component.ModelRef)
// Add checkpoint labels if checkpointing is enabled
var err error
labels, err = checkpoint.InjectCheckpointLabelsFromConfig(labels, component.Checkpoint)
if err != nil {
return nil, fmt.Errorf("failed to inject checkpoint labels: %w", err)
}
// Merge user-supplied labels first so they cannot overwrite checkpoint labels.
setMetricsLabels(labels, dynamoDeployment)
if component.Labels != nil {
err = mergo.Merge(&labels, component.Labels, mergo.WithOverride)
if err != nil {
if err := mergo.Merge(&labels, component.Labels, mergo.WithOverride); err != nil {
return nil, fmt.Errorf("failed to merge labels: %w", err)
}
}
if component.ExtraPodMetadata != nil {
err = mergo.Merge(&labels, component.ExtraPodMetadata.Labels, mergo.WithOverride)
if err != nil {
if err := mergo.Merge(&labels, component.ExtraPodMetadata.Labels, mergo.WithOverride); err != nil {
return nil, fmt.Errorf("failed to merge extraPodMetadata labels: %w", err)
}
}
// Inject checkpoint labels AFTER user labels so they cannot be overridden.
var err error
labels, err = checkpoint.InjectCheckpointLabelsFromConfig(labels, component.Checkpoint)
if err != nil {
return nil, fmt.Errorf("failed to inject checkpoint labels: %w", err)
}
// Only mark pods as restore targets when a concrete checkpoint is ready.
if checkpointInfo != nil && checkpointInfo.Enabled && checkpointInfo.Ready {
labels[commonconsts.KubeLabelIsRestoreTarget] = "true"
labels[commonconsts.KubeLabelCheckpointHash] = checkpointInfo.Hash
}
return labels, nil
}
......
......@@ -4,7 +4,7 @@
title: Checkpointing
---
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. See [Limitations](#limitations) for details.
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The ChReK DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.
**ChReK** (Checkpoint/Restore in Kubernetes) is an experimental infrastructure for fast-starting GPU applications using CRIU (Checkpoint/Restore in User-space). ChReK dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on-demand.
......@@ -48,10 +48,10 @@ Deploys the checkpoint/restore infrastructure:
- **RBAC**: Namespace-scoped or cluster-wide permissions
- **Seccomp Profile**: Security policies for CRIU syscalls
### 2. Smart Entrypoint
A wrapper script that intelligently decides between:
- **Cold start**: Normal application startup (when no checkpoint exists)
- **Restore**: CRIU restore from checkpoint (when checkpoint available)
### 2. External Restore via DaemonSet
The DaemonSet performs checkpoint/restore externally using `nsenter` to enter pod namespaces:
- **Checkpoint**: Freezes the running process and dumps state (CPU + GPU) to storage
- **Restore**: Enters a placeholder pod's namespaces and restores the checkpointed process via `nsrestore`
## Quick Start
......@@ -94,11 +94,11 @@ helm install chrek nvidia/chrek \
⚠️ **Important**: ChReK has significant limitations that may impact production readiness:
### Security Considerations
- **🔴 Privileged mode required**: Restore pods **must run in privileged mode** for CRIU to function. This grants containers elevated host access and may violate security policies in many production environments.
- **Security Impact**: Privileged containers can:
- Access all host devices
- **🔴 Privileged DaemonSet**: The ChReK DaemonSet runs in privileged mode with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU operations. Workload pods do **not** need privileged mode — all CRIU privilege lives in the DaemonSet.
- **Security Impact**: The privileged DaemonSet can:
- Access all host devices and processes
- Bypass most security restrictions
- Potentially compromise node security if the container is exploited
- Potentially compromise node security if exploited
### Technical Limitations
- **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
......@@ -128,9 +128,9 @@ ChReK is best suited for:
- Kubernetes 1.21+
- GPU nodes with NVIDIA runtime (`nvidia` runtime class)
- CRIU support in container runtime (containerd with CRIU plugin)
- containerd runtime (for container inspection; CRIU is bundled in ChReK images)
- RWX storage class (for multi-node deployments)
- **Security clearance for privileged pods** (required for restore operations)
- **Security clearance for privileged DaemonSet** (the ChReK agent runs privileged with hostPID/hostIPC/hostNetwork)
## Troubleshooting
......@@ -146,9 +146,9 @@ ChReK is best suited for:
- Verify CRIU is installed in the runtime
**Restore fails?**
- Ensure restore pod uses the same volumes as checkpoint job
- Verify `hostIPC: true` is set (required for CUDA)
- Check for `PSM3_DISABLED=1` and `GLOO_SOCKET_IFNAME=lo` environment variables
- Ensure the restore pod uses the same image (built with the `placeholder` target) and the same volume mounts as the checkpoint job
- Verify the DaemonSet is running on the same node as the restore pod
- Check DaemonSet logs for CRIU errors: `kubectl logs -l app.kubernetes.io/name=chrek`
For detailed troubleshooting, see:
- [Dynamo Integration Guide - Troubleshooting](dynamo.md#troubleshooting)
......
......@@ -6,7 +6,7 @@ title: Integration with Dynamo
# Checkpoint/Restore for Fast Pod Startup
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations. See [Limitations](#limitations) for details.
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The ChReK DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.
Reduce cold start times for LLM inference workers from ~3 minutes to ~30 seconds using container checkpointing.
......@@ -23,7 +23,7 @@ Checkpointing captures the complete state of a running worker pod (including GPU
- Dynamo Platform installed (v0.4.0+)
- ChReK Helm chart installed (separate from platform)
- GPU nodes with CRIU support
- GPU nodes with containerd runtime (CRIU is bundled in ChReK images)
- RWX PVC storage (PVC is currently the only supported backend)
## Quick Start
......@@ -350,9 +350,9 @@ Or use `auto` mode and the operator will find/create it automatically.
⚠️ **Important**: ChReK has significant limitations that impact production readiness:
### Security Considerations
- **🔴 Privileged mode required**: Restore pods **must run in privileged mode** for CRIU to function
- Privileged containers have elevated host access, which may violate security policies in many production environments
- This requirement applies to all worker pods that restore from checkpoints
- **🔴 Privileged DaemonSet**: The ChReK DaemonSet runs in privileged mode with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU operations externally
- Workload pods (checkpoint jobs, restore pods) do **not** need privileged mode — all CRIU privilege lives in the DaemonSet
- The privileged DaemonSet has elevated host access, which may violate security policies in many production environments
### Technical Limitations
- **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
......@@ -374,7 +374,7 @@ ChReK is **experimental/beta** and best suited for:
1. Check the checkpoint job:
```bash
kubectl get jobs -l nvidia.com/checkpoint-source=true -n dynamo-system
kubectl get jobs -l nvidia.com/chrek-is-checkpoint-source=true -n dynamo-system
kubectl logs job/checkpoint-<name> -n dynamo-system
```
......@@ -430,10 +430,10 @@ Check logs for "Falling back to cold start" message.
| Variable | Description |
|----------|-------------|
| `DYN_CHECKPOINT_STORAGE_TYPE` | Backend: `pvc`, `s3`, `oci` |
| `DYN_CHECKPOINT_LOCATION` | Source location (URI) |
| `DYN_CHECKPOINT_PATH` | Local path to tar file |
| `DYN_CHECKPOINT_HASH` | Identity hash (debugging) |
| `DYN_CHECKPOINT_SIGNAL_FILE` | Signal file (creation mode only) |
| `DYN_CHECKPOINT_LOCATION` | Full checkpoint location (checkpoint jobs) |
| `DYN_CHECKPOINT_PATH` | Base checkpoint directory (restore pods, PVC) |
| `DYN_CHECKPOINT_HASH` | Identity hash |
| `DYN_READY_FOR_CHECKPOINT_FILE` | Ready-for-checkpoint file path (checkpoint jobs) |
## Complete Example
......
......@@ -4,13 +4,14 @@
title: Standalone Usage
---
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. Review the [security implications](#security-considerations) before deploying.
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The ChReK DaemonSet runs in privileged mode to perform CRIU operations. Review the [security implications](#security-considerations) before deploying.
This guide explains how to use **ChReK** (Checkpoint/Restore in Kubernetes) as a standalone component without deploying the full Dynamo platform. This is useful if you want to add checkpoint/restore capabilities to your own GPU workloads.
## Table of Contents
- [Overview](#overview)
- [Using ChReK Without the Dynamo Operator](#using-chrek-without-the-dynamo-operator)
- [Prerequisites](#prerequisites)
- [Step 1: Deploy ChReK](#step-1-deploy-chrek)
- [Step 2: Build Checkpoint-Enabled Images](#step-2-build-checkpoint-enabled-images)
......@@ -27,7 +28,7 @@ This guide explains how to use **ChReK** (Checkpoint/Restore for Kubernetes) as
When using ChReK standalone, you are responsible for:
1. **Deploying the ChReK Helm chart** (DaemonSet + PVC)
2. **Building checkpoint-enabled container images** with the restore entrypoint
2. **Building checkpoint-enabled container images** with the CRIU runtime dependencies
3. **Creating checkpoint jobs** with the correct environment variables
4. **Creating restore pods** that detect and use the checkpoints
......@@ -35,22 +36,89 @@ The ChReK DaemonSet handles the actual CRIU checkpoint/restore operations automa
---
## Using ChReK Without the Dynamo Operator
When using ChReK with the Dynamo operator, the operator automatically configures workload pods for checkpoint/restore. Without the operator, you must handle this configuration manually. This section documents what the operator normally injects and how to replicate it.
### Container Naming
The ChReK DaemonSet needs to identify which container in your pod is the model-serving workload (as opposed to sidecars like istio-proxy or log collectors). It resolves the target container as follows:
1. If a container is named `main`, it is selected
2. Otherwise, the first container in the pod spec is selected
When using the Dynamo operator, the model container is always named `main`. In standalone mode, you must either name your model container `main` or ensure it is the first container listed in your pod spec. All YAML examples in this guide use `name: main`.
### Seccomp Profile
The operator sets a seccomp profile on all checkpoint/restore workload pods to block `io_uring` syscalls. The ChReK DaemonSet deploys the profile file (`profiles/block-iouring.json`) to each node, but you must reference it in your pod specs:
```yaml
spec:
securityContext:
seccompProfile:
type: Localhost
localhostProfile: profiles/block-iouring.json
```
Without this profile, `io_uring` syscalls during restore can cause CRIU failures.
### Sleep Infinity Command for Restore Pods
The operator overrides the container command to `["sleep", "infinity"]` on restore-target pods. This produces a Running-but-not-Ready placeholder pod that the ChReK DaemonSet watcher detects and restores externally via `nsenter`. Without this override, the container runs its normal entrypoint and cold-starts instead of waiting for restore.
```yaml
containers:
- name: main
image: my-app:checkpoint-enabled
command: ["sleep", "infinity"]
```
### Recreate Deployment Strategy
The operator forces the `Recreate` strategy when restore labels are present. This prevents the old and new pods from running simultaneously, which would cause failures because the two pods would compete for the same GPU checkpoint data. If you are using a Deployment, set this manually:
```yaml
apiVersion: apps/v1
kind: Deployment
spec:
strategy:
type: Recreate
```
### PVC Volume Mount Consistency
CRIU requires identical mount layouts between checkpoint and restore. The operator ensures the checkpoint PVC is mounted at the same path in both the checkpoint job and restore pod. When configuring manually, make sure your checkpoint job and restore pod use the exact same `mountPath` for the checkpoint PVC (e.g., `/checkpoints`).
### Downward API Volume (Currently Unused)
The operator injects a Downward API volume at `/etc/podinfo` for post-restore identity discovery (pod name, namespace, UID). This is not currently consumed by any component — you can skip it for now.
### Environment Variables
The following environment variables are normally injected by the operator. They are already documented in the [Environment Variables Reference](#environment-variables-reference) below, but note that without the operator you must set them manually:
- **Checkpoint jobs:** `DYN_READY_FOR_CHECKPOINT_FILE`, `DYN_CHECKPOINT_LOCATION`, `DYN_CHECKPOINT_STORAGE_TYPE`, `DYN_CHECKPOINT_HASH`
- **Restore pods:** `DYN_CHECKPOINT_PATH`, `DYN_CHECKPOINT_HASH`
---
## Prerequisites
- Kubernetes cluster with:
- NVIDIA GPUs with checkpoint support
- **Privileged security context allowed** (⚠️ required for CRIU - see [Security Considerations](#security-considerations))
- **Privileged DaemonSet allowed** (⚠️ the ChReK DaemonSet runs privileged - see [Security Considerations](#security-considerations))
- PVC storage (ReadWriteMany recommended for multi-node)
- Docker or compatible container runtime for building images
- Access to the ChReK source code: `deploy/chrek/`
### Security Considerations
⚠️ **Important**: ChReK restore operations **require privileged mode**, which has significant security implications:
⚠️ **Important**: The ChReK **DaemonSet** runs in privileged mode to perform CRIU checkpoint/restore operations. Your workload pods (checkpoint jobs, restore pods) do **not** need privileged mode — all CRIU privilege lives in the DaemonSet, which performs external restore via `nsenter`.
- **Privileged containers** can access all host devices and bypass most security restrictions
- **The DaemonSet** has `privileged: true`, `hostPID`, `hostIPC`, and `hostNetwork`
- This may violate security policies in production environments
- Privileged containers, if compromised, can potentially compromise node security
- If the DaemonSet is compromised, it could potentially compromise node security
**Recommended for:**
- ✅ Development and testing environments
......@@ -108,7 +176,7 @@ kubectl get pvc -n my-app
## Step 2: Build Checkpoint-Enabled Images
ChReK provides a convenient `placeholder` target in its Dockerfile that automatically injects checkpoint/restore capabilities into your existing container images.
ChReK provides a `placeholder` target in its Dockerfile that layers CRIU runtime dependencies onto your existing container images. The DaemonSet performs restore externally via `nsenter`, so these dependencies must be present in the image.
### Quick Start: Using the Placeholder Target (Recommended)
......@@ -149,43 +217,14 @@ docker build \
The ChReK Dockerfile's `placeholder` stage automatically:
- ✅ Builds the restore-entrypoint binary
- ✅ Injects it into `/usr/local/bin/restore-entrypoint`
- ✅ Adds `smart-entrypoint.sh` to `/usr/local/bin/`
- ✅ Sets executable permissions
- ✅ Configures the entrypoint to detect and restore checkpoints
- ✅ Preserves your original application CMD
### Alternative: Manual Multi-Stage Build
If you need more control, you can create your own Dockerfile:
```dockerfile
# Stage 1: Build restore-entrypoint
FROM golang:1.23-alpine AS restore-builder
WORKDIR /build
COPY deploy/chrek/cmd/restore-entrypoint ./cmd/restore-entrypoint
COPY deploy/chrek/pkg ./pkg
COPY deploy/chrek/go.mod deploy/chrek/go.sum ./
RUN go build -o /restore-entrypoint ./cmd/restore-entrypoint
# Stage 2: Your application image
FROM your-base-image:latest
- ✅ Installs CRIU runtime libraries (required by `nsrestore` running inside the pod's namespaces)
- ✅ Copies the `criu` binary to `/usr/local/sbin/criu`
- ✅ Copies `cuda-checkpoint` to `/usr/local/sbin/cuda-checkpoint` (used for CUDA state checkpoint/restore)
- ✅ Copies `nsrestore` to `/usr/local/bin/nsrestore` (invoked by DaemonSet via `nsenter`)
- ✅ Creates checkpoint directories (`/checkpoints`, `/var/run/criu`, `/var/criu-work`)
- ✅ Preserves your original application image contents
# Copy restore-entrypoint
COPY --from=restore-builder /restore-entrypoint /usr/local/bin/restore-entrypoint
# Copy smart-entrypoint.sh
COPY deploy/chrek/scripts/smart-entrypoint.sh /usr/local/bin/smart-entrypoint.sh
RUN chmod +x /usr/local/bin/smart-entrypoint.sh /usr/local/bin/restore-entrypoint
# Set smart-entrypoint as the default entrypoint
ENTRYPOINT ["/usr/local/bin/smart-entrypoint.sh"]
# Your application command (becomes CMD, can be overridden)
CMD ["python", "your_app.py"]
```
The placeholder image does **not** override the entrypoint or CMD. For restore pods, the operator (or you, in standalone mode) overrides the command to `sleep infinity`.
> **💡 Tip**: Using the `placeholder` target is the recommended approach as it's maintained with the ChReK codebase and ensures compatibility.
......@@ -201,7 +240,6 @@ Your checkpoint job MUST set these environment variables:
| Variable | Description | Example |
|----------|-------------|---------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Path where DaemonSet writes completion signal | `/checkpoint-signal/my-checkpoint.done` |
| `DYN_READY_FOR_CHECKPOINT_FILE` | Path where your app signals it's ready | `/tmp/ready-for-checkpoint` |
| `DYN_CHECKPOINT_HASH` | Unique identifier for this checkpoint | `abc123def456` |
| `DYN_CHECKPOINT_LOCATION` | Directory where checkpoint is stored | `/checkpoints/abc123def456` |
......@@ -213,7 +251,7 @@ Add this label to enable DaemonSet checkpoint detection:
```yaml
labels:
nvidia.com/checkpoint-source: "true"
nvidia.com/chrek-is-checkpoint-source: "true"
```
### Example Checkpoint Job
......@@ -228,39 +266,26 @@ spec:
template:
metadata:
labels:
nvidia.com/checkpoint-source: "true" # Required for DaemonSet detection
nvidia.com/chrek-is-checkpoint-source: "true" # Required for DaemonSet detection
nvidia.com/chrek-checkpoint-hash: "abc123def456" # Must match DYN_CHECKPOINT_HASH
spec:
restartPolicy: Never
# Init container to clean up stale signal files
initContainers:
- name: cleanup-signal-file
image: busybox:latest
command:
- sh
- -c
- |
rm -f /checkpoint-signal/my-checkpoint.done || true
echo "Signal file cleanup complete"
volumeMounts:
- name: checkpoint-signal
mountPath: /checkpoint-signal
# Seccomp profile to block io_uring syscalls (deployed by the chrek DaemonSet)
securityContext:
seccompProfile:
type: Localhost
localhostProfile: profiles/block-iouring.json
containers:
- name: main
image: my-app:checkpoint-enabled
# Security context required for CRIU
securityContext:
privileged: true
capabilities:
add: ["SYS_ADMIN", "SYS_PTRACE", "SYS_CHROOT"]
# Readiness probe: Pod becomes Ready when model is loaded
# This is what triggers the DaemonSet to start checkpointing
readinessProbe:
exec:
command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
command: ["cat", "/tmp/ready-for-checkpoint"]
initialDelaySeconds: 15
periodSeconds: 2
......@@ -271,8 +296,6 @@ spec:
# Checkpoint-related environment variables
env:
- name: DYN_CHECKPOINT_SIGNAL_FILE
value: "/checkpoint-signal/my-checkpoint.done"
- name: DYN_READY_FOR_CHECKPOINT_FILE
value: "/tmp/ready-for-checkpoint"
- name: DYN_CHECKPOINT_HASH
......@@ -291,105 +314,94 @@ spec:
volumeMounts:
- name: checkpoint-storage
mountPath: /checkpoints
- name: checkpoint-signal
mountPath: /checkpoint-signal
- name: tmp
mountPath: /tmp
volumes:
- name: checkpoint-storage
persistentVolumeClaim:
claimName: chrek-pvc
- name: checkpoint-signal
hostPath:
path: /var/lib/chrek/signals
type: DirectoryOrCreate
- name: tmp
emptyDir: {}
```
### Application Code Requirements
Your application must implement the checkpoint flow. Here's the pattern used by Dynamo vLLM:
Your application must implement the checkpoint flow. The DaemonSet communicates with your application via Unix signals (not files):
- **`SIGUSR1`**: Checkpoint completed — your process should exit gracefully
- **`SIGCONT`**: Restore completed — your process should wake up and continue
- **`SIGUSR2`**: Checkpoint/restore failed
Here's the pattern used by Dynamo vLLM (see `components/src/dynamo/vllm/chrek.py`):
```python
import asyncio
import os
import time
import signal
def main():
# 1. Check for checkpoint mode
signal_file = os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE")
async def main():
ready_file = os.environ.get("DYN_READY_FOR_CHECKPOINT_FILE")
restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE")
is_checkpoint_mode = signal_file is not None
if is_checkpoint_mode:
print("Checkpoint mode detected")
# 2. Load your model/application
model = load_model()
# 3. Optional: Put model to sleep to reduce memory footprint
# model.sleep()
# 4. Write ready file (for application use, not DaemonSet)
if ready_file:
with open(ready_file, "w") as f:
f.write("ready")
print(f"Wrote checkpoint ready file: {ready_file}")
# 5. Log readiness messages (helps debugging)
print("CHECKPOINT_READY: Model loaded, ready for container checkpoint")
print(f"CHECKPOINT_READY: Waiting for signal file: {signal_file}")
print(f"CHECKPOINT_READY: Or restore marker file: {restore_marker}")
# 6. Wait for checkpoint completion OR restore detection
while True:
# Check if we've been restored (marker file created by restore entrypoint)
if os.path.exists(restore_marker):
print(f"Detected restore from checkpoint (marker: {restore_marker})")
# Continue with normal application flow
break
# Check if checkpoint is complete (signal file created by DaemonSet)
if os.path.exists(signal_file):
print(f"Checkpoint signal file detected: {signal_file}")
print("Checkpoint complete, exiting")
return # Exit gracefully
time.sleep(1)
# Normal application flow (or post-restore flow)
run_application()
if not ready_file:
# Not in checkpoint mode, run normally
await run_application()
return
print("Checkpoint mode detected")
# 1. Load your model/application
model = await load_model()
# 2. Optional: Put model to sleep for CRIU-friendly GPU state
await model.sleep()
# 3. Write ready file — triggers DaemonSet checkpoint via readiness probe
with open(ready_file, "w") as f:
f.write("ready")
# 4. Set up signal handlers and wait for DaemonSet
checkpoint_done = asyncio.Event()
restore_done = asyncio.Event()
loop = asyncio.get_running_loop()
loop.add_signal_handler(signal.SIGUSR1, checkpoint_done.set)
loop.add_signal_handler(signal.SIGCONT, restore_done.set)
print("Ready for checkpoint. Waiting for watcher signal...")
# Wait for whichever signal comes first
done, pending = await asyncio.wait(
[asyncio.create_task(checkpoint_done.wait()),
asyncio.create_task(restore_done.wait())],
return_when=asyncio.FIRST_COMPLETED,
)
for task in pending:
task.cancel()
if restore_done.is_set():
# SIGCONT: Process was restored from checkpoint
print("Restore complete, waking model")
await model.wake_up()
await run_application()
else:
# SIGUSR1: Checkpoint complete, exit
print("Checkpoint complete, exiting")
```
**Important Notes:**
1. **Ready File & Readiness Probe**: The checkpoint job must have a readiness probe that checks for the ready file:
```yaml
readinessProbe:
exec:
command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
initialDelaySeconds: 15
periodSeconds: 2
```
The ChReK DaemonSet triggers checkpointing when:
- Pod has `nvidia.com/checkpoint-source: "true"` label
1. **Ready File & Readiness Probe**: The checkpoint job must have a readiness probe that checks for the ready file. The ChReK DaemonSet triggers checkpointing when:
- Pod has `nvidia.com/chrek-is-checkpoint-source: "true"` label
- Pod status is `Ready` (readiness probe passes = ready file exists)
2. **Restore Marker**: Created by `restore-entrypoint` before CRIU restore, allows the restored process to detect it was restored
2. **Signal-based coordination**: The DaemonSet sends `SIGUSR1` after checkpoint completes and `SIGCONT` after restore completes. Your application must handle these signals (not poll for files).
3. **Two Exit Paths**:
- **Signal file found**: Checkpoint complete, exit gracefully
- **Restore marker found**: Process was restored, continue running
3. **Two exit paths**:
- **SIGUSR1 received**: Checkpoint complete, exit gracefully
- **SIGCONT received**: Process was restored, wake model and continue
---
## Step 4: Restore from Checkpoints
Restore pods automatically detect and restore from checkpoints if they exist.
The DaemonSet performs restore externally — your restore pod just needs to be a placeholder that sleeps until the DaemonSet restores the checkpointed process into it.
### Example Restore Pod
......@@ -399,18 +411,26 @@ kind: Pod
metadata:
name: my-app-restored
namespace: my-app
labels:
nvidia.com/chrek-is-restore-target: "true" # Required: watcher detects restore pods by this label
nvidia.com/chrek-checkpoint-hash: "abc123def456" # Required: watcher uses this to locate the checkpoint
spec:
restartPolicy: Never
# Seccomp profile to block io_uring syscalls (deployed by the chrek DaemonSet)
# Without this, io_uring syscalls may cause CRIU restore failures
securityContext:
seccompProfile:
type: Localhost
localhostProfile: profiles/block-iouring.json
containers:
- name: main
image: my-app:checkpoint-enabled
# Security context required for CRIU restore
securityContext:
privileged: true
capabilities:
add: ["SYS_ADMIN", "SYS_PTRACE", "SYS_CHROOT"]
# Override command to sleep — the chrek DaemonSet performs external restore
# on Running-but-not-Ready pods. Without this, the container would cold-start.
command: ["sleep", "infinity"]
# Set checkpoint environment variables
env:
......@@ -419,38 +439,28 @@ spec:
- name: DYN_CHECKPOINT_PATH
value: "/checkpoints" # Base path (hash appended automatically)
- name: DYN_RESTORE_MARKER_FILE
value: "/tmp/dynamo-restored"
# GPU request
resources:
limits:
nvidia.com/gpu: 1
# Mount checkpoint storage (READ-ONLY is fine for restore)
# CRIU needs write access for restore.log — do NOT set readOnly
volumeMounts:
- name: checkpoint-storage
mountPath: /checkpoints
readOnly: true
- name: checkpoint-signal
mountPath: /checkpoint-signal
volumes:
- name: checkpoint-storage
persistentVolumeClaim:
claimName: chrek-pvc
- name: checkpoint-signal
hostPath:
path: /var/lib/chrek/signals
type: DirectoryOrCreate
```
### How Restore Works
1. **Smart Entrypoint Detects Checkpoint**: The `smart-entrypoint.sh` checks if a checkpoint exists at `/checkpoints/${DYN_CHECKPOINT_HASH}/`
2. **Calls Restore Entrypoint**: If found, calls `/usr/local/bin/restore-entrypoint` which invokes CRIU
3. **CRIU Restores Process**: The entire process tree is restored from the checkpoint, including GPU state
4. **Application Continues**: Your application resumes exactly where it was checkpointed
1. **Pod starts as placeholder**: The `sleep infinity` command keeps the pod Running but not Ready
2. **DaemonSet detects restore pod**: The watcher finds pods with `nvidia.com/chrek-is-restore-target=true` that are Running but not Ready
3. **External restore via nsenter**: The DaemonSet enters the pod's namespaces and performs CRIU restore, including GPU state
4. **Application continues**: Your application resumes exactly where it was checkpointed
---
......@@ -460,10 +470,9 @@ spec:
| Variable | Required | Description |
|----------|----------|-------------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Yes | Full path to signal file (e.g., `/checkpoint-signal/my-checkpoint.done`) |
| `DYN_READY_FOR_CHECKPOINT_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/ready-for-checkpoint`) |
| `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (alphanumeric string) |
| `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123`) |
| `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (16-char hex string) |
| `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123def456`) |
| `DYN_CHECKPOINT_STORAGE_TYPE` | Yes | Storage backend: `pvc`, `s3`, or `oci` |
### Restore Pods
......@@ -472,22 +481,18 @@ spec:
|----------|----------|-------------|
| `DYN_CHECKPOINT_HASH` | Yes | Checkpoint identifier (must match checkpoint job) |
| `DYN_CHECKPOINT_PATH` | Yes | Base checkpoint directory (hash appended automatically) |
| `DYN_RESTORE_MARKER_FILE` | Yes | Path for restore marker file |
### Optional CRIU Tuning (Advanced)
| Variable | Default | Description |
|----------|---------|-------------|
| `CRIU_TIMEOUT` | `0` (unlimited) | CRIU operation timeout in seconds |
| `CRIU_LOG_LEVEL` | `4` | CRIU log verbosity (0-4) |
| `CRIU_WORK_DIR` | `/tmp` | CRIU working directory |
| `CUDA_PLUGIN_DIR` | `/usr/local/lib/criu` | Path to CRIU CUDA plugin |
| `CRIU_SKIP_IN_FLIGHT` | `false` | Skip in-flight TCP connections |
| `CRIU_AUTO_DEDUP` | `false` | Enable auto-deduplication |
| `CRIU_LAZY_PAGES` | `false` | Enable lazy page migration (experimental) |
| `WAIT_FOR_CHECKPOINT` | `false` | Wait for checkpoint to appear before starting |
| `RESTORE_WAIT_TIMEOUT` | `300` | Max seconds to wait for checkpoint |
| `DEBUG` | `false` | Enable debug mode (sleeps 300s on error) |
### Signals (DaemonSet → Application)
The DaemonSet communicates checkpoint/restore completion via Unix signals, not files:
| Signal | Direction | Meaning |
|--------|-----------|---------|
| `SIGUSR1` | DaemonSet → checkpoint pod | Checkpoint completed, process should exit |
| `SIGCONT` | DaemonSet → restored pod | Restore completed, process should wake up |
| `SIGUSR2` | DaemonSet → checkpoint pod | Checkpoint failed (wake process to continue) |
CRIU tuning options are configured via the ChReK Helm chart's `config.checkpoint.criu` values, not environment variables. See the [Helm Chart Values](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/chrek/values.yaml) for available options.
---
......@@ -497,7 +502,7 @@ spec:
```
┌─────────────────────────────────────────────────────────────┐
│ 1. Pod starts with nvidia.com/checkpoint-source=true label │
│ 1. Pod starts with nvidia.com/chrek-is-checkpoint-source=true label │
└──────────────────────┬──────────────────────────────────────┘
......@@ -515,13 +520,13 @@ spec:
┌─────────────────────────────────────────────────────────────┐
│ 4. ChReK DaemonSet detects: │
│ - Pod is Ready │
│ - Has checkpoint-source label
│ - Ready file exists: /tmp/ready-for-checkpoint
│ - Has chrek-is-checkpoint-source label │
│ - Has chrek-checkpoint-hash label │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 5. DaemonSet executes CRIU checkpoint via runc:
│ 5. DaemonSet executes CRIU checkpoint: │
│ - Freezes container process │
│ - Dumps memory (CPU + GPU) │
│ - Saves to /checkpoints/${HASH}/ │
......@@ -529,13 +534,12 @@ spec:
┌─────────────────────────────────────────────────────────────┐
│ 6. DaemonSet writes signal file: │
│ /checkpoint-signal/${HASH}.done │
│ 6. DaemonSet sends SIGUSR1 to the application process │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 7. Application detects signal file and exits gracefully │
│ 7. Application receives SIGUSR1 and exits gracefully │
└─────────────────────────────────────────────────────────────┘
```
......@@ -543,44 +547,34 @@ spec:
```
┌─────────────────────────────────────────────────────────────┐
│ 1. Pod starts with DYN_CHECKPOINT_HASH set │
│ 1. Pod starts with restore labels and sleep infinity │
│ (Running but not Ready) │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 2. smart-entrypoint.sh checks for checkpoint: │
│ /checkpoints/${DYN_CHECKPOINT_HASH}/checkpoint.done │
└──────────────────────┬──────────────────────────────────────┘
├─ Not Found ─────────────────┐
│ │
▼ ▼
┌───────────────────────┐ ┌──────────────────────┐
│ Checkpoint exists │ │ Cold start │
└──────────┬────────────┘ │ Run original CMD │
│ └──────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 3. Call restore-entrypoint with checkpoint path │
│ 2. ChReK DaemonSet detects: │
│ - Pod is Running but not Ready │
│ - Has chrek-is-restore-target label │
│ - Has chrek-checkpoint-hash label │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 4. restore-entrypoint extracts checkpoint and calls CRIU: │
│ criu restore --images-dir /checkpoints/${HASH}/images │
│ 3. DaemonSet performs external restore via nsenter: │
│ - Enters pod's namespaces (mount, net, pid, ipc) │
│ - Runs nsrestore with CRIU inside the pod's context │
│ - Restores memory (CPU + GPU via cuda-checkpoint) │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 5. CRIU restores process from checkpoint │
│ - Restores memory (CPU + GPU) │
│ - Restores file descriptors │
│ - Resumes process execution │
│ 4. DaemonSet sends SIGCONT to the restored process │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
6. Application continues from checkpointed state
│ 5. Application receives SIGCONT, wakes model, continues │
│ (Model already loaded, GPU memory initialized) │
└─────────────────────────────────────────────────────────────┘
```
......@@ -596,7 +590,7 @@ spec:
**Checks**:
1. Verify the pod has the label:
```bash
kubectl get pod <pod-name> -o jsonpath='{.metadata.labels.nvidia\.com/checkpoint-source}'
kubectl get pod <pod-name> -o jsonpath='{.metadata.labels.nvidia\.com/chrek-is-checkpoint-source}'
```
2. Check pod readiness:
......@@ -624,61 +618,49 @@ spec:
kubectl exec <pod-name> -- ls -la /checkpoints/${DYN_CHECKPOINT_HASH}/
```
2. Check privileged mode is enabled:
2. Check DaemonSet logs for restore errors:
```bash
kubectl get pod <pod-name> -o jsonpath='{.spec.containers[0].securityContext.privileged}'
kubectl logs -n my-app daemonset/chrek-agent --all-containers
```
3. Check CRIU logs in `/tmp/criu-restore.log`:
3. Check pod events for restore status annotations:
```bash
kubectl exec <pod-name> -- cat /tmp/criu-restore.log
kubectl describe pod <pod-name>
```
4. Ensure checkpoint and restore have same:
- Container image
- Container image (built with `placeholder` target)
- GPU count
- Volume mounts
- Environment variables (except POD_NAME, POD_IP, etc.)
### Permission Denied Errors
**Symptom**: `CRIU: Permission denied` or `Operation not permitted`
- Volume mounts (same `mountPath` for checkpoint PVC)
**Solution**: Ensure pod has:
```yaml
securityContext:
privileged: true
capabilities:
add:
- SYS_ADMIN
- SYS_PTRACE
- SYS_CHROOT
```
### Restore Pod Not Detected
### Signal File Not Appearing
**Symptom**: Application waits forever for signal file
**Symptom**: Pod runs `sleep infinity` but DaemonSet never restores it
**Checks**:
1. Verify hostPath mount is correct:
1. Verify the pod has the required labels:
```bash
kubectl get pod <pod-name> -o jsonpath='{.spec.volumes[?(@.name=="checkpoint-signal")]}'
kubectl get pod <pod-name> -o jsonpath='{.metadata.labels}'
```
Must have both `nvidia.com/chrek-is-restore-target: "true"` and `nvidia.com/chrek-checkpoint-hash: "<hash>"`.
2. Check DaemonSet has access to the same path:
2. Verify the pod is Running but not Ready (this is the trigger):
```bash
kubectl get daemonset -n my-app chrek-agent -o jsonpath='{.spec.template.spec.volumes[?(@.name=="signal-dir")]}'
kubectl get pod <pod-name> -o jsonpath='{.status.phase}'
kubectl get pod <pod-name> -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
```
3. Verify paths match exactly:
- Pod: `/var/lib/chrek/signals`
- DaemonSet: `/var/lib/chrek/signals`
3. Verify the DaemonSet is running on the same node:
```bash
kubectl get pods -n my-app -l app.kubernetes.io/name=chrek -o wide
```
---
## Additional Resources
- [ChReK Helm Chart Values](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/chrek/values.yaml)
- [Dynamo vLLM ChReK Integration](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/chrek.py) - Reference signal handler implementation
- [ChReK Dockerfile](https://github.com/ai-dynamo/dynamo/tree/main/deploy/chrek/Dockerfile)
- [CRIU Documentation](https://criu.org/Main_Page)
- [CUDA Checkpoint Utility](https://github.com/NVIDIA/cuda-checkpoint)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment