feat(chrek): external restore, signal-based IPC, and package refactor (#6286)

Co-authored-by: Dan Feigin <dfeigin@nvidia.com>

feat(chrek): external restore, signal-based IPC, and package refactor (#6286)
Co-authored-by: Dan Feigin <dfeigin@nvidia.com>
bb8fc8a4 · Schwinn Saereesitthipitak · GitHub · c8423b57 · bb8fc8a4 · bb8fc8a4
Unverified Commit bb8fc8a4 authored Feb 20, 2026 by Schwinn Saereesitthipitak Committed by GitHub Feb 21, 2026
6 changed files
--- a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go
+++ b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go
@@ -1225,6 +1225,260 @@ func TestDynamoComponentDeploymentReconciler_createOrUpdateOrDeleteDeployments_R
 	g.Expect(deployment3).NotTo(gomega.BeNil())
 }

+func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabels(t *testing.T) {
+	s := scheme.Scheme
+	if err := v1alpha1.AddToScheme(s); err != nil {
+		t.Fatalf("Failed to add v1alpha1 to scheme: %v", err)
+	}
+	if err := corev1.AddToScheme(s); err != nil {
+		t.Fatalf("Failed to add corev1 to scheme: %v", err)
+	}
+
+	makeDCD := func(checkpointRef string) *v1alpha1.DynamoComponentDeployment {
+		return &v1alpha1.DynamoComponentDeployment{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "test-worker",
+				Namespace: "default",
+			},
+			Spec: v1alpha1.DynamoComponentDeploymentSpec{
+				BackendFramework: string(dynamo.BackendFrameworkVLLM),
+				DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
+					ServiceName:     "worker",
+					ComponentType:   commonconsts.ComponentTypeWorker,
+					DynamoNamespace: ptr.To("default"),
+					Labels: map[string]string{
+						commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
+						commonconsts.KubeLabelIsRestoreTarget:           commonconsts.KubeLabelValueTrue,
+					},
+					Checkpoint: &v1alpha1.ServiceCheckpointConfig{
+						Enabled:       true,
+						CheckpointRef: &checkpointRef,
+					},
+					ExtraPodSpec: &v1alpha1.ExtraPodSpec{
+						MainContainer: &corev1.Container{
+							Name:    commonconsts.MainContainerName,
+							Image:   "test-image:latest",
+							Command: []string{"python3"},
+							Args:    []string{"-m", "dynamo.vllm"},
+						},
+					},
+				},
+			},
+		}
+	}
+
+	makeReconciler := func(objs ...client.Object) *DynamoComponentDeploymentReconciler {
+		return &DynamoComponentDeploymentReconciler{
+			Client: fake.NewClientBuilder().
+				WithScheme(s).
+				WithObjects(objs...).
+				Build(),
+			Config: controller_common.Config{
+				Checkpoint: controller_common.CheckpointConfig{
+					Enabled: true,
+					Storage: controller_common.CheckpointStorageConfig{
+						Type: controller_common.CheckpointStorageTypePVC,
+						PVC: controller_common.CheckpointPVCConfig{
+							PVCName:  "chrek-pvc",
+							BasePath: "/checkpoints",
+						},
+					},
+				},
+			},
+		}
+	}
+
+	t.Run("ready checkpoint adds explicit restore labels", func(t *testing.T) {
+		checkpointName := "ckpt-ready"
+		dcd := makeDCD(checkpointName)
+		ckpt := &v1alpha1.DynamoCheckpoint{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      checkpointName,
+				Namespace: "default",
+			},
+			Status: v1alpha1.DynamoCheckpointStatus{
+				Phase:        v1alpha1.DynamoCheckpointPhaseReady,
+				IdentityHash: "hash-ready-1",
+			},
+		}
+
+		r := makeReconciler(dcd, ckpt)
+		podTemplateSpec, err := r.generatePodTemplateSpec(
+			context.Background(),
+			generateResourceOption{dynamoComponentDeployment: dcd},
+			dynamo.RoleMain,
+		)
+		if err != nil {
+			t.Fatalf("generatePodTemplateSpec failed: %v", err)
+		}
+
+		if got := podTemplateSpec.Labels[commonconsts.KubeLabelIsRestoreTarget]; got != commonconsts.KubeLabelValueTrue {
+			t.Fatalf("expected %s label to be true, got %q", commonconsts.KubeLabelIsRestoreTarget, got)
+		}
+		if got := podTemplateSpec.Labels[commonconsts.KubeLabelCheckpointHash]; got != "hash-ready-1" {
+			t.Fatalf("expected %s to be checkpoint hash, got %q", commonconsts.KubeLabelCheckpointHash, got)
+		}
+	})
+
+	t.Run("non-ready checkpoint clears stale restore labels", func(t *testing.T) {
+		checkpointName := "ckpt-pending"
+		dcd := makeDCD(checkpointName)
+		ckpt := &v1alpha1.DynamoCheckpoint{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      checkpointName,
+				Namespace: "default",
+			},
+			Status: v1alpha1.DynamoCheckpointStatus{
+				Phase:        v1alpha1.DynamoCheckpointPhaseCreating,
+				IdentityHash: "hash-pending-1",
+			},
+		}
+
+		r := makeReconciler(dcd, ckpt)
+		podTemplateSpec, err := r.generatePodTemplateSpec(
+			context.Background(),
+			generateResourceOption{dynamoComponentDeployment: dcd},
+			dynamo.RoleMain,
+		)
+		if err != nil {
+			t.Fatalf("generatePodTemplateSpec failed: %v", err)
+		}
+
+		if _, ok := podTemplateSpec.Labels[commonconsts.KubeLabelIsRestoreTarget]; ok {
+			t.Fatalf("did not expect %s label when checkpoint is not ready", commonconsts.KubeLabelIsRestoreTarget)
+		}
+		if _, ok := podTemplateSpec.Labels[commonconsts.KubeLabelCheckpointHash]; ok {
+			t.Fatalf("did not expect %s label when checkpoint is not ready", commonconsts.KubeLabelCheckpointHash)
+		}
+	})
+}
+
+func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy(t *testing.T) {
+	s := scheme.Scheme
+	if err := v1alpha1.AddToScheme(s); err != nil {
+		t.Fatalf("Failed to add v1alpha1 to scheme: %v", err)
+	}
+	if err := corev1.AddToScheme(s); err != nil {
+		t.Fatalf("Failed to add corev1 to scheme: %v", err)
+	}
+	if err := appsv1.AddToScheme(s); err != nil {
+		t.Fatalf("Failed to add appsv1 to scheme: %v", err)
+	}
+
+	replicas := int32(1)
+	makeDCD := func(checkpointRef string) *v1alpha1.DynamoComponentDeployment {
+		return &v1alpha1.DynamoComponentDeployment{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "test-worker",
+				Namespace: "default",
+			},
+			Spec: v1alpha1.DynamoComponentDeploymentSpec{
+				BackendFramework: string(dynamo.BackendFrameworkVLLM),
+				DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
+					ServiceName:     "worker",
+					ComponentType:   commonconsts.ComponentTypeWorker,
+					DynamoNamespace: ptr.To("default"),
+					Replicas:        &replicas,
+					Labels: map[string]string{
+						commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
+					},
+					Checkpoint: &v1alpha1.ServiceCheckpointConfig{
+						Enabled:       true,
+						CheckpointRef: &checkpointRef,
+					},
+					ExtraPodSpec: &v1alpha1.ExtraPodSpec{
+						MainContainer: &corev1.Container{
+							Name:    commonconsts.MainContainerName,
+							Image:   "test-image:latest",
+							Command: []string{"python3"},
+							Args:    []string{"-m", "dynamo.vllm"},
+						},
+					},
+				},
+			},
+		}
+	}
+
+	makeReconciler := func(objs ...client.Object) *DynamoComponentDeploymentReconciler {
+		return &DynamoComponentDeploymentReconciler{
+			Client: fake.NewClientBuilder().
+				WithScheme(s).
+				WithObjects(objs...).
+				Build(),
+			Config: controller_common.Config{
+				Checkpoint: controller_common.CheckpointConfig{
+					Enabled: true,
+					Storage: controller_common.CheckpointStorageConfig{
+						Type: controller_common.CheckpointStorageTypePVC,
+						PVC: controller_common.CheckpointPVCConfig{
+							PVCName:  "chrek-pvc",
+							BasePath: "/checkpoints",
+						},
+					},
+				},
+			},
+		}
+	}
+
+	t.Run("ready checkpoint forces Recreate strategy", func(t *testing.T) {
+		checkpointName := "ckpt-ready"
+		dcd := makeDCD(checkpointName)
+		ckpt := &v1alpha1.DynamoCheckpoint{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      checkpointName,
+				Namespace: "default",
+			},
+			Status: v1alpha1.DynamoCheckpointStatus{
+				Phase:        v1alpha1.DynamoCheckpointPhaseReady,
+				IdentityHash: "hash-ready-1",
+			},
+		}
+
+		r := makeReconciler(dcd, ckpt)
+		deploy, toDelete, err := r.generateDeployment(context.Background(), generateResourceOption{
+			dynamoComponentDeployment: dcd,
+		})
+		if err != nil {
+			t.Fatalf("generateDeployment failed: %v", err)
+		}
+		if toDelete {
+			t.Fatalf("expected deployment to be retained")
+		}
+		if deploy.Spec.Strategy.Type != appsv1.RecreateDeploymentStrategyType {
+			t.Fatalf("expected Recreate strategy, got %s", deploy.Spec.Strategy.Type)
+		}
+	})
+
+	t.Run("non-ready checkpoint keeps RollingUpdate strategy", func(t *testing.T) {
+		checkpointName := "ckpt-creating"
+		dcd := makeDCD(checkpointName)
+		ckpt := &v1alpha1.DynamoCheckpoint{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      checkpointName,
+				Namespace: "default",
+			},
+			Status: v1alpha1.DynamoCheckpointStatus{
+				Phase:        v1alpha1.DynamoCheckpointPhaseCreating,
+				IdentityHash: "hash-creating-1",
+			},
+		}
+
+		r := makeReconciler(dcd, ckpt)
+		deploy, toDelete, err := r.generateDeployment(context.Background(), generateResourceOption{
+			dynamoComponentDeployment: dcd,
+		})
+		if err != nil {
+			t.Fatalf("generateDeployment failed: %v", err)
+		}
+		if toDelete {
+			t.Fatalf("expected deployment to be retained")
+		}
+		if deploy.Spec.Strategy.Type != appsv1.RollingUpdateDeploymentStrategyType {
+			t.Fatalf("expected RollingUpdate strategy, got %s", deploy.Spec.Strategy.Type)
+		}
+	})
+}
+
 func Test_createOrUpdateOrDeleteDeployments_K8sAPIDefaults(t *testing.T) {
 	g := gomega.NewGomegaWithT(t)
 	ctx := context.Background()

--- a/deploy/operator/internal/controller_common/predicate.go
+++ b/deploy/operator/internal/controller_common/predicate.go
@@ -104,13 +104,8 @@ type CheckpointConfig struct {
 	Enabled bool
 	// Storage holds storage backend configuration
 	Storage CheckpointStorageConfig
-	// InitContainerImage is the image used for init containers (e.g., signal file cleanup)
-	// Defaults to "busybox:latest" if not specified
-	InitContainerImage string
 	// ReadyForCheckpointFilePath is the file path used to signal model readiness for checkpoint jobs
 	ReadyForCheckpointFilePath string
-	// RestoreMarkerFilePath is the marker file path written after successful restore
-	RestoreMarkerFilePath string
 }

 // Checkpoint storage type constants
@@ -124,8 +119,6 @@ const (
 type CheckpointStorageConfig struct {
 	// Type is the storage backend type: pvc, s3, or oci
 	Type string
-	// SignalHostPath is the host path for signal files (used for checkpoint job coordination)
-	SignalHostPath string
 	// PVC configuration (used when Type=pvc)
 	PVC CheckpointPVCConfig
 	// S3 configuration (used when Type=s3)

--- a/deploy/operator/internal/dynamo/graph.go
+++ b/deploy/operator/internal/dynamo/graph.go
@@ -1338,7 +1338,7 @@ func GenerateGrovePodCliqueSet(
 					PodSpec:      *podSpec,
 				},
 			}
-			labels, err := generateLabels(component, dynamoDeployment, serviceName)
+			labels, err := generateLabels(component, dynamoDeployment, serviceName, checkpointInfo)
 			if err != nil {
 				return nil, fmt.Errorf("failed to generate labels: %w", err)
 			}
@@ -1392,7 +1392,12 @@ func GenerateGrovePodCliqueSet(
 	return gangSet, nil
 }

-func generateLabels(component *v1alpha1.DynamoComponentDeploymentSharedSpec, dynamoDeployment *v1alpha1.DynamoGraphDeployment, componentName string) (map[string]string, error) {
+func generateLabels(
+	component *v1alpha1.DynamoComponentDeploymentSharedSpec,
+	dynamoDeployment *v1alpha1.DynamoGraphDeployment,
+	componentName string,
+	checkpointInfo *checkpoint.CheckpointInfo,
+) (map[string]string, error) {
 	labels := make(map[string]string)
 	labels[commonconsts.KubeLabelDynamoSelector] = GetDCDResourceName(dynamoDeployment, componentName, "")
 	labels[commonconsts.KubeLabelDynamoGraphDeploymentName] = dynamoDeployment.Name
@@ -1408,25 +1413,31 @@ func generateLabels(component *v1alpha1.DynamoComponentDeploymentSharedSpec, dyn
 	}
 	// Add base model label if modelRef is specified
 	AddBaseModelLabel(labels, component.ModelRef)
-	// Add checkpoint labels if checkpointing is enabled
-	var err error
-	labels, err = checkpoint.InjectCheckpointLabelsFromConfig(labels, component.Checkpoint)
-	if err != nil {
-		return nil, fmt.Errorf("failed to inject checkpoint labels: %w", err)
-	}
+	// Merge user-supplied labels first so they cannot overwrite checkpoint labels.
 	setMetricsLabels(labels, dynamoDeployment)
 	if component.Labels != nil {
-		err = mergo.Merge(&labels, component.Labels, mergo.WithOverride)
-		if err != nil {
+		if err := mergo.Merge(&labels, component.Labels, mergo.WithOverride); err != nil {
 			return nil, fmt.Errorf("failed to merge labels: %w", err)
 		}
 	}
 	if component.ExtraPodMetadata != nil {
-		err = mergo.Merge(&labels, component.ExtraPodMetadata.Labels, mergo.WithOverride)
-		if err != nil {
+		if err := mergo.Merge(&labels, component.ExtraPodMetadata.Labels, mergo.WithOverride); err != nil {
 			return nil, fmt.Errorf("failed to merge extraPodMetadata labels: %w", err)
 		}
 	}
+
+	// Inject checkpoint labels AFTER user labels so they cannot be overridden.
+	var err error
+	labels, err = checkpoint.InjectCheckpointLabelsFromConfig(labels, component.Checkpoint)
+	if err != nil {
+		return nil, fmt.Errorf("failed to inject checkpoint labels: %w", err)
+	}
+
+	// Only mark pods as restore targets when a concrete checkpoint is ready.
+	if checkpointInfo != nil && checkpointInfo.Enabled && checkpointInfo.Ready {
+		labels[commonconsts.KubeLabelIsRestoreTarget] = "true"
+		labels[commonconsts.KubeLabelCheckpointHash] = checkpointInfo.Hash
+	}
 	return labels, nil
 }


--- a/docs/pages/kubernetes/chrek/README.md
+++ b/docs/pages/kubernetes/chrek/README.md
@@ -4,7 +4,7 @@
 title: Checkpointing
 ---

-> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. See [Limitations](#limitations) for details.
+> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The ChReK DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.

 **ChReK** (Checkpoint/Restore in Kubernetes) is an experimental infrastructure for fast-starting GPU applications using CRIU (Checkpoint/Restore in User-space). ChReK dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on-demand.

@@ -48,10 +48,10 @@ Deploys the checkpoint/restore infrastructure:
 - **RBAC**: Namespace-scoped or cluster-wide permissions
 - **Seccomp Profile**: Security policies for CRIU syscalls

-### 2. Smart Entrypoint
-A wrapper script that intelligently decides between:
- **Cold start**: Normal application startup (when no checkpoint exists)
- **Restore**: CRIU restore from checkpoint (when checkpoint available)
+### 2. External Restore via DaemonSet
+The DaemonSet performs checkpoint/restore externally using `nsenter` to enter pod namespaces:
+- **Checkpoint**: Freezes the running process and dumps state (CPU + GPU) to storage
+- **Restore**: Enters a placeholder pod's namespaces and restores the checkpointed process via `nsrestore`

 ## Quick Start

@@ -94,11 +94,11 @@ helm install chrek nvidia/chrek \
 ⚠️ **Important**: ChReK has significant limitations that may impact production readiness:

 ### Security Considerations
- **🔴 Privileged mode required**: Restore pods **must run in privileged mode** for CRIU to function. This grants containers elevated host access and may violate security policies in many production environments.
- **Security Impact**: Privileged containers can:
-  - Access all host devices
+- **🔴 Privileged DaemonSet**: The ChReK DaemonSet runs in privileged mode with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU operations. Workload pods do **not** need privileged mode — all CRIU privilege lives in the DaemonSet.
+- **Security Impact**: The privileged DaemonSet can:
+  - Access all host devices and processes
  - Bypass most security restrictions
-  - Potentially compromise node security if the container is exploited
+  - Potentially compromise node security if exploited

 ### Technical Limitations
 - **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
@@ -128,9 +128,9 @@ ChReK is best suited for:

 - Kubernetes 1.21+
 - GPU nodes with NVIDIA runtime (`nvidia` runtime class)
- CRIU support in container runtime (containerd with CRIU plugin)
+- containerd runtime (for container inspection; CRIU is bundled in ChReK images)
 - RWX storage class (for multi-node deployments)
- **Security clearance for privileged pods** (required for restore operations)
+- **Security clearance for privileged DaemonSet** (the ChReK agent runs privileged with hostPID/hostIPC/hostNetwork)

 ## Troubleshooting

@@ -146,9 +146,9 @@ ChReK is best suited for:
 - Verify CRIU is installed in the runtime

 **Restore fails?**
- Ensure restore pod uses the same volumes as checkpoint job
- Verify `hostIPC: true` is set (required for CUDA)
- Check for `PSM3_DISABLED=1` and `GLOO_SOCKET_IFNAME=lo` environment variables
+- Ensure the restore pod uses the same image (built with the `placeholder` target) and the same volume mounts as the checkpoint job
+- Verify the DaemonSet is running on the same node as the restore pod
+- Check DaemonSet logs for CRIU errors: `kubectl logs -l app.kubernetes.io/name=chrek`

 For detailed troubleshooting, see:
 - [Dynamo Integration Guide - Troubleshooting](dynamo.md#troubleshooting)

--- a/docs/pages/kubernetes/chrek/dynamo.md
+++ b/docs/pages/kubernetes/chrek/dynamo.md
@@ -6,7 +6,7 @@ title: Integration with Dynamo

 # Checkpoint/Restore for Fast Pod Startup

-> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations. See [Limitations](#limitations) for details.
+> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The ChReK DaemonSet runs in privileged mode to perform CRIU operations. See [Limitations](#limitations) for details.

 Reduce cold start times for LLM inference workers from ~3 minutes to ~30 seconds using container checkpointing.

@@ -23,7 +23,7 @@ Checkpointing captures the complete state of a running worker pod (including GPU

 - Dynamo Platform installed (v0.4.0+)
 - ChReK Helm chart installed (separate from platform)
- GPU nodes with CRIU support
+- GPU nodes with containerd runtime (CRIU is bundled in ChReK images)
 - RWX PVC storage (PVC is currently the only supported backend)

 ## Quick Start
@@ -350,9 +350,9 @@ Or use `auto` mode and the operator will find/create it automatically.
 ⚠️ **Important**: ChReK has significant limitations that impact production readiness:

 ### Security Considerations
- **🔴 Privileged mode required**: Restore pods **must run in privileged mode** for CRIU to function
- Privileged containers have elevated host access, which may violate security policies in many production environments
- This requirement applies to all worker pods that restore from checkpoints
+- **🔴 Privileged DaemonSet**: The ChReK DaemonSet runs in privileged mode with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU operations externally
+- Workload pods (checkpoint jobs, restore pods) do **not** need privileged mode — all CRIU privilege lives in the DaemonSet
+- The privileged DaemonSet has elevated host access, which may violate security policies in many production environments

 ### Technical Limitations
 - **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
@@ -374,7 +374,7 @@ ChReK is **experimental/beta** and best suited for:

 1. Check the checkpoint job:
   ```bash
-   kubectl get jobs -l nvidia.com/checkpoint-source=true -n dynamo-system
+   kubectl get jobs -l nvidia.com/chrek-is-checkpoint-source=true -n dynamo-system
   kubectl logs job/checkpoint-<name> -n dynamo-system
   ```

@@ -430,10 +430,10 @@ Check logs for "Falling back to cold start" message.
 | Variable | Description |
 |----------|-------------|
 | `DYN_CHECKPOINT_STORAGE_TYPE` | Backend: `pvc`, `s3`, `oci` |
-| `DYN_CHECKPOINT_LOCATION` | Source location (URI) |
-| `DYN_CHECKPOINT_PATH` | Local path to tar file |
-| `DYN_CHECKPOINT_HASH` | Identity hash (debugging) |
-| `DYN_CHECKPOINT_SIGNAL_FILE` | Signal file (creation mode only) |
+| `DYN_CHECKPOINT_LOCATION` | Full checkpoint location (checkpoint jobs) |
+| `DYN_CHECKPOINT_PATH` | Base checkpoint directory (restore pods, PVC) |
+| `DYN_CHECKPOINT_HASH` | Identity hash |
+| `DYN_READY_FOR_CHECKPOINT_FILE` | Ready-for-checkpoint file path (checkpoint jobs) |

 ## Complete Example


--- a/docs/pages/kubernetes/chrek/standalone.md
+++ b/docs/pages/kubernetes/chrek/standalone.md
@@ -4,13 +4,14 @@
 title: Standalone Usage
 ---

-> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. Review the [security implications](#security-considerations) before deploying.
+> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The ChReK DaemonSet runs in privileged mode to perform CRIU operations. Review the [security implications](#security-considerations) before deploying.

 This guide explains how to use **ChReK** (Checkpoint/Restore for Kubernetes) as a standalone component without deploying the full Dynamo platform. This is useful if you want to add checkpoint/restore capabilities to your own GPU workloads.

 ## Table of Contents

 - [Overview](#overview)
+- [Using ChReK Without the Dynamo Operator](#using-chrek-without-the-dynamo-operator)
 - [Prerequisites](#prerequisites)
 - [Step 1: Deploy ChReK](#step-1-deploy-chrek)
 - [Step 2: Build Checkpoint-Enabled Images](#step-2-build-checkpoint-enabled-images)
@@ -27,7 +28,7 @@ This guide explains how to use **ChReK** (Checkpoint/Restore for Kubernetes) as
 When using ChReK standalone, you are responsible for:

 1. **Deploying the ChReK Helm chart** (DaemonSet + PVC)
-2. **Building checkpoint-enabled container images** with the restore entrypoint
+2. **Building checkpoint-enabled container images** with the CRIU runtime dependencies
 3. **Creating checkpoint jobs** with the correct environment variables
 4. **Creating restore pods** that detect and use the checkpoints

@@ -35,22 +36,89 @@ The ChReK DaemonSet handles the actual CRIU checkpoint/restore operations automa

 ---

+## Using ChReK Without the Dynamo Operator
+
+When using ChReK with the Dynamo operator, the operator automatically configures workload pods for checkpoint/restore. Without the operator, you must handle this configuration manually. This section documents what the operator normally injects and how to replicate it.
+
+### Container Naming
+
+The ChReK DaemonSet needs to identify which container in your pod is the model-serving workload (as opposed to sidecars like istio-proxy or log collectors). It resolves the target container by name:
+
+1. If a container is named `main`, it is selected
+2. Otherwise, the first container in the pod spec is selected
+
+When using the Dynamo operator, the model container is always named `main`. In standalone mode, you must either name your model container `main` or ensure it is the first container listed in your pod spec. All YAML examples in this guide use `name: main`.
+
+### Seccomp Profile
+
+The operator sets a seccomp profile on all checkpoint/restore workload pods to block `io_uring` syscalls. The ChReK DaemonSet deploys the profile file (`profiles/block-iouring.json`) to each node, but you must reference it in your pod specs:
+
+```yaml
+spec:
+  securityContext:
+    seccompProfile:
+      type: Localhost
+      localhostProfile: profiles/block-iouring.json
+```
+
+Without this profile, `io_uring` syscalls during restore can cause CRIU failures.
+
+### Sleep Infinity Command for Restore Pods
+
+The operator overrides the container command to `["sleep", "infinity"]` on restore-target pods. This produces a Running-but-not-Ready placeholder pod that the ChReK DaemonSet watcher detects and restores externally via `nsenter`. Without this override, the container runs its normal entrypoint (cold-starting instead of waiting for restore).
+
+```yaml
+containers:
+- name: main
+  image: my-app:checkpoint-enabled
+  command: ["sleep", "infinity"]
+```
+
+### Recreate Deployment Strategy
+
+The operator forces the `Recreate` strategy when restore labels are present. This prevents the old and new pods from running simultaneously, which would cause failures because both pods would compete for the same GPU checkpoint data. If you are using a Deployment, set this manually:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+spec:
+  strategy:
+    type: Recreate
+```
+
+### PVC Volume Mount Consistency
+
+CRIU requires identical mount layouts between checkpoint and restore. The operator ensures the checkpoint PVC is mounted at the same path in both the checkpoint job and restore pod. When configuring manually, make sure your checkpoint job and restore pod use the exact same `mountPath` for the checkpoint PVC (e.g., `/checkpoints`).
+
+### Downward API Volume (Currently Unused)
+
+The operator injects a Downward API volume at `/etc/podinfo` for post-restore identity discovery (pod name, namespace, UID). This is not currently consumed by any component — you can skip it for now.
+
+### Environment Variables
+
+The following environment variables are normally injected by the operator. They are already documented in the [Environment Variables Reference](#environment-variables-reference) below, but note that without the operator you must set them manually:
+
+- **Checkpoint jobs:** `DYN_READY_FOR_CHECKPOINT_FILE`, `DYN_CHECKPOINT_LOCATION`, `DYN_CHECKPOINT_STORAGE_TYPE`, `DYN_CHECKPOINT_HASH`
+- **Restore pods:** `DYN_CHECKPOINT_PATH`, `DYN_CHECKPOINT_HASH`
+
+---
+
 ## Prerequisites

 - Kubernetes cluster with:
  - NVIDIA GPUs with checkpoint support
-  - **Privileged security context allowed** (⚠️ required for CRIU - see [Security Considerations](#security-considerations))
+  - **Privileged DaemonSet allowed** (⚠️ the ChReK DaemonSet runs privileged - see [Security Considerations](#security-considerations))
  - PVC storage (ReadWriteMany recommended for multi-node)
 - Docker or compatible container runtime for building images
 - Access to the ChReK source code: `deploy/chrek/`

 ### Security Considerations

-⚠️ **Important**: ChReK restore operations **require privileged mode**, which has significant security implications:
+⚠️ **Important**: The ChReK **DaemonSet** runs in privileged mode to perform CRIU checkpoint/restore operations. Your workload pods (checkpoint jobs, restore pods) do **not** need privileged mode — all CRIU privilege lives in the DaemonSet, which performs external restore via `nsenter`.

- **Privileged containers** can access all host devices and bypass most security restrictions
+- **The DaemonSet** has `privileged: true`, `hostPID`, `hostIPC`, and `hostNetwork`
 - This may violate security policies in production environments
- Privileged containers, if compromised, can potentially compromise node security
+- If the DaemonSet is compromised, it could potentially compromise node security

 **Recommended for:**
 - ✅ Development and testing environments
@@ -108,7 +176,7 @@ kubectl get pvc -n my-app

 ## Step 2: Build Checkpoint-Enabled Images

-ChReK provides a convenient `placeholder` target in its Dockerfile that automatically injects checkpoint/restore capabilities into your existing container images.
+ChReK provides a `placeholder` target in its Dockerfile that layers CRIU runtime dependencies onto your existing container images. The DaemonSet performs restore externally by using `nsenter` to run `nsrestore` inside the pod's namespaces, so these dependencies must be present in the pod's image.

 ### Quick Start: Using the Placeholder Target (Recommended)

@@ -149,43 +217,14 @@ docker build \

 The ChReK Dockerfile's `placeholder` stage automatically:

- ✅ Builds the restore-entrypoint binary
- ✅ Injects it into `/usr/local/bin/restore-entrypoint`
- ✅ Adds `smart-entrypoint.sh` to `/usr/local/bin/`
- ✅ Sets executable permissions
- ✅ Configures the entrypoint to detect and restore checkpoints
- ✅ Preserves your original application CMD
-
-### Alternative: Manual Multi-Stage Build
-
-If you need more control, you can create your own Dockerfile:
-
-```dockerfile
-# Stage 1: Build restore-entrypoint
-FROM golang:1.23-alpine AS restore-builder
-WORKDIR /build
-COPY deploy/chrek/cmd/restore-entrypoint ./cmd/restore-entrypoint
-COPY deploy/chrek/pkg ./pkg
-COPY deploy/chrek/go.mod deploy/chrek/go.sum ./
-
-RUN go build -o /restore-entrypoint ./cmd/restore-entrypoint
-
-# Stage 2: Your application image
-FROM your-base-image:latest
+- ✅ Installs CRIU runtime libraries (required by `nsrestore` running inside the pod's namespaces)
+- ✅ Copies the `criu` binary to `/usr/local/sbin/criu`
+- ✅ Copies `cuda-checkpoint` to `/usr/local/sbin/cuda-checkpoint` (used for CUDA state checkpoint/restore)
+- ✅ Copies `nsrestore` to `/usr/local/bin/nsrestore` (invoked by DaemonSet via `nsenter`)
+- ✅ Creates checkpoint directories (`/checkpoints`, `/var/run/criu`, `/var/criu-work`)
+- ✅ Preserves your original application image contents

-# Copy restore-entrypoint
-COPY --from=restore-builder /restore-entrypoint /usr/local/bin/restore-entrypoint
-
-# Copy smart-entrypoint.sh
-COPY deploy/chrek/scripts/smart-entrypoint.sh /usr/local/bin/smart-entrypoint.sh
-RUN chmod +x /usr/local/bin/smart-entrypoint.sh /usr/local/bin/restore-entrypoint
-
-# Set smart-entrypoint as the default entrypoint
-ENTRYPOINT ["/usr/local/bin/smart-entrypoint.sh"]
-
-# Your application command (becomes CMD, can be overridden)
-CMD ["python", "your_app.py"]
-```
+The placeholder image does **not** override the entrypoint or CMD. For restore pods, the operator (or you, in standalone mode) overrides the command to `sleep infinity`.

 > **💡 Tip**: Using the `placeholder` target is the recommended approach as it's maintained with the ChReK codebase and ensures compatibility.

@@ -201,7 +240,6 @@ Your checkpoint job MUST set these environment variables:

 | Variable | Description | Example |
 |----------|-------------|---------|
-| `DYN_CHECKPOINT_SIGNAL_FILE` | Path where DaemonSet writes completion signal | `/checkpoint-signal/my-checkpoint.done` |
 | `DYN_READY_FOR_CHECKPOINT_FILE` | Path where your app signals it's ready | `/tmp/ready-for-checkpoint` |
 | `DYN_CHECKPOINT_HASH` | Unique identifier for this checkpoint | `abc123def456` |
 | `DYN_CHECKPOINT_LOCATION` | Directory where checkpoint is stored | `/checkpoints/abc123def456` |
@@ -213,7 +251,7 @@ Add this label to enable DaemonSet checkpoint detection:

 ```yaml
 labels:
-  nvidia.com/checkpoint-source: "true"
+  nvidia.com/chrek-is-checkpoint-source: "true"
 ```

 ### Example Checkpoint Job
@@ -228,39 +266,26 @@ spec:
  template:
    metadata:
      labels:
-        nvidia.com/checkpoint-source: "true"  # Required for DaemonSet detection
+        nvidia.com/chrek-is-checkpoint-source: "true"  # Required for DaemonSet detection
+        nvidia.com/chrek-checkpoint-hash: "abc123def456"  # Must match DYN_CHECKPOINT_HASH
    spec:
      restartPolicy: Never

-      # Init container to clean up stale signal files
-      initContainers:
-      - name: cleanup-signal-file
-        image: busybox:latest
-        command:
-        - sh
-        - -c
-        - |
-          rm -f /checkpoint-signal/my-checkpoint.done || true
-          echo "Signal file cleanup complete"
-        volumeMounts:
-        - name: checkpoint-signal
-          mountPath: /checkpoint-signal
+      # Seccomp profile to block io_uring syscalls (deployed by the ChReK DaemonSet)
+      securityContext:
+        seccompProfile:
+          type: Localhost
+          localhostProfile: profiles/block-iouring.json

      containers:
      - name: main
        image: my-app:checkpoint-enabled

-        # Security context required for CRIU
-        securityContext:
-          privileged: true
-          capabilities:
-            add: ["SYS_ADMIN", "SYS_PTRACE", "SYS_CHROOT"]
-
        # Readiness probe: Pod becomes Ready when model is loaded
        # This is what triggers the DaemonSet to start checkpointing
        readinessProbe:
          exec:
-            command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
+            command: ["cat", "/tmp/ready-for-checkpoint"]
          initialDelaySeconds: 15
          periodSeconds: 2

@@ -271,8 +296,6 @@ spec:

        # Checkpoint-related environment variables
        env:
-        - name: DYN_CHECKPOINT_SIGNAL_FILE
-          value: "/checkpoint-signal/my-checkpoint.done"
        - name: DYN_READY_FOR_CHECKPOINT_FILE
          value: "/tmp/ready-for-checkpoint"
        - name: DYN_CHECKPOINT_HASH
@@ -291,105 +314,94 @@ spec:
        volumeMounts:
        - name: checkpoint-storage
          mountPath: /checkpoints
-        - name: checkpoint-signal
-          mountPath: /checkpoint-signal
-        - name: tmp
-          mountPath: /tmp

      volumes:
      - name: checkpoint-storage
        persistentVolumeClaim:
          claimName: chrek-pvc
-      - name: checkpoint-signal
-        hostPath:
-          path: /var/lib/chrek/signals
-          type: DirectoryOrCreate
-      - name: tmp
-        emptyDir: {}
 ```

 ### Application Code Requirements

-Your application must implement the checkpoint flow. Here's the pattern used by Dynamo vLLM:
+Your application must implement the checkpoint flow. The DaemonSet communicates with your application via Unix signals (not files):
+
+- **`SIGUSR1`**: Checkpoint completed — your process should exit gracefully
+- **`SIGCONT`**: Restore completed — your process should wake up and continue
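+- **`SIGUSR2`**: Checkpoint/restore failed — install a handler if your process should keep running, because an unhandled `SIGUSR2` terminates the process by default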
+
+Here's the pattern used by Dynamo vLLM (see `components/src/dynamo/vllm/chrek.py`):

 ```python
+import asyncio
 import os
-import time
+import signal

-def main():
-    # 1. Check for checkpoint mode
-    signal_file = os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE")
+async def main():
    ready_file = os.environ.get("DYN_READY_FOR_CHECKPOINT_FILE")
-    restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE")
-
-    is_checkpoint_mode = signal_file is not None
-
-    if is_checkpoint_mode:
-        print("Checkpoint mode detected")
-
-        # 2. Load your model/application
-        model = load_model()
-
-        # 3. Optional: Put model to sleep to reduce memory footprint
-        # model.sleep()
-
-        # 4. Write ready file (for application use, not DaemonSet)
-        if ready_file:
-            with open(ready_file, "w") as f:
-                f.write("ready")
-            print(f"Wrote checkpoint ready file: {ready_file}")
-
-        # 5. Log readiness messages (helps debugging)
-        print("CHECKPOINT_READY: Model loaded, ready for container checkpoint")
-        print(f"CHECKPOINT_READY: Waiting for signal file: {signal_file}")
-        print(f"CHECKPOINT_READY: Or restore marker file: {restore_marker}")
-
-        # 6. Wait for checkpoint completion OR restore detection
-        while True:
-            # Check if we've been restored (marker file created by restore entrypoint)
-            if os.path.exists(restore_marker):
-                print(f"Detected restore from checkpoint (marker: {restore_marker})")
-                # Continue with normal application flow
-                break
-
-            # Check if checkpoint is complete (signal file created by DaemonSet)
-            if os.path.exists(signal_file):
-                print(f"Checkpoint signal file detected: {signal_file}")
-                print("Checkpoint complete, exiting")
-                return  # Exit gracefully
-
-            time.sleep(1)
-
-    # Normal application flow (or post-restore flow)
-    run_application()
+    if not ready_file:
+        # Not in checkpoint mode, run normally
+        await run_application()
+        return
+
+    print("Checkpoint mode detected")
+
+    # 1. Load your model/application
+    model = await load_model()
+
+    # 2. Optional: Put model to sleep for CRIU-friendly GPU state
+    await model.sleep()
+
+    # 3. Write ready file — triggers DaemonSet checkpoint via readiness probe
+    with open(ready_file, "w") as f:
+        f.write("ready")
+
+    # 4. Set up signal handlers and wait for DaemonSet
+    checkpoint_done = asyncio.Event()
+    restore_done = asyncio.Event()
+
+    loop = asyncio.get_running_loop()
+    loop.add_signal_handler(signal.SIGUSR1, checkpoint_done.set)
+    loop.add_signal_handler(signal.SIGCONT, restore_done.set)
+
+    print("Ready for checkpoint. Waiting for watcher signal...")
+
+    # Wait for whichever signal comes first
+    done, pending = await asyncio.wait(
+        [asyncio.create_task(checkpoint_done.wait()),
+         asyncio.create_task(restore_done.wait())],
+        return_when=asyncio.FIRST_COMPLETED,
+    )
+    for task in pending:
+        task.cancel()
+
+    if restore_done.is_set():
+        # SIGCONT: Process was restored from checkpoint
+        print("Restore complete, waking model")
+        await model.wake_up()
+        await run_application()
+    else:
+        # SIGUSR1: Checkpoint complete, exit
+        print("Checkpoint complete, exiting")
 ```

 **Important Notes:**

-1. **Ready File & Readiness Probe**: The checkpoint job must have a readiness probe that checks for the ready file:
-   ```yaml
-   readinessProbe:
-     exec:
-       command: ["sh", "-c", "cat ${DYN_READY_FOR_CHECKPOINT_FILE}"]
-     initialDelaySeconds: 15
-     periodSeconds: 2
-   ```
-   The ChReK DaemonSet triggers checkpointing when:
-   - Pod has `nvidia.com/checkpoint-source: "true"` label
+1. **Ready File & Readiness Probe**: The checkpoint job must have a readiness probe that checks for the ready file. The ChReK DaemonSet triggers checkpointing when:
+   - Pod has `nvidia.com/chrek-is-checkpoint-source: "true"` label and a `nvidia.com/chrek-checkpoint-hash` label
   - Pod status is `Ready` (readiness probe passes = ready file exists)

-2. **Restore Marker**: Created by `restore-entrypoint` before CRIU restore, allows the restored process to detect it was restored
+2. **Signal-based coordination**: The DaemonSet sends `SIGUSR1` after checkpoint completes and `SIGCONT` after restore completes. Your application must handle these signals (not poll for files).

-3. **Two Exit Paths**:
-   - **Signal file found**: Checkpoint complete, exit gracefully
-   - **Restore marker found**: Process was restored, continue running
+3. **Two outcomes**:
+   - **SIGUSR1 received**: Checkpoint complete, exit gracefully
+   - **SIGCONT received**: Process was restored, wake model and continue


 ---

 ## Step 4: Restore from Checkpoints

-Restore pods automatically detect and restore from checkpoints if they exist.
+The DaemonSet performs restore externally — your restore pod just needs to be a placeholder that sleeps until the DaemonSet restores the checkpointed process into it.

 ### Example Restore Pod

@@ -399,18 +411,26 @@ kind: Pod
 metadata:
  name: my-app-restored
  namespace: my-app
+  labels:
+    nvidia.com/chrek-is-restore-target: "true"  # Required: watcher detects restore pods by this label
+    nvidia.com/chrek-checkpoint-hash: "abc123def456"  # Required: watcher uses this to locate the checkpoint
 spec:
  restartPolicy: Never

+  # Seccomp profile to block io_uring syscalls (deployed by the chrek DaemonSet)
+  # Without this, io_uring syscalls may cause CRIU restore failures
+  securityContext:
+    seccompProfile:
+      type: Localhost
+      localhostProfile: profiles/block-iouring.json
+
  containers:
  - name: main
    image: my-app:checkpoint-enabled

-    # Security context required for CRIU restore
-    securityContext:
-      privileged: true
-      capabilities:
-        add: ["SYS_ADMIN", "SYS_PTRACE", "SYS_CHROOT"]
+    # Override command to sleep — the chrek DaemonSet performs external restore
+    # on Running-but-not-Ready pods. Without this, the container would cold-start.
+    command: ["sleep", "infinity"]

    # Set checkpoint environment variables
    env:
@@ -419,38 +439,28 @@ spec:
    - name: DYN_CHECKPOINT_PATH
      value: "/checkpoints"  # Base path (hash appended automatically)

-    - name: DYN_RESTORE_MARKER_FILE
-      value: "/tmp/dynamo-restored"
-
    # GPU request
    resources:
      limits:
        nvidia.com/gpu: 1

-    # Mount checkpoint storage (READ-ONLY is fine for restore)
+    # CRIU needs write access for restore.log — do NOT set readOnly
    volumeMounts:
    - name: checkpoint-storage
      mountPath: /checkpoints
-      readOnly: true
-    - name: checkpoint-signal
-      mountPath: /checkpoint-signal

  volumes:
  - name: checkpoint-storage
    persistentVolumeClaim:
      claimName: chrek-pvc
-  - name: checkpoint-signal
-    hostPath:
-      path: /var/lib/chrek/signals
-      type: DirectoryOrCreate
 ```

 ### How Restore Works

-1. **Smart Entrypoint Detects Checkpoint**: The `smart-entrypoint.sh` checks if a checkpoint exists at `/checkpoints/${DYN_CHECKPOINT_HASH}/`
-2. **Calls Restore Entrypoint**: If found, calls `/usr/local/bin/restore-entrypoint` which invokes CRIU
-3. **CRIU Restores Process**: The entire process tree is restored from the checkpoint, including GPU state
-4. **Application Continues**: Your application resumes exactly where it was checkpointed
+1. **Pod starts as placeholder**: The `sleep infinity` command keeps the pod Running but not Ready
+2. **DaemonSet detects restore pod**: The watcher finds pods with `nvidia.com/chrek-is-restore-target=true` that are Running but not Ready
+3. **External restore via nsenter**: The DaemonSet enters the pod's namespaces and performs CRIU restore, including GPU state
+4. **Application continues**: Your application resumes exactly where it was checkpointed

 ---

@@ -460,10 +470,9 @@ spec:

 | Variable | Required | Description |
 |----------|----------|-------------|
-| `DYN_CHECKPOINT_SIGNAL_FILE` | Yes | Full path to signal file (e.g., `/checkpoint-signal/my-checkpoint.done`) |
 | `DYN_READY_FOR_CHECKPOINT_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/ready-for-checkpoint`) |
-| `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (alphanumeric string) |
-| `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123`) |
+| `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (16-char hex string; the `abc123def456` value used in this guide's examples is a shortened placeholder) |
+| `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123def456`) |
 | `DYN_CHECKPOINT_STORAGE_TYPE` | Yes | Storage backend: `pvc`, `s3`, or `oci` |

 ### Restore Pods
@@ -472,22 +481,18 @@ spec:
 |----------|----------|-------------|
 | `DYN_CHECKPOINT_HASH` | Yes | Checkpoint identifier (must match checkpoint job) |
 | `DYN_CHECKPOINT_PATH` | Yes | Base checkpoint directory (hash appended automatically) |
-| `DYN_RESTORE_MARKER_FILE` | Yes | Path for restore marker file |
-
-### Optional CRIU Tuning (Advanced)
-
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `CRIU_TIMEOUT` | `0` (unlimited) | CRIU operation timeout in seconds |
-| `CRIU_LOG_LEVEL` | `4` | CRIU log verbosity (0-4) |
-| `CRIU_WORK_DIR` | `/tmp` | CRIU working directory |
-| `CUDA_PLUGIN_DIR` | `/usr/local/lib/criu` | Path to CRIU CUDA plugin |
-| `CRIU_SKIP_IN_FLIGHT` | `false` | Skip in-flight TCP connections |
-| `CRIU_AUTO_DEDUP` | `false` | Enable auto-deduplication |
-| `CRIU_LAZY_PAGES` | `false` | Enable lazy page migration (experimental) |
-| `WAIT_FOR_CHECKPOINT` | `false` | Wait for checkpoint to appear before starting |
-| `RESTORE_WAIT_TIMEOUT` | `300` | Max seconds to wait for checkpoint |
-| `DEBUG` | `false` | Enable debug mode (sleeps 300s on error) |
+
+### Signals (DaemonSet → Application)
+
+The DaemonSet communicates checkpoint/restore completion via Unix signals, not files:
+
+| Signal | Direction | Meaning |
+|--------|-----------|---------|
+| `SIGUSR1` | DaemonSet → checkpoint pod | Checkpoint completed, process should exit |
+| `SIGCONT` | DaemonSet → restored pod | Restore completed, process should wake up |
+| `SIGUSR2` | DaemonSet → checkpoint pod | Checkpoint failed (wake process to continue) |
+
+CRIU tuning options are configured via the ChReK Helm chart's `config.checkpoint.criu` values, not environment variables. See the [Helm Chart Values](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/chrek/values.yaml) for available options.

 ---

@@ -497,7 +502,7 @@ spec:

 ```
 ┌─────────────────────────────────────────────────────────────┐
-│ 1. Pod starts with nvidia.com/checkpoint-source=true label  │
+│ 1. Pod has nvidia.com/chrek-is-checkpoint-source=true label │
 └──────────────────────┬──────────────────────────────────────┘
                       │
                       ▼
@@ -515,13 +520,13 @@ spec:
 ┌─────────────────────────────────────────────────────────────┐
 │ 4. ChReK DaemonSet detects:                                 │
 │    - Pod is Ready                                            │
-│    - Has checkpoint-source label                             │
-│    - Ready file exists: /tmp/ready-for-checkpoint           │
+│    - Has chrek-is-checkpoint-source label                     │
+│    - Has chrek-checkpoint-hash label                         │
 └──────────────────────┬──────────────────────────────────────┘
                       │
                       ▼
 ┌─────────────────────────────────────────────────────────────┐
-│ 5. DaemonSet executes CRIU checkpoint via runc:             │
+│ 5. DaemonSet executes CRIU checkpoint:                      │
 │    - Freezes container process                               │
 │    - Dumps memory (CPU + GPU)                                │
 │    - Saves to /checkpoints/${HASH}/                          │
@@ -529,13 +534,12 @@ spec:
                       │
                       ▼
 ┌─────────────────────────────────────────────────────────────┐
-│ 6. DaemonSet writes signal file:                            │
-│    /checkpoint-signal/${HASH}.done                           │
+│ 6. DaemonSet sends SIGUSR1 to the application process       │
 └──────────────────────┬──────────────────────────────────────┘
                       │
                       ▼
 ┌─────────────────────────────────────────────────────────────┐
-│ 7. Application detects signal file and exits gracefully     │
+│ 7. Application receives SIGUSR1 and exits gracefully        │
 └─────────────────────────────────────────────────────────────┘
 ```

@@ -543,44 +547,34 @@ spec:

 ```
 ┌─────────────────────────────────────────────────────────────┐
-│ 1. Pod starts with DYN_CHECKPOINT_HASH set                  │
+│ 1. Pod starts with restore labels and sleep infinity        │
+│    (Running but not Ready)                                   │
 └──────────────────────┬──────────────────────────────────────┘
                       │
                       ▼
 ┌─────────────────────────────────────────────────────────────┐
-│ 2. smart-entrypoint.sh checks for checkpoint:               │
-│    /checkpoints/${DYN_CHECKPOINT_HASH}/checkpoint.done      │
-└──────────────────────┬──────────────────────────────────────┘
-                       │
-                       ├─ Not Found ─────────────────┐
-                       │                              │
-                       ▼                              ▼
-           ┌───────────────────────┐    ┌──────────────────────┐
-           │ Checkpoint exists     │    │ Cold start           │
-           └──────────┬────────────┘    │ Run original CMD     │
-                      │                 └──────────────────────┘
-                      ▼
-┌─────────────────────────────────────────────────────────────┐
-│ 3. Call restore-entrypoint with checkpoint path             │
+│ 2. ChReK DaemonSet detects:                                 │
+│    - Pod is Running but not Ready                            │
+│    - Has chrek-is-restore-target label                       │
+│    - Has chrek-checkpoint-hash label                         │
 └──────────────────────┬──────────────────────────────────────┘
                       │
                       ▼
 ┌─────────────────────────────────────────────────────────────┐
-│ 4. restore-entrypoint extracts checkpoint and calls CRIU:   │
-│    criu restore --images-dir /checkpoints/${HASH}/images    │
+│ 3. DaemonSet performs external restore via nsenter:          │
+│    - Enters pod's namespaces (mount, net, pid, ipc)         │
+│    - Runs nsrestore with CRIU inside the pod's context      │
+│    - Restores memory (CPU + GPU via cuda-checkpoint)        │
 └──────────────────────┬──────────────────────────────────────┘
                       │
                       ▼
 ┌─────────────────────────────────────────────────────────────┐
-│ 5. CRIU restores process from checkpoint                    │
-│    - Restores memory (CPU + GPU)                             │
-│    - Restores file descriptors                               │
-│    - Resumes process execution                               │
+│ 4. DaemonSet sends SIGCONT to the restored process           │
 └──────────────────────┬──────────────────────────────────────┘
                       │
                       ▼
 ┌─────────────────────────────────────────────────────────────┐
-│ 6. Application continues from checkpointed state            │
+│ 5. Application receives SIGCONT, wakes model, continues      │
 │    (Model already loaded, GPU memory initialized)           │
 └─────────────────────────────────────────────────────────────┘
 ```
@@ -596,7 +590,7 @@ spec:
 **Checks**:
 1. Verify the pod has the label:
   ```bash
-   kubectl get pod <pod-name> -o jsonpath='{.metadata.labels.nvidia\.com/checkpoint-source}'
+   kubectl get pod <pod-name> -o jsonpath='{.metadata.labels.nvidia\.com/chrek-is-checkpoint-source}'
   ```

 2. Check pod readiness:
@@ -624,61 +618,49 @@ spec:
   kubectl exec <pod-name> -- ls -la /checkpoints/${DYN_CHECKPOINT_HASH}/
   ```

-2. Check privileged mode is enabled:
+2. Check DaemonSet logs for restore errors:
   ```bash
-   kubectl get pod <pod-name> -o jsonpath='{.spec.containers[0].securityContext.privileged}'
+   kubectl logs -n my-app daemonset/chrek-agent --all-containers
   ```

-3. Check CRIU logs in `/tmp/criu-restore.log`:
+3. Check pod events and annotations for restore status:
   ```bash
-   kubectl exec <pod-name> -- cat /tmp/criu-restore.log
+   kubectl describe pod <pod-name>
   ```

 4. Ensure checkpoint and restore have same:
-   - Container image
+   - Container image (built with `placeholder` target)
   - GPU count
-   - Volume mounts
-   - Environment variables (except POD_NAME, POD_IP, etc.)
-
-### Permission Denied Errors
-
-**Symptom**: `CRIU: Permission denied` or `Operation not permitted`
+   - Volume mounts (same `mountPath` for checkpoint PVC)

-**Solution**: Ensure pod has:
-```yaml
-securityContext:
-  privileged: true
-  capabilities:
-    add:
-    - SYS_ADMIN
-    - SYS_PTRACE
-    - SYS_CHROOT
-```
+### Restore Pod Not Detected

-### Signal File Not Appearing
-
-**Symptom**: Application waits forever for signal file
+**Symptom**: Pod runs `sleep infinity` but DaemonSet never restores it

 **Checks**:
-1. Verify hostPath mount is correct:
+1. Verify the pod has the required labels:
   ```bash
-   kubectl get pod <pod-name> -o jsonpath='{.spec.volumes[?(@.name=="checkpoint-signal")]}'
+   kubectl get pod <pod-name> -o jsonpath='{.metadata.labels}'
   ```
+   Must have both `nvidia.com/chrek-is-restore-target: "true"` and `nvidia.com/chrek-checkpoint-hash: "<hash>"`.

-2. Check DaemonSet has access to the same path:
+2. Verify the pod is Running but not Ready (this is the trigger):
   ```bash
-   kubectl get daemonset -n my-app chrek-agent -o jsonpath='{.spec.template.spec.volumes[?(@.name=="signal-dir")]}'
+   kubectl get pod <pod-name> -o jsonpath='{.status.phase}'
+   kubectl get pod <pod-name> -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
   ```

-3. Verify paths match exactly:
-   - Pod: `/var/lib/chrek/signals`
-   - DaemonSet: `/var/lib/chrek/signals`
+3. Verify the DaemonSet is running on the same node:
+   ```bash
+   kubectl get pods -n my-app -l app.kubernetes.io/name=chrek -o wide
+   ```

 ---

 ## Additional Resources

 - [ChReK Helm Chart Values](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/chrek/values.yaml)
+- [Dynamo vLLM ChReK Integration](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/chrek.py) - Reference signal handler implementation
 - [ChReK Dockerfile](https://github.com/ai-dynamo/dynamo/tree/main/deploy/chrek/Dockerfile)
 - [CRIU Documentation](https://criu.org/Main_Page)
 - [CUDA Checkpoint Utility](https://github.com/NVIDIA/cuda-checkpoint)