w.log.V(1).Info("Checkpoint not ready on disk, skipping restore","pod",podKey,"checkpoint_hash",checkpointHash,"checkpoint_location",checkpointLocation)
return
return
}
}
if!w.tryAcquire(podKey){
containerName:=resolveMainContainerName(pod)
ifcontainerName==""{
w.log.Info("Restore pod has no containers","pod",podKey)
@@ -261,53 +317,56 @@ func (w *Watcher) handleRestorePodEvent(ctx context.Context, pod *corev1.Pod) {
...
@@ -261,53 +317,56 @@ func (w *Watcher) handleRestorePodEvent(ctx context.Context, pod *corev1.Pod) {
emitPodEvent(ctx,w.clientset,w.log,pod,"snapshot",corev1.EventTypeNormal,"RestoreRequested",fmt.Sprintf("Restore requested from checkpoint %s",checkpointHash))
emitPodEvent(ctx,w.clientset,w.log,pod,"snapshot",corev1.EventTypeNormal,"RestoreRequested",fmt.Sprintf("Restore requested from checkpoint %s",checkpointHash))
emitPodEvent(ctx,w.clientset,log,pod,"snapshot",corev1.EventTypeNormal,"RestoreSucceeded",fmt.Sprintf("Restore completed from checkpoint %s",checkpointHash))
emitPodEvent(ctx,w.clientset,log,pod,"snapshot",corev1.EventTypeNormal,"RestoreSucceeded",fmt.Sprintf("Restore completed from checkpoint %s",checkpointHash))
...
@@ -496,7 +571,7 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
...
@@ -496,7 +571,7 @@ func (w *Watcher) doRestore(ctx context.Context, pod *corev1.Pod, checkpointHash
| `podTemplateSpec` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | PodTemplateSpec allows customizing the checkpoint Job pod<br/>This should include the container that runs the workload to be checkpointed | | Required: \{\}<br/> |
| `podTemplateSpec` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | PodTemplateSpec allows customizing the checkpoint Job pod<br/>This should include the container that runs the workload to be checkpointed | | Required: \{\}<br/> |
| `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Optional: \{\}<br/> |
| `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm for the checkpoint Job pod.<br/>When omitted, checkpoint Jobs use the same default 8Gi tmpfs as Dynamo components. | | Optional: \{\}<br/> |
| `backoffLimit` _integer_ | BackoffLimit specifies the number of retries before marking the Job failed | 3 | Optional: \{\}<br/> |
| `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Minimum: 1 <br/>Optional: \{\}<br/> |
| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Optional: \{\}<br/> |
| `backoffLimit` _integer_ | Deprecated: BackoffLimit is ignored. Checkpoint Jobs never retry. | | Minimum: 0 <br/>Optional: \{\}<br/> |
| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Minimum: 0 <br/>Optional: \{\}<br/> |
#### DynamoCheckpointPhase
#### DynamoCheckpointPhase
...
@@ -324,7 +325,7 @@ _Appears in:_
...
@@ -324,7 +325,7 @@ _Appears in:_
| `jobName` _string_ | JobName is the name of the checkpoint creation Job | | Optional: \{\}<br/> |
| `jobName` _string_ | JobName is the name of the checkpoint creation Job | | Optional: \{\}<br/> |
| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint tar was created | | Optional: \{\}<br/> |
| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint tar was created | | Optional: \{\}<br/> |
| `message` _string_ | Message provides additional information about the current state | | Optional: \{\}<br/> |
| `message` _string_ | Message provides additional information about the current state | | Optional: \{\}<br/> |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represent the latest available observations of the checkpoint's state | | Optional: \{\}<br/> |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | DEPRECATED: Conditions are deprecated. Use status.phase instead. | | Optional: \{\}<br/> |
#### DynamoCheckpointStorageType
#### DynamoCheckpointStorageType
...
@@ -1155,7 +1156,7 @@ _Appears in:_
...
@@ -1155,7 +1156,7 @@ _Appears in:_
| --- | --- | --- | --- |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled indicates whether checkpointing is enabled for this service | false | Optional: \{\} <br /> |
| `enabled` _boolean_ | Enabled indicates whether checkpointing is enabled for this service | false | Optional: \{\} <br /> |
| `mode` _[CheckpointMode](#checkpointmode)_ | Mode defines how checkpoint creation is handled<br />- Auto: DGD controller creates Checkpoint CR automatically<br />- Manual: User must create Checkpoint CR | Auto | Enum: [Auto Manual] <br />Optional: \{\} <br /> |
| `mode` _[CheckpointMode](#checkpointmode)_ | Mode defines how checkpoint creation is handled<br />- Auto: DGD controller creates Checkpoint CR automatically<br />- Manual: User must create Checkpoint CR | Auto | Enum: [Auto Manual] <br />Optional: \{\} <br /> |
| `checkpointRef` _string_ | CheckpointRef references an existing Checkpoint CR to use<br />If specified, Identity is ignored and this checkpoint is used directly | | Optional: \{\} <br /> |
| `checkpointRef` _string_ | CheckpointRef references an existing DynamoCheckpoint CR by metadata.name.<br />If specified, this service's Identity is ignored and the referenced checkpoint is used directly. | | Optional: \{\} <br /> |
| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the checkpoint identity for hash computation<br />Used when Mode is Auto or when looking up existing checkpoints<br />Required when checkpointRef is not specified | | Optional: \{\} <br /> |
| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the checkpoint identity for hash computation<br />Used when Mode is Auto or when looking up existing checkpoints<br />Required when checkpointRef is not specified | | Optional: \{\} <br /> |
...
@@ -1174,7 +1175,7 @@ _Appears in:_
...
@@ -1174,7 +1175,7 @@ _Appears in:_
| --- | --- | --- | --- |
| --- | --- | --- | --- |
| `checkpointName` _string_ | CheckpointName is the name of the associated Checkpoint CR | | Optional: \{\} <br /> |
| `checkpointName` _string_ | CheckpointName is the name of the associated Checkpoint CR | | Optional: \{\} <br /> |
| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity | | Optional: \{\} <br /> |
| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity | | Optional: \{\} <br /> |
| `ready` _boolean_ | Ready indicates if the checkpoint is ready for use | | Optional: \{\} <br /> |
| `ready` _boolean_ | Ready indicates if the checkpoint was visible to the worker at startup | | Optional: \{\} <br /> |
| `DYN_CHECKPOINT_PATH` | Base directory where checkpoint data is stored | From operator checkpoint config `storage.pvc.basePath` | `string` | PVC storage type |
| `DYN_CHECKPOINT_LOCATION` | Full checkpoint URI (for non-PVC backends) | — | `string` | S3 or OCI storage type |
| `DYN_CHECKPOINT_HASH` | Identity hash that uniquely identifies the checkpoint | — | `string` | Always set when checkpoint is enabled |
| `SKIP_WAIT_FOR_CHECKPOINT` | Skips the checkpoint readiness polling loop; checks once and proceeds | — | `string` | Set on restored and DGD pods |
## Service Accounts
## Service Accounts
The following component types automatically receive dedicated service accounts:
The following component types automatically receive dedicated service accounts:
On the first rollout, the worker cold-starts, the operator creates a `DynamoCheckpoint`, and the checkpoint Job writes data into `snapshot-pvc`.
On the first rollout, the worker cold-starts, the operator resolves the checkpoint identity hash, and the checkpoint Job writes a new checkpoint directory into `snapshot-pvc`.
### 5. Wait for the checkpoint to become ready
### 5. Wait for the checkpoint to become ready
Capture the checkpoint name from DGD status, then wait for the `DynamoCheckpoint` phase to become `Ready`:
Auto mode resolves checkpoints by identity hash. It may create `checkpoint-<hash>` or reuse an existing checkpoint with a different CR name. For the sample identity above, the hash is `73e74442beb109ed`:
```bash
```bash
CHECKPOINT_NAME=$(kubectl get dgd vllm-snapshot-demo -n${NAMESPACE}\
The DGD status also reports the computed checkpoint hash at `.status.checkpoints.VllmDecodeWorker.identityHash`.
If you change the checkpoint identity, the hash changes and so does the checkpoint selected by Auto mode.
### 6. Trigger restore
### 6. Trigger restore
...
@@ -218,7 +199,7 @@ New worker pods for `VllmDecodeWorker` will restore from the ready checkpoint au
...
@@ -218,7 +199,7 @@ New worker pods for `VllmDecodeWorker` will restore from the ready checkpoint au
### Auto Mode (Recommended)
### Auto Mode (Recommended)
The operator computes the checkpoint identity hash, looks for an existing `DynamoCheckpoint` with a matching `nvidia.com/snapshot-checkpoint-hash` label, and creates one if it does not find one:
The operator computes the checkpoint identity hash, looks up an existing `DynamoCheckpoint` by that hash, and creates a new `DynamoCheckpoint` only when no matching checkpoint already exists:
```yaml
```yaml
checkpoint:
checkpoint:
...
@@ -232,7 +213,12 @@ checkpoint:
...
@@ -232,7 +213,12 @@ checkpoint:
maxModelLen:4096
maxModelLen:4096
```
```
When a service uses checkpointing, DGD status reports the resolved `checkpointName`, `identityHash`, and `ready` fields under `.status.checkpoints.<service-name>`.
The `DynamoGraphDeployment` mirrors checkpoint resolution state under `.status.checkpoints`, including the resolved checkpoint CR name, identity hash, and whether the checkpoint was visible to the worker when it started:
```bash
kubectl get dgd vllm-snapshot-demo -n${NAMESPACE}\
@@ -241,26 +227,26 @@ Use `checkpointRef` when you want a service to restore from a specific `DynamoCh
...
@@ -241,26 +227,26 @@ Use `checkpointRef` when you want a service to restore from a specific `DynamoCh
```yaml
```yaml
checkpoint:
checkpoint:
enabled:true
enabled:true
checkpointRef:"qwen3-06b-vllm-prewarm"
checkpointRef:"qwen3-06b-bf16"
```
```
This is useful when:
This is useful when:
- You want to **pre-warm checkpoints** before creating DGDs
- You want to **pre-warm checkpoints** before creating DGDs
- You want **explicit control** over which checkpoint to use
- You want **explicit control** over which checkpoint to use
`checkpointRef` resolves by `DynamoCheckpoint.metadata.name`, not by `status.identityHash`. A manual checkpoint can use any valid Kubernetes resource name.
`checkpointRef` resolves by `DynamoCheckpoint.metadata.name`. Use a readable CR name when you want an explicit checkpoint that operators can reference directly.
If you are managing checkpoint CRs yourself, set `mode: Manual` on the service to prevent the operator from creating a new `DynamoCheckpoint` when identity-based lookup does not find one.
If you are managing checkpoint CRs yourself, set `mode: Manual` on the service to prevent the operator from creating a new `DynamoCheckpoint` when identity-based lookup does not find one.
```bash
```bash
# Check checkpoint status by CR name
# Check checkpoint status by CR name
kubectl get dynamocheckpoint qwen3-06b-vllm-prewarm -n ${NAMESPACE}
kubectl get dynamocheckpoint qwen3-06b-bf16 -n ${NAMESPACE}
# Now create DGD referencing it
# Now create DGD referencing it
kubectl apply -f my-dgd.yaml -n${NAMESPACE}
kubectl apply -f my-dgd.yaml -n${NAMESPACE}
```
```
If you want `mode: Auto` DGDs to discover a manually created checkpoint by identity, add the label `nvidia.com/snapshot-checkpoint-hash=<identity-hash>` to that `DynamoCheckpoint`. Auto-created checkpoints already use that label, and currently use the same hash as the CR name.
`mode: Auto` still resolves checkpoints by identity hash. The operator backfills `status.identityHash` and the `nvidia.com/snapshot-checkpoint-hash` label on each `DynamoCheckpoint` so auto lookup and uniqueness checks do not depend on the CR name.
## Checkpoint Identity
## Checkpoint Identity
...
@@ -309,7 +295,8 @@ The `DynamoCheckpoint` (shortname: `dckpt`) is a Kubernetes Custom Resource that
...
@@ -309,7 +295,8 @@ The `DynamoCheckpoint` (shortname: `dckpt`) is a Kubernetes Custom Resource that
- **Pre-warming:** Create checkpoints before deploying DGDs for instant startup
- **Pre-warming:** Create checkpoints before deploying DGDs for instant startup
- **Explicit control:** Manage checkpoint lifecycle independently from DGDs
- **Explicit control:** Manage checkpoint lifecycle independently from DGDs
The operator requires `spec.identity` and `spec.job.podTemplateSpec`. The pod template should match the worker container you want checkpointed, including image, command, args, secrets, volumes, and resource limits. You do not need to set the checkpoint environment variables manually; the operator injects them for checkpoint jobs and restored pods.
The operator requires `spec.identity` and `spec.job.podTemplateSpec`. The pod template should match the worker container you want checkpointed, including image, command, args, secrets, volumes, and resource limits. You do not need to set checkpoint-control plumbing manually; the operator injects the checkpoint-ready signal path for checkpoint Jobs and adds the restore metadata consumed by restored pods and the node-local controller inside the `snapshot-agent` DaemonSet.
`spec.job.backoffLimit` is deprecated and ignored. Checkpoint Jobs are always single-attempt.
**Create a checkpoint:**
**Create a checkpoint:**
...
@@ -317,9 +304,7 @@ The operator requires `spec.identity` and `spec.job.podTemplateSpec`. The pod te
...
@@ -317,9 +304,7 @@ The operator requires `spec.identity` and `spec.job.podTemplateSpec`. The pod te
apiVersion:nvidia.com/v1alpha1
apiVersion:nvidia.com/v1alpha1
kind:DynamoCheckpoint
kind:DynamoCheckpoint
metadata:
metadata:
name:qwen3-06b-vllm-prewarm
name:qwen3-06b-bf16
labels:
nvidia.com/snapshot-checkpoint-hash:"e5962d34ba272638"# Add this if Auto-mode identity lookup should find the CR
spec:
spec:
identity:
identity:
model:Qwen/Qwen3-0.6B
model:Qwen/Qwen3-0.6B
...
@@ -330,7 +315,6 @@ spec:
...
@@ -330,7 +315,6 @@ spec:
job:
job:
activeDeadlineSeconds:3600
activeDeadlineSeconds:3600
backoffLimit:3
ttlSecondsAfterFinished:300
ttlSecondsAfterFinished:300
podTemplateSpec:
podTemplateSpec:
spec:
spec:
...
@@ -345,18 +329,19 @@ spec:
...
@@ -345,18 +329,19 @@ spec:
args:
args:
---model
---model
-Qwen/Qwen3-0.6B
-Qwen/Qwen3-0.6B
---disable-custom-all-reduce
env:
env:
-name:GLOO_SOCKET_IFNAME
-name:NCCL_DEBUG
value:lo
value:ERROR
-name:NCCL_SOCKET_IFNAME
-name:TORCH_CPP_LOG_LEVEL
value:lo
value:ERROR
-name:TORCH_DISTRIBUTED_DEBUG
value:"OFF"
resources:
resources:
limits:
limits:
nvidia.com/gpu:"1"
nvidia.com/gpu:"1"
```
```
You can name the CR however you want if you plan to use `checkpointRef`. If you want `mode: Auto` identity lookup to find a manual CR, set the `nvidia.com/snapshot-checkpoint-hash` label to the computed 16-character identity hash. Using the hash as the CR name is a convenient convention, but it is not required.
For this example identity, the operator computes a deterministic identity hash and stores it in `status.identityHash`. Auto mode uses that hash, not the CR name, when it decides whether to reuse or create a checkpoint.
**Check status:**
**Check status:**
...
@@ -366,9 +351,9 @@ kubectl get dynamocheckpoint -n ${NAMESPACE}
...
@@ -366,9 +351,9 @@ kubectl get dynamocheckpoint -n ${NAMESPACE}
`Ready` is a value in `status.phase`, not a Kubernetes condition. The `conditions` array tracks job lifecycle events:
| Condition Type | Meaning |
|----------------|---------|
| `JobCreated` | The checkpoint Job has been created |
| `JobCompleted` | The checkpoint Job has completed successfully or failed |
Other useful status fields are:
Other useful status fields are:
| Field | Meaning |
| Field | Meaning |
|-------|---------|
|-------|---------|
| `status.identityHash` | Deterministic hash of `spec.identity` used for auto lookup and reuse |
| `status.jobName` | Name of the checkpoint Job |
| `status.jobName` | Name of the checkpoint Job |
| `status.identityHash` | Computed 16-character hash for the checkpoint identity |
| `status.location` | Checkpoint location in the configured storage backend |
| `status.location` | Checkpoint location in the configured storage backend |
| `status.storageType` | Storage backend type (`pvc`, `s3`, or `oci`) |
| `status.storageType` | Storage backend type (`pvc`, `s3`, or `oci`) |
| `status.createdAt` | Timestamp recorded when the checkpoint becomes ready |
| `status.createdAt` | Timestamp recorded when the checkpoint becomes ready |
| `status.message` | Failure or progress message when available |
| `status.message` | Failure or progress message when available |
`status.conditions` is deprecated for `DynamoCheckpoint`. The legacy condition types `JobCreated` and `JobCompleted` are kept for compatibility only. Prefer `status.phase`, `status.jobName`, and `status.message` when checking checkpoint progress.
Or use `mode: Auto` with the same identity and snapshot-hash label, and the operator will reuse it automatically.
Or use `mode: Auto` with the same identity, and the operator will reuse the same deterministic checkpoint object automatically.
## Limitations
## Limitations
- **LLM workers only**: Checkpoint/restore supports LLM decode and prefill workers. Specialized workers (multimodal, embedding, diffusion) are not supported.
- **LLM workers only**: Checkpoint/restore supports LLM decode and prefill workers. Specialized workers (multimodal, embedding, diffusion) are not supported.
- **Single-GPU only**: Multi-GPU configurations may work in very basic hardware configurations, but are not officially supported yet.
- **Single-GPU only**: Multi-GPU configurations may work in very basic hardware configurations, but are not officially supported yet.
- **Network state**: No active TCP connections can be checkpointed.
- **Network state**: Restore is sensitive to live TCP socket state. Loopback bootstrap/control sockets can work with the supported CRIU TCP policies, but non-loopback or pod-IP-bound connections can still break restore.
- **Security**: Dynamo Snapshot runs as a **privileged DaemonSet** which is required to run CRIU and cuda-checkpoint. However, workload pods do not need to be privileged.
- **Security**: Dynamo Snapshot runs as a **privileged DaemonSet** which is required to run CRIU and cuda-checkpoint. However, workload pods do not need to be privileged.
## Troubleshooting
## Troubleshooting
...
@@ -451,7 +424,10 @@ Or use `mode: Auto` with the same identity and snapshot-hash label, and the oper
...
@@ -451,7 +424,10 @@ Or use `mode: Auto` with the same identity and snapshot-hash label, and the oper