r.GetRecorder().Eventf(parentResource,corev1.EventTypeWarning,fmt.Sprintf("Get%s",resourceType),"Failed to get %s %s: %s",resourceType,resourceNamespace,err)
r.GetRecorder().Eventf(resource,corev1.EventTypeWarning,fmt.Sprintf("Get%s",resourceType),"Failed to get %s %s: %s",resourceType,resourceNamespace,err)
logs.Error(err,"Failed to set controller reference.")
r.GetRecorder().Eventf(parentResource,corev1.EventTypeWarning,"SetControllerReference","Failed to set controller reference for %s %s: %s",resourceType,resourceNamespace,err)
r.GetRecorder().Eventf(resource,corev1.EventTypeWarning,"SetControllerReference","Failed to set controller reference for %s %s: %s",resourceType,resourceNamespace,err)
r.GetRecorder().Eventf(parentResource,corev1.EventTypeWarning,"GetSpecHash","Failed to get spec hash for %s %s: %s",resourceType,resourceNamespace,err)
r.GetRecorder().Eventf(resource,corev1.EventTypeWarning,"GetSpecHash","Failed to get spec hash for %s %s: %s",resourceType,resourceNamespace,err)
return
}
// On create, set generation to 1 (new resources start at generation 1)
updateAnnotations(resource,hash,1)
r.GetRecorder().Eventf(parentResource,corev1.EventTypeNormal,fmt.Sprintf("Create%s",resourceType),"Creating a new %s %s",resourceType,resourceNamespace)
r.GetRecorder().Eventf(resource,corev1.EventTypeNormal,fmt.Sprintf("Create%s",resourceType),"Creating a new %s %s",resourceType,resourceNamespace)
err=r.Create(ctx,resource)
iferr!=nil{
logs.Error(err,"Failed to create Resource.")
r.GetRecorder().Eventf(parentResource,corev1.EventTypeWarning,fmt.Sprintf("Create%s",resourceType),"Failed to create %s %s: %s",resourceType,resourceNamespace,err)
r.GetRecorder().Eventf(resource,corev1.EventTypeWarning,fmt.Sprintf("Create%s",resourceType),"Failed to create %s %s: %s",resourceType,resourceNamespace,err)
r.GetRecorder().Eventf(parentResource,corev1.EventTypeWarning,fmt.Sprintf("CalculatePatch%s",resourceType),"Failed to calculate patch for %s %s: %s",resourceType,resourceNamespace,err)
r.GetRecorder().Eventf(resource,corev1.EventTypeWarning,fmt.Sprintf("CalculatePatch%s",resourceType),"Failed to calculate patch for %s %s: %s",resourceType,resourceNamespace,err)
returnfalse,resource,fmt.Errorf("failed to check if spec has changed: %w",err)
}
if!changeResult.NeedsUpdate{
logs.Info(fmt.Sprintf("%s spec is the same. Skipping update.",resourceType))
logs.Error(err,fmt.Sprintf("Failed to copy spec for %s.",resourceType))
r.GetRecorder().Eventf(parentResource,corev1.EventTypeWarning,fmt.Sprintf("CopySpec%s",resourceType),"Failed to copy spec for %s %s: %s",resourceType,resourceNamespace,err)
r.GetRecorder().Eventf(oldResource,corev1.EventTypeWarning,fmt.Sprintf("CopySpec%s",resourceType),"Failed to copy spec for %s %s: %s",resourceType,resourceNamespace,err)
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
| `dynamoVersion` _string_ | DynamoVersion is the Dynamo platform version (optional)<br/>If not specified, version is not included in identity hash<br/>This ensures checkpoint compatibility across Dynamo releases | | Optional: \{\}<br/> |
| `tensorParallelSize` _integer_ | TensorParallelSize is the tensor parallel configuration | 1 | Minimum: 1 <br/>Optional: \{\}<br/> |
| `pipelineParallelSize` _integer_ | PipelineParallelSize is the pipeline parallel configuration | 1 | Minimum: 1 <br/>Optional: \{\}<br/> |
| `dtype` _string_ | Dtype is the data type (fp16, bf16, fp8, etc.) | | Optional: \{\}<br/> |
| `maxModelLen` _integer_ | MaxModelLen is the maximum sequence length | | Minimum: 1 <br/>Optional: \{\}<br/> |
| `extraParameters` _object (keys:string, values:string)_ | ExtraParameters are additional parameters that affect the checkpoint hash<br/>Use for any framework-specific or custom parameters not covered above | | Optional: \{\}<br/> |
#### DynamoCheckpointJobConfig
DynamoCheckpointJobConfig defines the configuration for the checkpoint creation Job
_Appears in:_
- [DynamoCheckpointSpec](#dynamocheckpointspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `podTemplateSpec` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | PodTemplateSpec allows customizing the checkpoint Job pod<br/>This should include the container that runs the workload to be checkpointed | | Required: \{\}<br/> |
| `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Optional: \{\}<br/> |
| `backoffLimit` _integer_ | BackoffLimit specifies the number of retries before marking the Job failed | 3 | Optional: \{\}<br/> |
| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Optional: \{\}<br/> |
#### DynamoCheckpointPhase
_Underlying type:_ _string_
DynamoCheckpointPhase represents the current phase of the checkpoint lifecycle
| `Pending` | DynamoCheckpointPhasePending indicates the checkpoint CR has been created but the Job has not started<br/> |
| `Creating` | DynamoCheckpointPhaseCreating indicates the checkpoint Job is running<br/> |
| `Ready` | DynamoCheckpointPhaseReady indicates the checkpoint tar file is available on the PVC<br/> |
| `Failed` | DynamoCheckpointPhaseFailed indicates the checkpoint creation failed<br/> |
#### DynamoCheckpointSpec
DynamoCheckpointSpec defines the desired state of DynamoCheckpoint
_Appears in:_
- [DynamoCheckpoint](#dynamocheckpoint)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the inputs that determine checkpoint equivalence | | Required: \{\}<br/> |
| `job` _[DynamoCheckpointJobConfig](#dynamocheckpointjobconfig)_ | Job defines the configuration for the checkpoint creation Job | | Required: \{\}<br/> |
#### DynamoCheckpointStatus
DynamoCheckpointStatus defines the observed state of DynamoCheckpoint
_Appears in:_
- [DynamoCheckpoint](#dynamocheckpoint)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `phase` _[DynamoCheckpointPhase](#dynamocheckpointphase)_ | Phase represents the current phase of the checkpoint lifecycle | | Enum: [Pending Creating Ready Failed] <br/>Optional: \{\}<br/> |
| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity<br/>This hash is used to identify equivalent checkpoints | | Optional: \{\}<br/> |
| `location` _string_ | Location is the full URI/path to the checkpoint in the storage backend<br/>For PVC: same as TarPath (e.g., /checkpoints/\{hash\}.tar)<br/>For S3: s3://bucket/prefix/\{hash\}.tar<br/>For OCI: oci://registry/repo:\{hash\} | | Optional: \{\}<br/> |
| `storageType` _[DynamoCheckpointStorageType](#dynamocheckpointstoragetype)_ | StorageType indicates the storage backend type used for this checkpoint | | Enum: [pvc s3 oci] <br/>Optional: \{\}<br/> |
| `jobName` _string_ | JobName is the name of the checkpoint creation Job | | Optional: \{\}<br/> |
| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint tar was created | | Optional: \{\}<br/> |
| `message` _string_ | Message provides additional information about the current state | | Optional: \{\}<br/> |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represent the latest available observations of the checkpoint's state | | Optional: \{\}<br/> |
#### DynamoCheckpointStorageType
_Underlying type:_ _string_
DynamoCheckpointStorageType defines the supported storage backends for checkpoints
| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | |
| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br/>When enabled, replicas are managed via DGDSA and external autoscalers can scale<br/>the service using the Scale subresource. When disabled, replicas can be modified directly. | | Optional: \{\}<br/> |
| `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br/>Only applicable when ComponentType is "epp". | | Optional: \{\}<br/> |
| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br/>When enabled, pods can be restored from checkpoint files for faster cold start. | | Optional: \{\}<br/> |
#### DynamoComponentDeploymentSpec
...
...
@@ -242,6 +402,7 @@ _Appears in:_
| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | |
| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br/>When enabled, replicas are managed via DGDSA and external autoscalers can scale<br/>the service using the Scale subresource. When disabled, replicas can be modified directly. | | Optional: \{\}<br/> |
| `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br/>Only applicable when ComponentType is "epp". | | Optional: \{\}<br/> |
| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br/>When enabled, pods can be restored from checkpoint files for faster cold start. | | Optional: \{\}<br/> |
#### DynamoGraphDeployment
...
...
@@ -456,6 +617,7 @@ _Appears in:_
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the graph deployment.<br/>The slice is merged by type on patch updates. | | |
| `services` _object (keys:string, values:[ServiceReplicaStatus](#servicereplicastatus))_ | Services contains per-service replica status information.<br/>The map key is the service name from spec.services. | | Optional: \{\}<br/> |
| `restart` _[RestartStatus](#restartstatus)_ | Restart contains the status of the restart of the graph deployment. | | Optional: \{\}<br/> |
| `checkpoints` _object (keys:string, values:[ServiceCheckpointStatus](#servicecheckpointstatus))_ | Checkpoints contains per-service checkpoint status information.<br/>The map key is the service name from spec.services. | | Optional: \{\}<br/> |
#### DynamoModel
...
...
@@ -872,6 +1034,44 @@ _Appears in:_
| `enabled` _boolean_ | Enabled indicates whether the ScalingAdapter should be enabled for this service.<br/>When true, a DGDSA is created and owns the replicas field.<br/>When false (default), no DGDSA is created and replicas can be modified directly in the DGD. | false | Optional: \{\}<br/> |
#### ServiceCheckpointConfig
ServiceCheckpointConfig configures checkpointing for a DGD service
| `enabled` _boolean_ | Enabled indicates whether checkpointing is enabled for this service | false | Optional: \{\}<br/> |
| `mode` _[CheckpointMode](#checkpointmode)_ | Mode defines how checkpoint creation is handled<br/>- Auto: DGD controller creates Checkpoint CR automatically<br/>- Manual: User must create Checkpoint CR | Auto | Enum: [Auto Manual] <br/>Optional: \{\}<br/> |
| `checkpointRef` _string_ | CheckpointRef references an existing Checkpoint CR to use<br/>If specified, Identity is ignored and this checkpoint is used directly | | Optional: \{\}<br/> |
| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the checkpoint identity for hash computation<br/>Used when Mode is Auto or when looking up existing checkpoints<br/>Required when checkpointRef is not specified | | Optional: \{\}<br/> |
#### ServiceCheckpointStatus
ServiceCheckpointStatus contains checkpoint information for a single service.
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. See [Limitations](#limitations) for details.
**ChReK** (Checkpoint/Restore in Kubernetes) is an experimental infrastructure for fast-starting GPU applications using CRIU (Checkpoint/Restore in User-space). ChReK dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on-demand.
## What is ChReK?
ChReK provides:
- **Fast cold starts**: Restore GPU-accelerated applications in seconds instead of minutes
- **CUDA state preservation**: Checkpoint and restore GPU memory and CUDA contexts
- **Kubernetes-native**: Integrates seamlessly with Kubernetes primitives
- **Storage flexibility**: PVC-based storage (S3/OCI planned for future releases)
- **Namespace isolation**: Each namespace gets its own checkpoint infrastructure
## Use Cases
### 1. With NVIDIA Dynamo Platform (Recommended)
Use ChReK as part of the Dynamo platform for automatic checkpoint management:
- Automatic checkpoint creation and lifecycle management
- Seamless integration with DynamoGraphDeployment CRDs
- Built-in autoscaling with fast restore
📖 **[Read the Dynamo Integration Guide →](dynamo.md)**
### 2. Standalone (Without Dynamo)
Use ChReK independently in your own Kubernetes applications:
- Manual checkpoint job creation
- Build your own restore-enabled container images
- Full control over checkpoint lifecycle
📖 **[Read the Standalone Usage Guide →](standalone.md)**
## Architecture
ChReK consists of two main components:
### 1. ChReK Helm Chart
Deploys the checkpoint/restore infrastructure:
- **DaemonSet**: Runs on GPU nodes to perform CRIU checkpoint operations
- **PVC**: Stores checkpoint data (rootfs diffs, CUDA memory state)
- **RBAC**: Namespace-scoped or cluster-wide permissions
- **Seccomp Profile**: Security policies for CRIU syscalls
### 2. Smart Entrypoint
A wrapper script that intelligently decides between:
- **Cold start**: Normal application startup (when no checkpoint exists)
- **Restore**: CRIU restore from checkpoint (when checkpoint available)
## Quick Start
### Install ChReK Infrastructure
```bash
helm install chrek nvidia/chrek \
  --namespace my-team \
  --create-namespace \
  --set storage.pvc.size=100Gi
```
### Choose Your Integration Path
- **Using Dynamo Platform?** → Follow the [Dynamo Integration Guide](dynamo.md)
- **Using standalone?** → Follow the [Standalone Usage Guide](standalone.md)
## Key Features
### ✅ Currently Supported
- ✅ **vLLM backend only** (SGLang and TensorRT-LLM planned)
- ✅ Single-node, single-GPU checkpoints
- ✅ PVC storage backend (RWX for multi-node)
- ✅ CUDA checkpoint/restore
- ✅ PyTorch distributed state (with `GLOO_SOCKET_IFNAME=lo`)
⚠️ **Important**: ChReK has significant limitations that may impact production readiness:
### Security Considerations
- **🔴 Privileged mode required**: Restore pods **must run in privileged mode** for CRIU to function. This grants containers elevated host access and may violate security policies in many production environments.
- **Security Impact**: Privileged containers can:
  - Access all host devices
  - Bypass most security restrictions
  - Potentially compromise node security if the container is exploited
### Technical Limitations
- **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
- **Single-node only**: Checkpoints must be created and restored on the same node
- **Single-GPU only**: Multi-GPU configurations not yet supported
- **Network state limitations**: Active TCP connections are closed during restore (use `tcp-close` CRIU option)
- **Storage**: Only PVC storage is currently implemented (S3/OCI planned)
### Recommendation
ChReK is best suited for:
- ✅ Development and testing environments
- ✅ Research and experimentation
- ✅ Controlled production environments with appropriate security controls
- ❌ Security-sensitive production workloads without proper risk assessment
## Documentation
### Getting Started
- [Dynamo Integration Guide](dynamo.md) - Using ChReK with Dynamo Platform
- [Standalone Usage Guide](standalone.md) - Using ChReK independently
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Checkpoint/Restore for Fast Pod Startup
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations. See [Limitations](#limitations) for details.
Reduce cold start times for LLM inference workers from ~3 minutes to ~30 seconds using container checkpointing.
## Overview
Checkpointing captures the complete state of a running worker pod (including GPU memory) and saves it to storage. New pods can restore from this checkpoint instead of performing a full cold start.
| Startup Type | Time | What Happens |
|--------------|------|--------------|
| **Cold Start** | ~3 min | Download model, load to GPU, initialize engine |
| **Warm Start** (checkpoint) | ~30 sec | Restore from checkpoint tar |
## Prerequisites
- Dynamo Platform installed (v0.4.0+)
- ChReK Helm chart installed (separate from platform)
- GPU nodes with CRIU support
- RWX PVC storage (PVC is currently the only supported backend)
## Quick Start
### 1. Install ChReK Infrastructure
First, install the ChReK Helm chart in each namespace where you need checkpointing:
```bash
# Install ChReK infrastructure
helm install chrek nvidia/chrek \
  --namespace my-team \
  --create-namespace \
  --set storage.pvc.size=100Gi
```
This creates:
- A PVC for checkpoint storage (`chrek-pvc`)
- A DaemonSet for CRIU operations (`chrek-agent`)
### 2. Configure Operator Values
Update your Helm values to point to the ChReK infrastructure:
```yaml
# values.yaml
dynamo-operator:
  checkpoint:
    enabled: true
    storage:
      type: pvc  # Only PVC is currently supported (S3/OCI planned)
      pvc:
        pvcName: "chrek-pvc"  # Must match ChReK chart
        basePath: "/checkpoints"
      signalHostPath: "/var/lib/chrek/signals"  # Must match ChReK chart
**Not included in hash** (don't invalidate checkpoint):
- `replicas`
- `nodeSelector`, `affinity`, `tolerations`
- `resources` (requests/limits)
- Logging/observability config
**Example with all fields:**
```yaml
checkpoint:
  enabled: true
  mode: auto
  identity:
    model: "meta-llama/Llama-3-8B"
    backendFramework: "vllm"
    dynamoVersion: "0.9.0"
    tensorParallelSize: 1
    pipelineParallelSize: 1
    dtype: "bfloat16"
    maxModelLen: 8192
    extraParameters:
      enableChunkedPrefill: "true"
      quantization: "awq"
```
**Checkpoint Naming:** The `DynamoCheckpoint` CR is automatically named using the 16-character identity hash (e.g., `e5962d34ba272638`).
**Checkpoint Sharing:** Multiple DGDs with the same identity automatically share the same checkpoint.
## DynamoCheckpoint CRD
The `DynamoCheckpoint` (shortname: `dckpt`) is a Kubernetes Custom Resource that manages checkpoint lifecycle.
**When to create a DynamoCheckpoint directly:**
- **Pre-warming:** Create checkpoints before deploying DGDs for instant startup
- **Explicit control:** Manage checkpoint lifecycle independently from DGDs
**Note:** With the new hash-based naming, checkpoint names are automatically generated (16-character hash). The operator handles checkpoint discovery and reuse automatically in `auto` mode.
**Create a checkpoint:**
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoCheckpoint
metadata:
  name: e5962d34ba272638  # Use the computed 16-char hash
spec:
  identity:
    model: "meta-llama/Llama-3-8B"
    backendFramework: "vllm"
    tensorParallelSize: 1
    dtype: "bfloat16"
  job:
    activeDeadlineSeconds: 3600
    podTemplateSpec:
      spec:
        containers:
          - name: main
            image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
            command: ["python3", "-m", "dynamo.vllm"]
            args: ["--model", "meta-llama/Llama-3-8B"]
            resources:
              limits:
                nvidia.com/gpu: "1"
            env:
              - name: HF_TOKEN
                valueFrom:
                  secretKeyRef:
                    name: hf-token-secret
                    key: HF_TOKEN
```
**Note:** You can compute the hash yourself, or use `auto` mode to let the operator create it.
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. Review the [security implications](#security-considerations) before deploying.
This guide explains how to use **ChReK** (Checkpoint/Restore in Kubernetes) as a standalone component without deploying the full Dynamo platform. This is useful if you want to add checkpoint/restore capabilities to your own GPU workloads.
ChReK provides a convenient `placeholder` target in its Dockerfile that automatically injects checkpoint/restore capabilities into your existing container images.
### Quick Start: Using the Placeholder Target (Recommended)
```bash
cd deploy/chrek
# Define your images
export BASE_IMAGE="your-app:latest"  # Your existing application image