Unverified Commit f3aa1e01 authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

feat: introducing ChReK (Checkpoint Restore in K8s) (#4978)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent 44986bf5
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Example DynamoCheckpoint resource: declares a checkpoint identity and the
# Job used to create the checkpoint.
apiVersion: nvidia.com/v1alpha1
kind: DynamoCheckpoint
metadata:
name: vllm-llama3-8b-tp1
spec:
# Identity - determines the checkpoint hash
# The model, tensor parallel size, dtype and max model length below repeat the
# values passed as container args further down; keep the two in sync.
identity:
model: "meta-llama/Meta-Llama-3-8B-Instruct"
backendFramework: "vllm"
dynamoVersion: "0.6.0"
tensorParallelSize: 1
pipelineParallelSize: 1
dtype: "bfloat16"
maxModelLen: 8192
extraParameters:
# NOTE(review): no corresponding flag appears in the container args below - confirm this is intended.
enableChunkedPrefill: "true"
# Job configuration for checkpoint creation
job:
activeDeadlineSeconds: 3600
backoffLimit: 3
ttlSecondsAfterFinished: 300
podTemplateSpec:
spec:
containers:
- name: checkpoint-worker
# NOTE(review): ":latest" is a floating tag while identity.dynamoVersion is pinned to "0.6.0" - consider pinning the image to the matching version.
image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
command: ["python", "-m", "vllm.entrypoints.openai.api_server"]
args:
- "--model"
- "meta-llama/Meta-Llama-3-8B-Instruct"
- "--tensor-parallel-size"
- "1"
- "--dtype"
- "bfloat16"
- "--max-model-len"
- "8192"
env:
# HF_TOKEN is read from key "token" of the Secret named "hf-secret".
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-secret
key: token
resources:
limits:
nvidia.com/gpu: 1
restartPolicy: Never
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package checkpoint
import (
"context"
"fmt"
"path/filepath"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
controller_common "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
)
// getCheckpointInfoFromCheckpoint builds a CheckpointInfo that mirrors a
// DynamoCheckpoint CR. Hash, location and storage type are copied from the
// CR status; Ready is true only when the CR phase is Ready. The returned
// Identity points into ckpt.Spec, so it shares memory with the CR object.
func getCheckpointInfoFromCheckpoint(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) *CheckpointInfo {
	status := &ckpt.Status
	isReady := status.Phase == nvidiacomv1alpha1.DynamoCheckpointPhaseReady

	return &CheckpointInfo{
		Enabled:        true,
		Identity:       &ckpt.Spec.Identity,
		Hash:           status.IdentityHash,
		Location:       status.Location,
		StorageType:    status.StorageType,
		CheckpointName: ckpt.Name,
		Ready:          isReady,
	}
}
// DefaultCheckpointPVCName is the default PVC name for checkpoint storage.
// InjectCheckpointIntoPodSpec falls back to it when the storage configuration
// does not set a PVC name.
const DefaultCheckpointPVCName = "checkpoint-storage"
// getPVCBasePath resolves the directory under which PVC-backed checkpoints
// live. It returns the configured PVC base path when one is set and falls
// back to consts.CheckpointBasePath for a nil config or an empty value.
// Only meaningful for the PVC storage type.
func getPVCBasePath(storageConfig *controller_common.CheckpointStorageConfig) string {
	if storageConfig == nil {
		return consts.CheckpointBasePath
	}
	if configured := storageConfig.PVC.BasePath; configured != "" {
		return configured
	}
	return consts.CheckpointBasePath
}
// GetPVCBasePath returns the PVC base path from the controller's checkpoint
// configuration. When the configuration is nil or checkpointing is disabled,
// the default consts.CheckpointBasePath is returned. Shared by the
// CheckpointReconciler and the DynamoGraphDeploymentReconciler; only
// applicable for the PVC storage type.
func GetPVCBasePath(config *controller_common.CheckpointConfig) string {
	if config == nil || !config.Enabled {
		return consts.CheckpointBasePath
	}
	return getPVCBasePath(&config.Storage)
}
// storageTypeToAPI converts a controller_common storage type string into the
// API enum type. It is a plain type conversion with no validation; it relies
// on the controller constants and the API enum sharing the same string values.
func storageTypeToAPI(storageType string) nvidiacomv1alpha1.DynamoCheckpointStorageType {
	apiType := nvidiacomv1alpha1.DynamoCheckpointStorageType(storageType)
	return apiType
}
// CheckpointInfo contains resolved checkpoint information for a DGD service.
// It is produced by ResolveCheckpointForService and consumed (and partially
// overwritten) by InjectCheckpointIntoPodSpec.
type CheckpointInfo struct {
	// Enabled indicates if checkpointing is enabled. When false, the other
	// fields are left at their zero values by ResolveCheckpointForService.
	Enabled bool
	// Identity is the resolved checkpoint identity (model, framework, etc.)
	Identity *nvidiacomv1alpha1.DynamoCheckpointIdentity
	// Hash is the computed identity hash
	Hash string
	// Location is the full URI/path in the storage backend.
	// InjectCheckpointIntoPodSpec recomputes this from the storage config and hash.
	Location string
	// StorageType is the storage backend type (pvc, s3, oci)
	StorageType nvidiacomv1alpha1.DynamoCheckpointStorageType
	// CheckpointName is the name of the Checkpoint CR; empty when no CR was found
	CheckpointName string
	// Ready indicates if the checkpoint is ready for use (CR phase is Ready)
	Ready bool
}
// ResolveCheckpointForService resolves checkpoint information for a DGD service.
//
// Resolution order:
//  1. A nil or disabled config yields CheckpointInfo{Enabled: false}.
//  2. A non-empty checkpointRef is fetched by name from namespace and the
//     result mirrors that CR (including its identity).
//  3. Otherwise the identity is hashed and DynamoCheckpoints in namespace are
//     listed by the checkpoint-hash label. The first match is used, with Hash
//     and Identity taken from the config rather than from the CR.
//  4. With no match, the returned info carries only Enabled, Identity and
//     Hash, and the caller is expected to create the checkpoint (Auto mode).
//
// An error is returned when the referenced CR cannot be fetched, when neither
// checkpointRef nor identity is provided, when hashing fails, or when the
// list call fails.
func ResolveCheckpointForService(
	ctx context.Context,
	c client.Client,
	namespace string,
	config *nvidiacomv1alpha1.ServiceCheckpointConfig,
) (*CheckpointInfo, error) {
	if config == nil || !config.Enabled {
		return &CheckpointInfo{Enabled: false}, nil
	}

	// Direct reference: the CR is the single source of truth.
	if ref := config.CheckpointRef; ref != nil && *ref != "" {
		referenced := &nvidiacomv1alpha1.DynamoCheckpoint{}
		key := types.NamespacedName{Namespace: namespace, Name: *ref}
		if err := c.Get(ctx, key, referenced); err != nil {
			return nil, fmt.Errorf("failed to get referenced checkpoint %s: %w", *ref, err)
		}
		return getCheckpointInfoFromCheckpoint(referenced), nil
	}

	// Identity-based lookup.
	if config.Identity == nil {
		return nil, fmt.Errorf("checkpoint enabled but no checkpointRef or identity provided")
	}
	hash, err := ComputeIdentityHash(*config.Identity)
	if err != nil {
		return nil, fmt.Errorf("failed to compute identity hash: %w", err)
	}

	var matches nvidiacomv1alpha1.DynamoCheckpointList
	listOpts := []client.ListOption{
		client.InNamespace(namespace),
		client.MatchingLabels{consts.KubeLabelCheckpointHash: hash},
	}
	if err := c.List(ctx, &matches, listOpts...); err != nil {
		return nil, fmt.Errorf("failed to list checkpoints: %w", err)
	}

	if len(matches.Items) == 0 {
		// Nothing exists yet for this hash; report what was computed.
		return &CheckpointInfo{
			Enabled:  true,
			Identity: config.Identity,
			Hash:     hash,
		}, nil
	}

	// At most one checkpoint per hash is expected, so take the first match and
	// keep the hash/identity computed from the service config.
	resolved := getCheckpointInfoFromCheckpoint(&matches.Items[0])
	resolved.Hash = hash
	resolved.Identity = config.Identity
	return resolved, nil
}
// InjectCheckpointEnvVars adds checkpoint-related environment variables to a container.
//
// It injects, in this order:
//   - DYN_CHECKPOINT_STORAGE_TYPE: info.StorageType, defaulting to PVC when empty
//   - DYN_CHECKPOINT_LOCATION: info.Location (only when non-empty)
//   - DYN_CHECKPOINT_PATH: parent directory of info.Location (PVC storage only)
//   - DYN_CHECKPOINT_HASH: info.Hash (only when non-empty)
//   - the CRIU restore settings (marker file, work dir, log dir, CUDA plugin
//     dir, timeout); the timeout comes from config.CRIUTimeout when set,
//     otherwise consts.DefaultCRIUTimeout
//
// The variables are prepended to container.Env. A variable whose name is
// already present in container.Env is skipped: the existing entry comes later
// in the list and would take precedence anyway, so skipping it avoids
// duplicate names and makes repeated calls idempotent.
//
// The call is a no-op when container or info is nil, or when info.Enabled is false.
func InjectCheckpointEnvVars(container *corev1.Container, info *CheckpointInfo, config *controller_common.CheckpointConfig) {
	if container == nil || info == nil || !info.Enabled {
		return
	}

	// Determine storage type (default to PVC if not set)
	storageType := info.StorageType
	if storageType == "" {
		storageType = nvidiacomv1alpha1.DynamoCheckpointStorageType(controller_common.CheckpointStorageTypePVC)
	}

	criuTimeout := consts.DefaultCRIUTimeout
	if config != nil && config.CRIUTimeout != "" {
		criuTimeout = config.CRIUTimeout
	}

	// At most 9 variables are produced below.
	envVars := make([]corev1.EnvVar, 0, 9)
	envVars = append(envVars, corev1.EnvVar{
		Name:  consts.EnvCheckpointStorageType,
		Value: string(storageType),
	})

	if info.Location != "" {
		// Location is the source (where to fetch from)
		envVars = append(envVars, corev1.EnvVar{
			Name:  consts.EnvCheckpointLocation,
			Value: info.Location,
		})
		// For PVC storage, also expose the base directory that contains the
		// checkpoint; it is used by the k8s-runc-bypass restore entrypoint.
		if string(storageType) == controller_common.CheckpointStorageTypePVC {
			envVars = append(envVars, corev1.EnvVar{
				Name:  consts.EnvCheckpointPath,
				Value: filepath.Dir(info.Location),
			})
		}
	}

	// Include hash for debugging/observability and for k8s-runc-bypass
	if info.Hash != "" {
		envVars = append(envVars, corev1.EnvVar{
			Name:  consts.EnvCheckpointHash,
			Value: info.Hash,
		})
	}

	// CRIU-related env vars for restore operations
	envVars = append(envVars,
		corev1.EnvVar{Name: consts.EnvRestoreMarkerFile, Value: consts.RestoreMarkerFilePath},
		corev1.EnvVar{Name: consts.EnvCRIUWorkDir, Value: consts.CRIUWorkDirPath},
		corev1.EnvVar{Name: consts.EnvCRIULogDir, Value: consts.CRIULogDirPath},
		corev1.EnvVar{Name: consts.EnvCUDAPluginDir, Value: consts.CUDAPluginDirPath},
		corev1.EnvVar{Name: consts.EnvCRIUTimeout, Value: criuTimeout},
	)

	// Skip names the container already defines so the list never carries
	// duplicate entries.
	existing := make(map[string]struct{}, len(container.Env))
	for i := range container.Env {
		existing[container.Env[i].Name] = struct{}{}
	}
	merged := make([]corev1.EnvVar, 0, len(envVars)+len(container.Env))
	for _, ev := range envVars {
		if _, defined := existing[ev.Name]; !defined {
			merged = append(merged, ev)
		}
	}

	// Prepend checkpoint env vars so they precede the container's own variables
	container.Env = append(merged, container.Env...)
}
// InjectCheckpointVolume adds the checkpoint PVC volume to a pod spec.
// The volume is named consts.CheckpointVolumeName and references the claim
// pvcName. If a volume with that name already exists, the pod spec is left
// untouched.
func InjectCheckpointVolume(podSpec *corev1.PodSpec, pvcName string) {
	for i := range podSpec.Volumes {
		if podSpec.Volumes[i].Name == consts.CheckpointVolumeName {
			return // already injected
		}
	}

	claim := &corev1.PersistentVolumeClaimVolumeSource{
		ClaimName: pvcName,
		ReadOnly:  false, // CRIU needs write access during restore
	}
	checkpointVolume := corev1.Volume{
		Name:         consts.CheckpointVolumeName,
		VolumeSource: corev1.VolumeSource{PersistentVolumeClaim: claim},
	}
	podSpec.Volumes = append(podSpec.Volumes, checkpointVolume)
}
// InjectCheckpointVolumeMount mounts the checkpoint volume into a container
// at basePath, or at consts.CheckpointBasePath when basePath is empty.
// The container is left untouched if it already mounts that volume.
func InjectCheckpointVolumeMount(container *corev1.Container, basePath string) {
	for i := range container.VolumeMounts {
		if container.VolumeMounts[i].Name == consts.CheckpointVolumeName {
			return // already mounted
		}
	}

	mountPath := basePath
	if mountPath == "" {
		mountPath = consts.CheckpointBasePath
	}

	checkpointMount := corev1.VolumeMount{
		Name:      consts.CheckpointVolumeName,
		MountPath: mountPath,
		ReadOnly:  false, // CRIU needs write access for restore.log and restore-criu.conf
	}
	container.VolumeMounts = append(container.VolumeMounts, checkpointMount)
}
// InjectCheckpointSignalVolume adds the checkpoint signal hostPath volume to
// a pod spec. The host path comes from checkpointConfig.Storage.SignalHostPath
// when set, otherwise consts.CheckpointSignalHostPath, and uses the
// DirectoryOrCreate host path type. The volume is needed for CRIU mount
// namespace consistency between checkpoint and restore pods. The pod spec is
// left untouched if the volume already exists.
func InjectCheckpointSignalVolume(podSpec *corev1.PodSpec, checkpointConfig *controller_common.CheckpointConfig) {
	for i := range podSpec.Volumes {
		if podSpec.Volumes[i].Name == consts.CheckpointSignalVolumeName {
			return // already injected
		}
	}

	hostDir := consts.CheckpointSignalHostPath
	if checkpointConfig != nil {
		if override := checkpointConfig.Storage.SignalHostPath; override != "" {
			hostDir = override
		}
	}

	dirOrCreate := corev1.HostPathDirectoryOrCreate
	signalVolume := corev1.Volume{
		Name: consts.CheckpointSignalVolumeName,
		VolumeSource: corev1.VolumeSource{
			HostPath: &corev1.HostPathVolumeSource{
				Path: hostDir,
				Type: &dirOrCreate,
			},
		},
	}
	podSpec.Volumes = append(podSpec.Volumes, signalVolume)
}
// InjectCheckpointSignalVolumeMount mounts the checkpoint signal volume into
// a container at consts.CheckpointSignalMountPath (read-write). The mount is
// needed for CRIU mount namespace consistency between checkpoint and restore
// pods. The container is left untouched if it already mounts that volume.
func InjectCheckpointSignalVolumeMount(container *corev1.Container) {
	for i := range container.VolumeMounts {
		if container.VolumeMounts[i].Name == consts.CheckpointSignalVolumeName {
			return // already mounted
		}
	}

	signalMount := corev1.VolumeMount{
		Name:      consts.CheckpointSignalVolumeName,
		MountPath: consts.CheckpointSignalMountPath,
		ReadOnly:  false,
	}
	container.VolumeMounts = append(container.VolumeMounts, signalMount)
}
// InjectPodInfoVolume adds a Downward API volume exposing pod identity and
// DGD info as files.
//
// This matters for CRIU checkpoint/restore: environment variables in a
// restored process hold the values of the checkpoint source pod, whereas the
// Downward API files reflect the pod that is actually running.
//
// Files exposed: pod_name, pod_uid, pod_namespace (from pod metadata fields)
// and one file per DGD annotation (namespace, component, parent DGD name,
// parent DGD namespace, discovery backend). The pod spec is left untouched if
// the volume already exists.
func InjectPodInfoVolume(podSpec *corev1.PodSpec) {
	for i := range podSpec.Volumes {
		if podSpec.Volumes[i].Name == consts.PodInfoVolumeName {
			return // already injected
		}
	}

	// fieldFile projects a pod field into a file of the volume.
	fieldFile := func(file, fieldPath string) corev1.DownwardAPIVolumeFile {
		return corev1.DownwardAPIVolumeFile{
			Path:     file,
			FieldRef: &corev1.ObjectFieldSelector{FieldPath: fieldPath},
		}
	}
	// annotationFile projects a single pod annotation into a file of the volume.
	annotationFile := func(file, annotation string) corev1.DownwardAPIVolumeFile {
		return fieldFile(file, "metadata.annotations['"+annotation+"']")
	}

	items := []corev1.DownwardAPIVolumeFile{
		// Pod identity fields
		fieldFile("pod_name", consts.PodInfoFieldPodName),
		fieldFile("pod_uid", consts.PodInfoFieldPodUID),
		fieldFile("pod_namespace", consts.PodInfoFieldPodNamespace),
		// DGD info from annotations (for CRIU restore)
		annotationFile(consts.PodInfoFileDynNamespace, consts.AnnotationDynNamespace),
		annotationFile(consts.PodInfoFileDynComponent, consts.AnnotationDynComponent),
		annotationFile(consts.PodInfoFileDynParentDGDName, consts.AnnotationDynParentDGDName),
		annotationFile(consts.PodInfoFileDynParentDGDNS, consts.AnnotationDynParentDGDNS),
		annotationFile(consts.PodInfoFileDynDiscoveryBackend, consts.AnnotationDynDiscoveryBackend),
	}

	podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{
		Name: consts.PodInfoVolumeName,
		VolumeSource: corev1.VolumeSource{
			DownwardAPI: &corev1.DownwardAPIVolumeSource{Items: items},
		},
	})
}
// InjectPodInfoVolumeMount mounts the Downward API pod-info volume into a
// container, read-only, at consts.PodInfoMountPath. The container is left
// untouched if it already mounts that volume.
func InjectPodInfoVolumeMount(container *corev1.Container) {
	for i := range container.VolumeMounts {
		if container.VolumeMounts[i].Name == consts.PodInfoVolumeName {
			return // already mounted
		}
	}

	podInfoMount := corev1.VolumeMount{
		Name:      consts.PodInfoVolumeName,
		MountPath: consts.PodInfoMountPath,
		ReadOnly:  true,
	}
	container.VolumeMounts = append(container.VolumeMounts, podInfoMount)
}
// InjectCheckpointIntoPodSpec injects checkpoint configuration into a pod spec.
// This is the single entry point for ALL checkpoint-related pod modifications:
//  1. Command/Args transformation - moves Command into Args so the image's
//     ENTRYPOINT receives the user's command as arguments
//  2. Security context - sets hostIPC, a Localhost seccomp profile, and
//     privileged mode on the main container for CRIU restore
//  3. Storage configuration - computes the checkpoint location for the
//     configured storage type and, for PVC storage, adds the volume and mount
//  4. Signal and pod-info volumes plus their mounts
//  5. Environment variables - injects storage type, location, path, hash and
//     CRIU settings
//
// The "main" container is the one named consts.MainContainerName, or the
// first container when none has that name.
//
// Takes CheckpointInfo (resolved by ResolveCheckpointForService) and the
// controller checkpoint config (may be nil, in which case PVC defaults are
// used). Note that checkpointInfo is modified in place: Hash is filled in when
// empty, and StorageType and Location are overwritten with the values derived
// from the storage configuration.
//
// Returns nil without touching anything when checkpointInfo is nil or
// disabled. Returns an error when podSpec is nil, when neither hash nor
// identity is available, when hashing fails, or when the pod has no containers.
func InjectCheckpointIntoPodSpec(
	podSpec *corev1.PodSpec,
	checkpointInfo *CheckpointInfo,
	checkpointConfig *controller_common.CheckpointConfig,
) error {
	if checkpointInfo == nil || !checkpointInfo.Enabled {
		return nil
	}
	if podSpec == nil {
		return fmt.Errorf("pod spec is nil, cannot inject checkpoint config")
	}

	// Use the checkpoint info as-is (already computed by ResolveCheckpointForService)
	// We only need to compute hash if it's not already set
	info := checkpointInfo
	if info.Hash == "" {
		// Identity is required to compute the hash
		if info.Identity == nil {
			return fmt.Errorf("checkpoint enabled but identity is nil and hash is not set")
		}
		hash, err := ComputeIdentityHash(*info.Identity)
		if err != nil {
			return fmt.Errorf("failed to compute identity hash: %w", err)
		}
		info.Hash = hash
	}

	// Find the main container first (needed for all modifications)
	var mainContainer *corev1.Container
	for i := range podSpec.Containers {
		if podSpec.Containers[i].Name == consts.MainContainerName {
			mainContainer = &podSpec.Containers[i]
			break
		}
	}
	// If no main container found by name, use the first container
	if mainContainer == nil && len(podSpec.Containers) > 0 {
		mainContainer = &podSpec.Containers[0]
	}
	if mainContainer == nil {
		return fmt.Errorf("no container found to inject checkpoint config")
	}

	// 1. Handle command/args for checkpoint-enabled images
	// When checkpoint is enabled, the image has a smart ENTRYPOINT (e.g., /smart-entrypoint.sh)
	// that detects checkpoints and decides between restore and cold start.
	// We need to pass the user's command as arguments to this ENTRYPOINT rather than
	// overriding it with Command.
	if len(mainContainer.Command) > 0 {
		// Build the combined list in a fresh slice. Appending directly onto
		// Command could write into spare capacity of its backing array, which
		// may be shared with another object holding the same slice.
		combinedArgs := make([]string, 0, len(mainContainer.Command)+len(mainContainer.Args))
		combinedArgs = append(combinedArgs, mainContainer.Command...)
		combinedArgs = append(combinedArgs, mainContainer.Args...)
		mainContainer.Args = combinedArgs
		mainContainer.Command = nil // Clear Command to use image's ENTRYPOINT
	}
	// If Command is empty but Args exists, keep Args as-is (they'll be passed to ENTRYPOINT)

	// 2. Apply pod-level security context for CRIU restore
	// hostIPC: Required for CRIU to access shared memory segments and IPC resources
	podSpec.HostIPC = true

	// Apply seccomp profile to match checkpoint environment.
	// The profile blocks io_uring syscalls, which is needed for CRIU compatibility.
	// Any seccomp profile already set on the pod is replaced.
	if podSpec.SecurityContext == nil {
		podSpec.SecurityContext = &corev1.PodSecurityContext{}
	}
	podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{
		Type:             corev1.SeccompProfileTypeLocalhost,
		LocalhostProfile: ptr.To("profiles/block-iouring.json"),
	}

	// Apply container-level security context for CRIU restore
	// Privileged mode is required for CRIU restore operations
	if mainContainer.SecurityContext == nil {
		mainContainer.SecurityContext = &corev1.SecurityContext{}
	}
	mainContainer.SecurityContext.Privileged = ptr.To(true)

	// 3. Determine storage type and compute location/path
	storageType := controller_common.CheckpointStorageTypePVC // default
	var storageConfig *controller_common.CheckpointStorageConfig
	if checkpointConfig != nil {
		storageConfig = &checkpointConfig.Storage
		if storageConfig.Type != "" {
			storageType = storageConfig.Type
		}
	}

	switch storageType {
	case controller_common.CheckpointStorageTypeS3:
		// S3 storage: location is s3:// URI
		// URI format: s3://[endpoint/]bucket/prefix
		info.StorageType = storageTypeToAPI(storageType)
		s3URI := "s3://checkpoint-storage/checkpoints" // default
		if storageConfig != nil && storageConfig.S3.URI != "" {
			s3URI = storageConfig.S3.URI
		}
		// Append hash to the URI
		info.Location = fmt.Sprintf("%s/%s.tar", s3URI, info.Hash)

	case controller_common.CheckpointStorageTypeOCI:
		// OCI storage: location is oci:// URI
		// URI format: oci://registry/repository
		info.StorageType = storageTypeToAPI(storageType)
		ociURI := "oci://localhost/checkpoints" // default
		if storageConfig != nil && storageConfig.OCI.URI != "" {
			ociURI = storageConfig.OCI.URI
		}
		// Append hash as tag
		info.Location = fmt.Sprintf("%s:%s", ociURI, info.Hash)

	default: // controller_common.CheckpointStorageTypePVC (and any unrecognized type)
		// PVC storage: location is the checkpoint directory
		// k8s-runc-bypass expects: /checkpoints/{hash}/ (directory with checkpoint data)
		info.StorageType = storageTypeToAPI(storageType)
		basePath := getPVCBasePath(storageConfig)
		pvcName := DefaultCheckpointPVCName
		if storageConfig != nil && storageConfig.PVC.PVCName != "" {
			pvcName = storageConfig.PVC.PVCName
		}
		info.Location = fmt.Sprintf("%s/%s", basePath, info.Hash)
		// Inject PVC volume and mount (only for PVC storage)
		InjectCheckpointVolume(podSpec, pvcName)
		InjectCheckpointVolumeMount(mainContainer, basePath)
	}

	// 4. Inject signal volume for CRIU mount namespace consistency
	// Even though restore pods don't use the signal file, they need it mounted
	// to match the checkpoint job's mount namespace for CRIU compatibility
	InjectCheckpointSignalVolume(podSpec, checkpointConfig)
	InjectCheckpointSignalVolumeMount(mainContainer)

	// Inject Downward API volume for pod identity after CRIU restore
	// CRIU preserves environment variables from checkpoint time, so pod identity
	// env vars (POD_NAME, POD_UID, POD_NAMESPACE) contain stale values.
	// The Dynamo runtime reads from /etc/podinfo/ files first to get correct identity.
	InjectPodInfoVolume(podSpec)
	InjectPodInfoVolumeMount(mainContainer)

	// 5. Inject checkpoint environment variables (for all storage types)
	InjectCheckpointEnvVars(mainContainer, info, checkpointConfig)
	return nil
}
// InjectCheckpointLabelsFromConfig adds the checkpoint hash label to labels
// based on the service checkpoint config.
//
// A nil or disabled config returns labels unchanged. Otherwise a nil labels
// map is replaced by a fresh one and, when the config carries an identity,
// consts.KubeLabelCheckpointHash is set to the identity hash. The (possibly
// newly allocated) map is returned; on a hashing error the map result is nil.
func InjectCheckpointLabelsFromConfig(labels map[string]string, config *nvidiacomv1alpha1.ServiceCheckpointConfig) (map[string]string, error) {
	if config == nil || !config.Enabled {
		return labels, nil
	}

	if labels == nil {
		labels = make(map[string]string)
	}

	// Without an identity there is nothing to hash.
	if config.Identity == nil {
		return labels, nil
	}

	hash, err := ComputeIdentityHash(*config.Identity)
	if err != nil {
		return nil, fmt.Errorf("failed to compute identity hash for labels: %w", err)
	}
	labels[consts.KubeLabelCheckpointHash] = hash
	return labels, nil
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package checkpoint
import (
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
)
// normalizedIdentity is the canonical form used for hash computation.
// Only fields that affect checkpoint equivalence are included.
//
// The JSON encoding of this struct is the hash input, so renaming a JSON key,
// reordering fields, or changing an omitempty tag changes every identity hash.
type normalizedIdentity struct {
	Model            string `json:"model"`
	BackendFramework string `json:"backendFramework"`
	DynamoVersion    string `json:"dynamoVersion,omitempty"`
	// TensorParallelSize and PipelineParallelSize are always emitted;
	// normalizeIdentity replaces a zero value with 1.
	TensorParallelSize   int32  `json:"tensorParallelSize"`
	PipelineParallelSize int32  `json:"pipelineParallelSize"`
	Dtype                string `json:"dtype,omitempty"`
	MaxModelLen          int32  `json:"maxModelLen,omitempty"`
	// ExtraParameters is omitted from the JSON when empty, so a nil map and an
	// empty map hash identically.
	ExtraParameters map[string]string `json:"extraParameters,omitempty"`
}
// ComputeIdentityHash computes a deterministic hash from a DynamoCheckpointIdentity.
//
// The identity is normalized (see normalizeIdentity), serialized to JSON
// (encoding/json sorts map keys, so ExtraParameters order does not matter),
// hashed with SHA-256, and the first 8 bytes of the digest are returned as
// 16 lowercase hex characters.
//
// 64 bits keep the hash short and readable while collisions stay unlikely:
// by the birthday bound, the collision probability reaches about 1% at
// roughly 6×10^8 distinct identities and about 50% at roughly 5×10^9.
//
// An error is returned only if JSON marshaling fails, which is not expected
// for this struct.
func ComputeIdentityHash(identity nvidiacomv1alpha1.DynamoCheckpointIdentity) (string, error) {
	payload, err := json.Marshal(normalizeIdentity(identity))
	if err != nil {
		// This should never happen with our controlled types, but bubble up error if it does
		return "", fmt.Errorf("failed to marshal identity for hashing: %w", err)
	}

	digest := sha256.Sum256(payload)

	// 8 digest bytes encode to exactly 16 hex characters (64 bits).
	const hashBytes = 8
	return hex.EncodeToString(digest[:hashBytes]), nil
}
// normalizeIdentity converts an API identity into its canonical hashing form.
// Fields are copied verbatim except that a zero TensorParallelSize or
// PipelineParallelSize is replaced by 1 and a nil ExtraParameters map is
// replaced by an empty one.
func normalizeIdentity(identity nvidiacomv1alpha1.DynamoCheckpointIdentity) normalizedIdentity {
	canonical := normalizedIdentity{
		Model:                identity.Model,
		BackendFramework:     identity.BackendFramework,
		DynamoVersion:        identity.DynamoVersion,
		TensorParallelSize:   identity.TensorParallelSize,
		PipelineParallelSize: identity.PipelineParallelSize,
		Dtype:                identity.Dtype,
		MaxModelLen:          identity.MaxModelLen,
		ExtraParameters:      identity.ExtraParameters,
	}

	// Unset parallelism means "no parallelism", i.e. a size of 1.
	if canonical.TensorParallelSize == 0 {
		canonical.TensorParallelSize = 1
	}
	if canonical.PipelineParallelSize == 0 {
		canonical.PipelineParallelSize = 1
	}

	// Keep the map non-nil so nil and empty inputs are represented alike.
	if canonical.ExtraParameters == nil {
		canonical.ExtraParameters = make(map[string]string)
	}

	return canonical
}
package checkpoint
import (
"testing"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestComputeIdentityHash is a table-driven test for ComputeIdentityHash.
// Every case checks the hash format (16 lowercase hex characters). Cases may
// additionally pin a known hash (expectedHash) or compare against the hash of
// a second identity (otherIdentity + shouldMatch).
func TestComputeIdentityHash(t *testing.T) {
	tests := []struct {
		name          string
		identity      nvidiacomv1alpha1.DynamoCheckpointIdentity
		expectError   bool
		expectedHash  string // Only set for deterministic checks
		otherIdentity *nvidiacomv1alpha1.DynamoCheckpointIdentity
		shouldMatch   bool
	}{
		{
			name: "basic identity produces deterministic hash",
			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
			},
			expectError:  false,
			expectedHash: "96429b2725761a09", // Known hash for this specific identity
		},
		{
			name: "identity with all fields produces deterministic hash",
			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:                "meta-llama/Llama-2-13b-hf",
				BackendFramework:     "sglang",
				DynamoVersion:        "0.4.2",
				TensorParallelSize:   2,
				PipelineParallelSize: 1,
				Dtype:                "float16",
				MaxModelLen:          4096,
				ExtraParameters: map[string]string{
					"gpu_memory_utilization": "0.9",
				},
			},
			expectError:  false,
			expectedHash: "f4ba65bccbb8e4fb", // Known hash for this specific identity
		},
		{
			name: "same identity produces same hash",
			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
			},
			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
			},
			expectError: false,
			shouldMatch: true,
		},
		{
			name: "different models produce different hashes",
			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
			},
			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-13b-hf",
				BackendFramework: "vllm",
			},
			expectError: false,
			shouldMatch: false,
		},
		{
			name: "different frameworks produce different hashes",
			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
			},
			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "sglang",
			},
			expectError: false,
			shouldMatch: false,
		},
		{
			// Unset (zero) TP/PP must normalize to 1. Comparing an explicit 0
			// against an omitted field would compare two identical Go values
			// and could never fail, so compare against explicit 1 instead.
			name: "normalization: unset TP/PP hash the same as explicit 1",
			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
				// TensorParallelSize and PipelineParallelSize omitted (zero)
			},
			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:                "meta-llama/Llama-2-7b-hf",
				BackendFramework:     "vllm",
				TensorParallelSize:   1,
				PipelineParallelSize: 1,
			},
			expectError: false,
			shouldMatch: true,
		},
		{
			name: "normalization: empty vs nil map",
			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
				ExtraParameters:  map[string]string{},
			},
			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
				ExtraParameters:  nil,
			},
			expectError: false,
			shouldMatch: true,
		},
		{
			name: "extra parameters order should not matter",
			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
				ExtraParameters: map[string]string{
					"param_a": "value1",
					"param_b": "value2",
				},
			},
			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
				ExtraParameters: map[string]string{
					"param_b": "value2",
					"param_a": "value1",
				},
			},
			expectError: false,
			shouldMatch: true,
		},
		{
			name: "different extra parameters produce different hashes",
			identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
				ExtraParameters: map[string]string{
					"gpu_memory_utilization": "0.9",
				},
			},
			otherIdentity: &nvidiacomv1alpha1.DynamoCheckpointIdentity{
				Model:            "meta-llama/Llama-2-7b-hf",
				BackendFramework: "vllm",
				ExtraParameters: map[string]string{
					"gpu_memory_utilization": "0.8",
				},
			},
			expectError: false,
			shouldMatch: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			hash1, err1 := ComputeIdentityHash(tt.identity)

			if tt.expectError {
				require.Error(t, err1)
				return
			}

			require.NoError(t, err1)
			assert.NotEmpty(t, hash1, "hash should not be empty")
			assert.Len(t, hash1, 16, "hash should be 16 characters (64 bits)")

			// Verify it's hex
			assert.Regexp(t, "^[0-9a-f]{16}$", hash1, "hash should be 16 hex characters")

			// If we have an expected hash, check it
			if tt.expectedHash != "" {
				assert.Equal(t, tt.expectedHash, hash1)
			}

			// If we have another identity to compare, compute its hash
			if tt.otherIdentity != nil {
				hash2, err2 := ComputeIdentityHash(*tt.otherIdentity)
				require.NoError(t, err2)

				if tt.shouldMatch {
					assert.Equal(t, hash1, hash2, "hashes should match")
				} else {
					assert.NotEqual(t, hash1, hash2, "hashes should differ")
				}
			}
		})
	}
}
......@@ -117,6 +117,89 @@ const (
ResourceStateReady = "ready"
ResourceStateNotReady = "not_ready"
ResourceStateUnknown = "unknown"
// Checkpoint related constants
KubeLabelCheckpointSource = "nvidia.com/checkpoint-source"
KubeLabelCheckpointHash = "nvidia.com/checkpoint-hash"
KubeLabelCheckpointName = "nvidia.com/checkpoint-name"
// EnvCheckpointStorageType indicates the storage backend type (pvc, s3, oci)
EnvCheckpointStorageType = "DYN_CHECKPOINT_STORAGE_TYPE"
// EnvCheckpointLocation is the source location of the checkpoint.
// For PVC: the checkpoint directory under the PVC base path (e.g., /checkpoints/{hash})
// For S3: s3://bucket/prefix/{hash}.tar
// For OCI: oci://registry/repo:{hash}
EnvCheckpointLocation = "DYN_CHECKPOINT_LOCATION"
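// EnvCheckpointPath is the local base directory that holds checkpoints.
// For PVC: the parent directory of the location (e.g., /checkpoints)
// For S3/OCI: download destination (e.g., /tmp/{hash}.tar); note the operator's env injection only sets this variable for PVC storage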
EnvCheckpointPath = "DYN_CHECKPOINT_PATH"
// EnvCheckpointHash is the identity hash (for debugging/observability)
EnvCheckpointHash = "DYN_CHECKPOINT_HASH"
// EnvCheckpointSignalFile is the full path to the signal file
// The DaemonSet writes this file after checkpoint is complete
// The checkpoint job pod waits for this file, then exits successfully
EnvCheckpointSignalFile = "DYN_CHECKPOINT_SIGNAL_FILE"
// EnvCheckpointReadyFile is the full path to a file the worker creates
// when the model is loaded and ready for checkpointing.
// The readiness probe watches this file to trigger DaemonSet checkpoint.
EnvCheckpointReadyFile = "DYN_CHECKPOINT_READY_FILE"
// CRIU-related environment variables for restore operations
// EnvRestoreMarkerFile is the file created by CRIU after successful restore
EnvRestoreMarkerFile = "DYN_RESTORE_MARKER_FILE"
// EnvCRIUWorkDir is the working directory for CRIU operations
EnvCRIUWorkDir = "CRIU_WORK_DIR"
// EnvCRIULogDir is the directory where CRIU writes logs
EnvCRIULogDir = "CRIU_LOG_DIR"
// EnvCUDAPluginDir is the directory containing CRIU CUDA plugins
EnvCUDAPluginDir = "CUDA_PLUGIN_DIR"
// EnvCRIUTimeout is the timeout for CRIU operations
EnvCRIUTimeout = "CRIU_TIMEOUT"
// CheckpointReadyFilePath is the default path for the ready file
CheckpointReadyFilePath = "/tmp/checkpoint-ready"
// RestoreMarkerFilePath is the default path for the restore marker
RestoreMarkerFilePath = "/tmp/dynamo-restored"
// CRIUWorkDirPath is the default CRIU work directory
CRIUWorkDirPath = "/var/criu-work"
// CRIULogDirPath is the default CRIU log directory
CRIULogDirPath = "/checkpoints/restore-logs"
// CUDAPluginDirPath is the default CUDA plugin directory
CUDAPluginDirPath = "/usr/local/lib/criu"
// DefaultCRIUTimeout is the default CRIU timeout in seconds (6 hours)
DefaultCRIUTimeout = "21600"
CheckpointVolumeName = "checkpoint-storage"
CheckpointSignalVolumeName = "checkpoint-signal"
CheckpointBasePath = "/checkpoints"
CheckpointSignalHostPath = "/var/lib/dynamo-checkpoint/signals"
CheckpointSignalMountPath = "/checkpoint-signal"
// PodInfo volume for Downward API (critical for CRIU restore)
// After CRIU restore, environment variables contain stale values from checkpoint pod.
// The Downward API files at /etc/podinfo always have current pod identity.
PodInfoVolumeName = "podinfo"
PodInfoMountPath = "/etc/podinfo"
// Downward API field paths
PodInfoFieldPodName = "metadata.name"
PodInfoFieldPodUID = "metadata.uid"
PodInfoFieldPodNamespace = "metadata.namespace"
// Downward API file names for DGD annotations
PodInfoFileDynNamespace = "dyn_namespace"
PodInfoFileDynComponent = "dyn_component"
PodInfoFileDynParentDGDName = "dyn_parent_dgd_name"
PodInfoFileDynParentDGDNS = "dyn_parent_dgd_namespace"
PodInfoFileDynDiscoveryBackend = "dyn_discovery_backend"
// Annotation keys for DGD info (exposed via Downward API)
AnnotationDynNamespace = "nvidia.com/dyn-namespace"
AnnotationDynComponent = "nvidia.com/dyn-component"
AnnotationDynParentDGDName = "nvidia.com/dyn-parent-dgd-name"
AnnotationDynParentDGDNS = "nvidia.com/dyn-parent-dgd-namespace"
AnnotationDynDiscoveryBackend = "nvidia.com/dyn-discovery-backend"
)
// MultinodeDeploymentType is a string-typed name for a multinode deployment
// flavor.
// NOTE(review): presumably the type of the MultinodeDeploymentTypeGrove and
// MultinodeDeploymentTypeLWS constants used by the controllers — confirm
// against the rest of this file.
type MultinodeDeploymentType string
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"context"
"fmt"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/record"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
)
// CheckpointReconciler reconciles DynamoCheckpoint objects: it computes the
// identity hash, creates the checkpoint Job and records the Job's outcome in
// the resource status.
type CheckpointReconciler struct {
// Client is the embedded Kubernetes API client used to read objects and to
// update DynamoCheckpoint status.
client.Client
// Config is the operator configuration; its Checkpoint section supplies the
// storage settings read by this reconciler (signal host path, PVC name and
// base path, init container image).
Config commonController.Config
// Recorder emits Kubernetes events when a checkpoint succeeds or fails.
Recorder record.EventRecorder
}
// getCheckpointLocation returns the storage location of the checkpoint
// identified by identityHash: the PVC base path resolved from the operator's
// checkpoint configuration, followed by "/" and the hash.
func (r *CheckpointReconciler) getCheckpointLocation(identityHash string) string {
	return fmt.Sprintf(
		"%s/%s",
		checkpoint.GetPVCBasePath(&r.Config.Checkpoint),
		identityHash,
	)
}
// getCheckpointStorageType reports the storage type recorded for checkpoints
// created by this reconciler. It always returns the PVC storage type,
// converted to the API's DynamoCheckpointStorageType.
func (r *CheckpointReconciler) getCheckpointStorageType() nvidiacomv1alpha1.DynamoCheckpointStorageType {
	pvcStorage := commonController.CheckpointStorageTypePVC
	return nvidiacomv1alpha1.DynamoCheckpointStorageType(pvcStorage)
}
// GetRecorder returns the reconciler's event recorder. It exists so that
// CheckpointReconciler satisfies the controller_common.Reconciler interface.
func (r *CheckpointReconciler) GetRecorder() record.EventRecorder {
return r.Recorder
}
// getSignalHostPath picks the host directory used for the checkpoint signal
// volume. The configured Storage.SignalHostPath is used only when
// checkpointing is enabled and the path is non-empty; in every other case the
// built-in default consts.CheckpointSignalHostPath is returned.
func (r *CheckpointReconciler) getSignalHostPath() string {
	ckptCfg := &r.Config.Checkpoint
	if !ckptCfg.Enabled || ckptCfg.Storage.SignalHostPath == "" {
		return consts.CheckpointSignalHostPath
	}
	return ckptCfg.Storage.SignalHostPath
}
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/finalizers,verbs=update
// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete

// Reconcile moves a DynamoCheckpoint one step through its lifecycle.
//
// On the first pass (empty status hash) it computes the identity hash from the
// spec, stores it together with the Pending phase, and returns; the status
// write is expected to trigger the next pass. On later passes it dispatches on
// the phase: Pending and Creating are delegated to their handlers, Ready and
// Failed are terminal no-ops, and any other phase is reset to Pending.
//
// A DynamoCheckpoint that no longer exists is not an error.
func (r *CheckpointReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{}
	if err := r.Get(ctx, req.NamespacedName, ckpt); err != nil {
		// NotFound means the object was deleted: nothing left to reconcile.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	logger.Info("Reconciling DynamoCheckpoint", "name", ckpt.Name, "phase", ckpt.Status.Phase)

	// First pass: persist the identity hash before doing anything else.
	if ckpt.Status.IdentityHash == "" {
		identityHash, hashErr := checkpoint.ComputeIdentityHash(ckpt.Spec.Identity)
		if hashErr != nil {
			logger.Error(hashErr, "Failed to compute identity hash")
			return ctrl.Result{}, fmt.Errorf("failed to compute identity hash: %w", hashErr)
		}

		ckpt.Status.IdentityHash = identityHash
		ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhasePending

		updateErr := r.Status().Update(ctx, ckpt)
		if updateErr != nil {
			logger.Error(updateErr, "Failed to update DynamoCheckpoint status with hash")
		}
		// On success the status write re-queues the object via the watch.
		return ctrl.Result{}, updateErr
	}

	switch ckpt.Status.Phase {
	case nvidiacomv1alpha1.DynamoCheckpointPhasePending:
		return r.handlePending(ctx, ckpt)
	case nvidiacomv1alpha1.DynamoCheckpointPhaseCreating:
		return r.handleCreating(ctx, ckpt)
	case nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
		nvidiacomv1alpha1.DynamoCheckpointPhaseFailed:
		// Terminal phases: Ready needs no work, and Failed is not retried here.
		return ctrl.Result{}, nil
	}

	// Unrecognized phase: start over from Pending.
	ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhasePending
	return ctrl.Result{}, r.Status().Update(ctx, ckpt)
}
// handlePending creates (or updates) the checkpoint Job for ckpt and then
// moves the DynamoCheckpoint to the Creating phase.
//
// The Job is named "checkpoint-<checkpoint name>" and is synced through
// commonController.SyncResource with ckpt as its parent. After a successful
// sync the status records the Job name and a true JobCreated condition.
// Any sync or status-update error is returned to the caller.
func (r *CheckpointReconciler) handlePending(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (ctrl.Result, error) {
	logger := log.FromContext(ctx)
	jobName := "checkpoint-" + ckpt.Name

	// Generator handed to SyncResource; it produces the desired Job.
	desiredJob := func(ctx context.Context) (*batchv1.Job, bool, error) {
		return r.buildCheckpointJob(ckpt, jobName), false, nil
	}

	changed, _, syncErr := commonController.SyncResource(ctx, r, ckpt, desiredJob)
	if syncErr != nil {
		logger.Error(syncErr, "Failed to sync checkpoint Job")
		return ctrl.Result{}, syncErr
	}
	if changed {
		logger.Info("Created/updated checkpoint Job", "job", jobName)
	}

	jobCreated := metav1.Condition{
		Type:               string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCreated),
		Status:             metav1.ConditionTrue,
		Reason:             "JobCreated",
		Message:            fmt.Sprintf("Checkpoint job %s created", jobName),
		LastTransitionTime: metav1.Now(),
	}

	ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseCreating
	ckpt.Status.JobName = jobName
	meta.SetStatusCondition(&ckpt.Status.Conditions, jobCreated)

	// A successful status write triggers the next reconcile via the watch.
	return ctrl.Result{}, r.Status().Update(ctx, ckpt)
}
// handleCreating inspects the checkpoint Job named in ckpt.Status.JobName and
// advances the DynamoCheckpoint accordingly:
//
//   - Job not found: the phase returns to Pending (and JobCreated becomes
//     false) so that the Pending handler creates the Job again.
//   - Job has at least one succeeded pod: the phase becomes Ready and the
//     checkpoint location, storage type and creation time are recorded.
//   - Job carries a true JobFailed condition: the phase becomes Failed.
//   - Otherwise the Job is still in progress and nothing is changed; a later
//     Job update event triggers another pass.
//
// Failure is read from the Job's Failed condition rather than from
// job.Status.Failed. The latter only counts failed pods, and is already
// non-zero after the first failed attempt even though the Job keeps retrying
// until backoffLimit is exhausted. Treating it as terminal would mark the
// checkpoint Failed while the Job could still succeed.
//
// Any API error other than NotFound, and any status-update error, is returned.
func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	// Look up the Job recorded in the status.
	job := &batchv1.Job{}
	if err := r.Get(ctx, client.ObjectKey{Namespace: ckpt.Namespace, Name: ckpt.Status.JobName}, job); err != nil {
		if apierrors.IsNotFound(err) {
			// Job was deleted, go back to Pending.
			ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhasePending
			ckpt.Status.JobName = ""
			meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
				Type:               string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCreated),
				Status:             metav1.ConditionFalse,
				Reason:             "JobDeleted",
				Message:            "Checkpoint job was deleted",
				LastTransitionTime: metav1.Now(),
			})
			if err := r.Status().Update(ctx, ckpt); err != nil {
				return ctrl.Result{}, err
			}
			return ctrl.Result{}, nil
		}
		return ctrl.Result{}, err
	}

	// Success: at least one pod of the Job completed successfully.
	if job.Status.Succeeded > 0 {
		logger.Info("Checkpoint Job succeeded", "job", job.Name)
		r.Recorder.Event(ckpt, corev1.EventTypeNormal, "CheckpointReady", "Checkpoint creation completed successfully")

		now := metav1.Now()
		ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseReady
		ckpt.Status.CreatedAt = &now
		ckpt.Status.Location = r.getCheckpointLocation(ckpt.Status.IdentityHash)
		ckpt.Status.StorageType = r.getCheckpointStorageType()

		meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
			Type:               string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted),
			Status:             metav1.ConditionTrue,
			Reason:             "JobSucceeded",
			Message:            "Checkpoint job completed successfully",
			LastTransitionTime: now,
		})
		meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
			Type:               string(nvidiacomv1alpha1.DynamoCheckpointConditionTarAvailable),
			Status:             metav1.ConditionTrue,
			Reason:             "TarCreated",
			Message:            fmt.Sprintf("Checkpoint available at %s", ckpt.Status.Location),
			LastTransitionTime: now,
		})
		if err := r.Status().Update(ctx, ckpt); err != nil {
			return ctrl.Result{}, err
		}
		return ctrl.Result{}, nil
	}

	// Terminal failure: only once the Job controller has set the Failed
	// condition (backoffLimit exhausted, deadline exceeded, ...). Individual
	// failed pods while retries remain are not terminal.
	jobFailed := false
	for i := range job.Status.Conditions {
		cond := &job.Status.Conditions[i]
		if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue {
			jobFailed = true
			break
		}
	}
	if jobFailed {
		logger.Info("Checkpoint Job failed", "job", job.Name)
		r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", "Checkpoint creation failed")

		ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseFailed
		ckpt.Status.Message = "Checkpoint job failed"
		meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
			Type:               string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted),
			Status:             metav1.ConditionFalse,
			Reason:             "JobFailed",
			Message:            "Checkpoint job failed",
			LastTransitionTime: metav1.Now(),
		})
		if err := r.Status().Update(ctx, ckpt); err != nil {
			return ctrl.Result{}, err
		}
		return ctrl.Result{}, nil
	}

	// Job is still running (possibly retrying) - we'll be notified via an
	// Update event when its status changes.
	return ctrl.Result{}, nil
}
// buildCheckpointJob assembles the batch Job that creates the checkpoint for
// ckpt. It starts from a deep copy of ckpt.Spec.Job.PodTemplateSpec (the spec
// itself is never mutated) and layers the checkpoint plumbing on top:
//
//   - checkpoint name/hash/source labels on the pod template;
//   - a hostPath "signal" volume plus an init container that removes any
//     leftover signal file for this hash;
//   - on the first container only: checkpoint env vars, the signal volume
//     mount, the checkpoint PVC mount (when a PVC name is configured), the
//     Downward API pod-info volume, a file-based readiness probe, and removal
//     of the liveness and startup probes;
//   - RestartPolicy Never and a Localhost seccomp profile on the pod.
//
// The seccomp profile is merged into an existing pod security context rather
// than replacing it, so settings already present on the template (for example
// runAsUser or fsGroup) are kept.
//
// Job-level settings come from ckpt.Spec.Job, defaulting to a 3600 second
// active deadline, a backoff limit of 3 and a 300 second TTL after finish.
//
// jobName becomes the Job's name; the Job is placed in ckpt's namespace.
func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.DynamoCheckpoint, jobName string) *batchv1.Job {
	// Use the pod template from the spec.
	podTemplate := ckpt.Spec.Job.PodTemplateSpec.DeepCopy()

	// Add checkpoint-related labels.
	if podTemplate.Labels == nil {
		podTemplate.Labels = make(map[string]string)
	}
	podTemplate.Labels[consts.KubeLabelCheckpointName] = ckpt.Name
	podTemplate.Labels[consts.KubeLabelCheckpointHash] = ckpt.Status.IdentityHash
	podTemplate.Labels[consts.KubeLabelCheckpointSource] = "true"

	// Add signal volume (hostPath for communication with DaemonSet).
	// The DaemonSet writes a signal file after checkpoint is complete.
	hostPathType := corev1.HostPathDirectoryOrCreate
	podTemplate.Spec.Volumes = append(podTemplate.Spec.Volumes, corev1.Volume{
		Name: consts.CheckpointSignalVolumeName,
		VolumeSource: corev1.VolumeSource{
			HostPath: &corev1.HostPathVolumeSource{
				Path: r.getSignalHostPath(),
				Type: &hostPathType,
			},
		},
	})

	// Compute the signal file path - unique per checkpoint hash.
	signalFilePath := consts.CheckpointSignalMountPath + "/" + ckpt.Status.IdentityHash + ".done"

	// Add initContainer to clean up any leftover signal file from previous runs.
	// This ensures a fresh start for each checkpoint job without affecting the
	// checkpoint itself: init containers complete before the main container
	// starts, so they don't appear in the checkpoint.
	podTemplate.Spec.InitContainers = append(podTemplate.Spec.InitContainers, corev1.Container{
		Name:  "cleanup-signal-file",
		Image: r.Config.Checkpoint.InitContainerImage,
		Command: []string{
			"sh",
			"-c",
			fmt.Sprintf("rm -f %s || true; echo 'Signal file cleanup complete'", signalFilePath),
		},
		VolumeMounts: []corev1.VolumeMount{
			{
				Name:      consts.CheckpointSignalVolumeName,
				MountPath: consts.CheckpointSignalMountPath,
			},
		},
	})

	// Add checkpoint env vars and volume mounts to the main (first) container.
	if len(podTemplate.Spec.Containers) > 0 {
		mainContainer := &podTemplate.Spec.Containers[0]

		checkpointLocation := r.getCheckpointLocation(ckpt.Status.IdentityHash)
		storageType := string(r.getCheckpointStorageType())

		mainContainer.Env = append(mainContainer.Env,
			// Signal file: DaemonSet writes this after checkpoint completes.
			corev1.EnvVar{
				Name:  consts.EnvCheckpointSignalFile,
				Value: signalFilePath,
			},
			// Ready file: worker creates this when the model is loaded.
			corev1.EnvVar{
				Name:  consts.EnvCheckpointReadyFile,
				Value: consts.CheckpointReadyFilePath,
			},
			// Checkpoint hash: for idempotency check.
			corev1.EnvVar{
				Name:  consts.EnvCheckpointHash,
				Value: ckpt.Status.IdentityHash,
			},
			// Checkpoint location: for idempotency check.
			corev1.EnvVar{
				Name:  consts.EnvCheckpointLocation,
				Value: checkpointLocation,
			},
			// Storage type: for idempotency check (pvc, s3, oci).
			corev1.EnvVar{
				Name:  consts.EnvCheckpointStorageType,
				Value: storageType,
			},
		)

		// Add signal volume mount (required for DaemonSet communication).
		mainContainer.VolumeMounts = append(mainContainer.VolumeMounts,
			corev1.VolumeMount{
				Name:      consts.CheckpointSignalVolumeName,
				MountPath: consts.CheckpointSignalMountPath,
			},
		)

		// Add checkpoint PVC volume and mount for mount namespace consistency
		// with restore pods: CRIU requires the exact same mount layout between
		// checkpoint and restore.
		if pvcName := r.Config.Checkpoint.Storage.PVC.PVCName; pvcName != "" {
			basePath := r.Config.Checkpoint.Storage.PVC.BasePath
			if basePath == "" {
				basePath = consts.CheckpointBasePath
			}
			checkpoint.InjectCheckpointVolume(&podTemplate.Spec, pvcName)
			checkpoint.InjectCheckpointVolumeMount(mainContainer, basePath)
		}

		// Add Downward API volume for pod identity (mount namespace
		// consistency with restore pods).
		checkpoint.InjectPodInfoVolume(&podTemplate.Spec)
		checkpoint.InjectPodInfoVolumeMount(mainContainer)

		// Override probes for checkpoint mode. Checkpoint jobs need different
		// probe behavior than regular worker pods:
		// - Readiness: wait for the model to load before checkpointing.
		// - Liveness/Startup: removed to prevent restarts during slow loading.
		mainContainer.ReadinessProbe = &corev1.Probe{
			ProbeHandler: corev1.ProbeHandler{
				Exec: &corev1.ExecAction{
					Command: []string{"cat", consts.CheckpointReadyFilePath},
				},
			},
			InitialDelaySeconds: 15,
			PeriodSeconds:       2,
		}
		mainContainer.LivenessProbe = nil
		mainContainer.StartupProbe = nil
	}

	// Set restart policy to Never for Jobs.
	podTemplate.Spec.RestartPolicy = corev1.RestartPolicyNever

	// Apply seccomp profile to block io_uring syscalls: CRIU doesn't support
	// io_uring memory mappings, so these syscalls must be blocked. Only the
	// SeccompProfile field is set; other pod security settings from the
	// template are preserved.
	// NOTE(review): confirm restore pods apply the seccomp profile the same
	// way so checkpoint and restore pods keep matching security contexts.
	if podTemplate.Spec.SecurityContext == nil {
		podTemplate.Spec.SecurityContext = &corev1.PodSecurityContext{}
	}
	podTemplate.Spec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{
		Type:             corev1.SeccompProfileTypeLocalhost,
		LocalhostProfile: ptr.To("profiles/block-iouring.json"),
	}

	// Job-level knobs, falling back to defaults when the spec leaves them unset.
	activeDeadlineSeconds := ckpt.Spec.Job.ActiveDeadlineSeconds
	if activeDeadlineSeconds == nil {
		activeDeadlineSeconds = ptr.To(int64(3600))
	}
	backoffLimit := ckpt.Spec.Job.BackoffLimit
	if backoffLimit == nil {
		backoffLimit = ptr.To(int32(3))
	}
	ttlSeconds := ckpt.Spec.Job.TTLSecondsAfterFinished
	if ttlSeconds == nil {
		ttlSeconds = ptr.To(int32(300))
	}

	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      jobName,
			Namespace: ckpt.Namespace,
			Labels: map[string]string{
				consts.KubeLabelCheckpointName: ckpt.Name,
				consts.KubeLabelCheckpointHash: ckpt.Status.IdentityHash,
			},
		},
		Spec: batchv1.JobSpec{
			ActiveDeadlineSeconds:   activeDeadlineSeconds,
			BackoffLimit:            backoffLimit,
			TTLSecondsAfterFinished: ttlSeconds,
			Template:                *podTemplate,
		},
	}
}
// SetupWithManager registers the reconciler with mgr. The controller watches
// DynamoCheckpoint objects and the Jobs they own. Job create events are
// ignored (the reconciler has just created the Job itself), while Job delete,
// update and generic events trigger a reconcile. The operator's ephemeral
// deployment event filter is applied to all watched events.
func (r *CheckpointReconciler) SetupWithManager(mgr ctrl.Manager) error {
	ownedJobEvents := predicate.Funcs{
		CreateFunc:  func(event.CreateEvent) bool { return false },
		DeleteFunc:  func(event.DeleteEvent) bool { return true },
		UpdateFunc:  func(event.UpdateEvent) bool { return true },
		GenericFunc: func(event.GenericEvent) bool { return true },
	}

	controllerBuilder := ctrl.NewControllerManagedBy(mgr).
		For(&nvidiacomv1alpha1.DynamoCheckpoint{}).
		Owns(&batchv1.Job{}, builder.WithPredicates(ownedJobEvents)).
		WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config))

	return controllerBuilder.Complete(r)
}
......@@ -34,6 +34,7 @@ import (
"emperror.dev/errors"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/common"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
......@@ -1200,7 +1201,17 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
isDebugModeEnabled := checkIfIsDebugModeEnabled(resourceAnnotations)
podSpec, err := dynamo.GenerateBasePodSpecForController(opt.dynamoComponentDeployment, r.DockerSecretRetriever, r.Config, role, commonconsts.MultinodeDeploymentTypeLWS)
// Resolve checkpoint for this component
var checkpointInfo *checkpoint.CheckpointInfo
if opt.dynamoComponentDeployment.Spec.Checkpoint != nil && opt.dynamoComponentDeployment.Spec.Checkpoint.Enabled {
info, err := checkpoint.ResolveCheckpointForService(ctx, r.Client, opt.dynamoComponentDeployment.Namespace, opt.dynamoComponentDeployment.Spec.Checkpoint)
if err != nil {
return nil, errors.Wrap(err, "failed to resolve checkpoint")
}
checkpointInfo = info
}
podSpec, err := dynamo.GenerateBasePodSpecForController(opt.dynamoComponentDeployment, r.DockerSecretRetriever, r.Config, role, commonconsts.MultinodeDeploymentTypeLWS, checkpointInfo)
if err != nil {
err = errors.Wrap(err, "failed to generate base pod spec")
return nil, err
......
......@@ -26,6 +26,7 @@ import (
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
"k8s.io/apimachinery/pkg/api/errors"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/discovery"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/secret"
......@@ -46,7 +47,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commoncontroller "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
......@@ -211,7 +211,7 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
type Resource interface {
IsReady() (ready bool, reason string)
GetName() string
GetServiceStatuses() map[string]v1alpha1.ServiceReplicaStatus
GetServiceStatuses() map[string]nvidiacomv1alpha1.ServiceReplicaStatus
}
type ReconcileResult struct {
......@@ -267,6 +267,14 @@ func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context
return ReconcileResult{}, fmt.Errorf("failed to reconcile top-level PVCs: %w", err)
}
// Reconcile checkpoints for services with checkpointing enabled
checkpointStatuses, checkpointInfos, err := r.reconcileCheckpoints(ctx, dynamoDeployment)
if err != nil {
logger.Error(err, "Failed to reconcile checkpoints")
return ReconcileResult{}, fmt.Errorf("failed to reconcile checkpoints: %w", err)
}
dynamoDeployment.Status.Checkpoints = checkpointStatuses
// Reconcile DynamoGraphDeploymentScalingAdapters for each service
err = r.reconcileScalingAdapters(ctx, dynamoDeployment)
if err != nil {
......@@ -313,7 +321,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context
var result ReconcileResult
if r.isGrovePathway(dynamoDeployment) {
logger.Info("Reconciling Grove resources", "hasMultinode", hasMultinode, "lwsEnabled", r.Config.LWS.Enabled)
result, err = r.reconcileGroveResources(ctx, dynamoDeployment, restartState)
result, err = r.reconcileGroveResources(ctx, dynamoDeployment, restartState, checkpointInfos)
} else {
logger.Info("Reconciling Dynamo components deployments", "hasMultinode", hasMultinode, "lwsEnabled", r.Config.LWS.Enabled)
result, err = r.reconcileDynamoComponentsDeployments(ctx, dynamoDeployment, restartState)
......@@ -432,7 +440,7 @@ func (r *DynamoGraphDeploymentReconciler) scaleGroveResource(ctx context.Context
return err
}
func (r *DynamoGraphDeploymentReconciler) reconcileGrovePodCliqueSet(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState) (*commoncontroller.Resource, error) {
func (r *DynamoGraphDeploymentReconciler) reconcileGrovePodCliqueSet(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState, checkpointInfos map[string]*checkpoint.CheckpointInfo) (*commoncontroller.Resource, error) {
logger := log.FromContext(ctx)
existingRestartAnnotations, err := r.getExistingRestartAnnotationsPCS(ctx, dynamoDeployment)
......@@ -442,7 +450,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGrovePodCliqueSet(ctx context
}
// generate the dynamoComponentsDeployments from the config
grovePodCliqueSet, err := dynamo.GenerateGrovePodCliqueSet(ctx, dynamoDeployment, r.Config, r.DockerSecretRetriever, restartState, existingRestartAnnotations)
grovePodCliqueSet, err := dynamo.GenerateGrovePodCliqueSet(ctx, dynamoDeployment, r.Config, r.DockerSecretRetriever, restartState, existingRestartAnnotations, checkpointInfos)
if err != nil {
logger.Error(err, "failed to generate the Grove GangSet")
return nil, fmt.Errorf("failed to generate the Grove GangSet: %w", err)
......@@ -456,7 +464,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGrovePodCliqueSet(ctx context
}
syncedGrovePodCliqueSetAsResource, err := commoncontroller.NewResourceWithServiceStatuses(
syncedGrovePodCliqueSet,
func() (bool, string, map[string]v1alpha1.ServiceReplicaStatus) {
func() (bool, string, map[string]nvidiacomv1alpha1.ServiceReplicaStatus) {
// Grove readiness: all underlying PodCliques and PodCliqueScalingGroups have replicas == availableReplicas
allComponentsReady, reason, serviceStatuses := dynamo.GetComponentReadinessAndServiceReplicaStatuses(ctx, r.Client, dynamoDeployment)
if !allComponentsReady {
......@@ -542,10 +550,10 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
return nil
}
func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState) (ReconcileResult, error) {
func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState, checkpointInfos map[string]*checkpoint.CheckpointInfo) (ReconcileResult, error) {
logger := log.FromContext(ctx)
grovePodCliqueSetAsResource, err := r.reconcileGrovePodCliqueSet(ctx, dynamoDeployment, restartState)
grovePodCliqueSetAsResource, err := r.reconcileGrovePodCliqueSet(ctx, dynamoDeployment, restartState, checkpointInfos)
if err != nil {
logger.Error(err, "failed to reconcile the Grove PodClique Set")
return ReconcileResult{}, fmt.Errorf("failed to reconcile the Grove PodClique Set: %w", err)
......@@ -912,7 +920,7 @@ func (r *DynamoGraphDeploymentReconciler) checkResourcesReadiness(resources []Re
var notReadyReasons []string
notReadyResources := []string{}
serviceStatuses := make(map[string]v1alpha1.ServiceReplicaStatus)
serviceStatuses := make(map[string]nvidiacomv1alpha1.ServiceReplicaStatus)
for _, resource := range resources {
ready, reason := resource.IsReady()
......@@ -1106,6 +1114,205 @@ func (r *DynamoGraphDeploymentReconciler) reconcilePVCs(ctx context.Context, dyn
return nil
}
// reconcileCheckpoints resolves, and in Auto mode creates, the
// DynamoCheckpoint backing every service of dynamoDeployment that has
// checkpointing enabled. Services without an enabled checkpoint section are
// skipped.
//
// For each remaining service the checkpoint is resolved through
// checkpoint.ResolveCheckpointForService. When no checkpoint name is resolved
// and the mode is Auto, a DynamoCheckpoint CR is created and the info is
// completed locally (name, identity hash, Ready=false), because the
// checkpoint controller fills in status asynchronously.
//
// It returns two maps keyed by service name: the per-service checkpoint
// status, and the resolved checkpoint info for later pod spec generation.
// The first error encountered aborts the whole call and both maps are nil.
func (r *DynamoGraphDeploymentReconciler) reconcileCheckpoints(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (map[string]nvidiacomv1alpha1.ServiceCheckpointStatus, map[string]*checkpoint.CheckpointInfo, error) {
	logger := log.FromContext(ctx)

	statusByService := make(map[string]nvidiacomv1alpha1.ServiceCheckpointStatus)
	infoByService := make(map[string]*checkpoint.CheckpointInfo)

	for serviceName, component := range dynamoDeployment.Spec.Services {
		ckptSpec := component.Checkpoint
		if ckptSpec == nil || !ckptSpec.Enabled {
			continue
		}

		logger.Info("Reconciling checkpoint for service", "service", serviceName)

		info, err := checkpoint.ResolveCheckpointForService(ctx, r.Client, dynamoDeployment.Namespace, ckptSpec)
		if err != nil {
			logger.Error(err, "Failed to resolve checkpoint for service", "service", serviceName)
			return nil, nil, fmt.Errorf("failed to resolve checkpoint for service %s: %w", serviceName, err)
		}
		// Keep the info around for pod spec generation.
		infoByService[serviceName] = info

		// Nothing resolved and Auto mode requested: create the checkpoint CR.
		mustCreate := info.CheckpointName == "" && ckptSpec.Mode == nvidiacomv1alpha1.CheckpointModeAuto
		if mustCreate {
			logger.Info("Creating DynamoCheckpoint CR in Auto mode", "service", serviceName)

			created, createErr := r.createCheckpointCR(ctx, dynamoDeployment, serviceName, component)
			if createErr != nil {
				logger.Error(createErr, "Failed to create DynamoCheckpoint CR", "service", serviceName)
				return nil, nil, fmt.Errorf("failed to create checkpoint for service %s: %w", serviceName, createErr)
			}
			info.CheckpointName = created.Name

			// The new CR's status may not be populated yet, so derive the
			// hash here instead of reading it back.
			identityHash, hashErr := checkpoint.ComputeIdentityHash(*ckptSpec.Identity)
			if hashErr != nil {
				logger.Error(hashErr, "Failed to compute checkpoint identity hash", "service", serviceName)
				return nil, nil, fmt.Errorf("failed to compute checkpoint hash for service %s: %w", serviceName, hashErr)
			}
			info.Hash = identityHash
			info.Ready = false // a freshly created checkpoint is not ready yet
		}

		statusByService[serviceName] = nvidiacomv1alpha1.ServiceCheckpointStatus{
			CheckpointName: info.CheckpointName,
			IdentityHash:   info.Hash,
			Ready:          info.Ready,
		}
	}

	return statusByService, infoByService, nil
}
// createCheckpointCR creates (or updates) the DynamoCheckpoint CR for one
// service of dynamoDeployment running in Auto mode.
//
// The CR is named after the identity hash, so identical identities map to the
// same checkpoint object. It is synced with a nil parent resource, i.e.
// without an owner reference, so that it is not tied to the lifetime of the
// DynamoGraphDeployment. The CR's job pod template is derived from the
// service spec via buildCheckpointJobPodTemplate.
// NOTE(review): the hash is reportedly 16 characters (64 bits) — confirm in
// checkpoint.ComputeIdentityHash.
//
// It returns an error when the component has no checkpoint identity, when the
// hash cannot be computed, or when syncing the CR fails.
func (r *DynamoGraphDeploymentReconciler) createCheckpointCR(
	ctx context.Context,
	dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment,
	serviceName string,
	component *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec,
) (*nvidiacomv1alpha1.DynamoCheckpoint, error) {
	ckptSpec := component.Checkpoint
	if ckptSpec == nil || ckptSpec.Identity == nil {
		return nil, fmt.Errorf("checkpoint identity is required for Auto mode")
	}
	identity := ckptSpec.Identity

	// The hash doubles as the CR name and as the hash label value.
	hash, err := checkpoint.ComputeIdentityHash(*identity)
	if err != nil {
		return nil, fmt.Errorf("failed to compute identity hash: %w", err)
	}

	// Generator handed to SyncResource; it produces the desired CR.
	desiredCheckpoint := func(ctx context.Context) (*nvidiacomv1alpha1.DynamoCheckpoint, bool, error) {
		// The job pod template is generated from the service spec; the
		// backend framework is taken from the checkpoint identity.
		jobPodTemplate, buildErr := r.buildCheckpointJobPodTemplate(
			dynamoDeployment,
			component,
			serviceName,
			identity.BackendFramework,
		)
		if buildErr != nil {
			return nil, false, fmt.Errorf("failed to build checkpoint job pod template: %w", buildErr)
		}

		labels := map[string]string{
			consts.KubeLabelDynamoGraphDeploymentName: dynamoDeployment.Name,
			consts.KubeLabelDynamoComponent:           serviceName,
			consts.KubeLabelCheckpointHash:            hash,
		}

		desired := &nvidiacomv1alpha1.DynamoCheckpoint{
			ObjectMeta: metav1.ObjectMeta{
				Name:      hash,
				Namespace: dynamoDeployment.Namespace,
				Labels:    labels,
			},
			Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{
				// Field-by-field copy of the service's checkpoint identity.
				Identity: nvidiacomv1alpha1.DynamoCheckpointIdentity{
					Model:                identity.Model,
					BackendFramework:     identity.BackendFramework,
					DynamoVersion:        identity.DynamoVersion,
					TensorParallelSize:   identity.TensorParallelSize,
					PipelineParallelSize: identity.PipelineParallelSize,
					Dtype:                identity.Dtype,
					MaxModelLen:          identity.MaxModelLen,
					ExtraParameters:      identity.ExtraParameters,
				},
				Job: nvidiacomv1alpha1.DynamoCheckpointJobConfig{
					PodTemplateSpec: jobPodTemplate,
				},
			},
		}
		return desired, false, nil
	}

	// nil parent: the checkpoint gets no owner reference.
	_, ckpt, err := commoncontroller.SyncResource(ctx, r, nil, desiredCheckpoint)
	if err != nil {
		return nil, fmt.Errorf("failed to sync checkpoint CR: %w", err)
	}
	return ckpt, nil
}
// buildCheckpointJobPodTemplate derives the pod template for a checkpoint job
// from a service's component spec.
//
// framework is the backend framework string from the checkpoint identity
// (e.g. "vllm", "sglang", "trtllm"); it is parsed with
// dynamo.ParseBackendFramework and a parse failure is returned unchanged.
//
// The pod spec is generated by dynamo.GenerateBasePodSpec from a deep copy of
// component whose Checkpoint section is cleared (the job creates a checkpoint
// rather than restoring from one) and whose DynamoNamespace is set from
// dynamo.GetDynamoNamespace. Generation uses the checkpoint role, a node
// count of 1, the Grove multinode deployment type and no checkpoint info.
// The resulting pod spec gets RestartPolicy Never and is wrapped in a
// template labelled with the service name.
func (r *DynamoGraphDeploymentReconciler) buildCheckpointJobPodTemplate(
	dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment,
	component *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec,
	serviceName string,
	framework string, // From checkpoint identity (e.g., "vllm", "sglang", "trtllm")
) (corev1.PodTemplateSpec, error) {
	parsedFramework, parseErr := dynamo.ParseBackendFramework(framework)
	if parseErr != nil {
		return corev1.PodTemplateSpec{}, parseErr
	}

	// Job-specific copy of the component: no checkpoint section, explicit
	// Dynamo namespace computed the same way as for regular pods.
	jobComponent := component.DeepCopy()
	jobComponent.Checkpoint = nil
	resolvedNamespace := dynamo.GetDynamoNamespace(dynamoDeployment, component)
	jobComponent.DynamoNamespace = &resolvedNamespace

	// Grove is passed as the deployment type even though the job is
	// single-node: GenerateBasePodSpec needs a valid multinode deployer, and
	// per the original author single-node backends return early.
	jobPodSpec, genErr := dynamo.GenerateBasePodSpec(
		jobComponent,
		parsedFramework,
		r.DockerSecretRetriever,
		dynamoDeployment.Name,
		dynamoDeployment.Namespace,
		dynamo.RoleCheckpoint,
		1,
		r.Config,
		consts.MultinodeDeploymentTypeGrove,
		serviceName,
		nil,
	)
	if genErr != nil {
		return corev1.PodTemplateSpec{}, fmt.Errorf("failed to generate base pod spec: %w", genErr)
	}

	// Jobs only accept Never or OnFailure as restart policy.
	jobPodSpec.RestartPolicy = corev1.RestartPolicyNever

	var tmpl corev1.PodTemplateSpec
	tmpl.ObjectMeta = metav1.ObjectMeta{
		Labels: map[string]string{
			consts.KubeLabelDynamoComponent: serviceName,
		},
	}
	tmpl.Spec = *jobPodSpec
	return tmpl, nil
}
// reconcileScalingAdapters ensures a DynamoGraphDeploymentScalingAdapter exists for each service in the DGD
// that has scaling adapter explicitly enabled. Services without scalingAdapter.enabled=true will not have a DGDSA.
// This enables pluggable autoscaling via HPA, KEDA, or Planner.
......
......@@ -678,7 +678,7 @@ func Test_reconcileGroveResources(t *testing.T) {
},
}
result, err := reconciler.reconcileGroveResources(ctx, dgd, nil)
result, err := reconciler.reconcileGroveResources(ctx, dgd, nil, nil)
g.Expect(err).NotTo(gomega.HaveOccurred())
g.Expect(result).To(gomega.Equal(tt.wantReconcileResult))
......
......@@ -84,6 +84,9 @@ type Config struct {
// When true, controllers skip validation (webhooks handle it)
// When false, controllers perform validation (defense in depth)
WebhooksEnabled bool
// Checkpoint configuration for checkpoint/restore functionality
Checkpoint CheckpointConfig
}
// RBACConfig holds configuration for RBAC management
......@@ -96,6 +99,65 @@ type RBACConfig struct {
EPPClusterRoleName string
}
// CheckpointConfig holds the operator-level configuration for checkpoint/restore functionality.
// It is carried on Config as the Checkpoint field.
type CheckpointConfig struct {
// Enabled indicates if checkpoint functionality is enabled.
Enabled bool
// Storage holds storage backend configuration (backend type plus per-backend settings).
Storage CheckpointStorageConfig
// CRIUTimeout is the CRIU timeout in seconds (required for CUDA checkpoints/restores).
// NOTE(review): the value is held as a string rather than a numeric type; presumably it is
// forwarded verbatim to the checkpoint tooling — confirm against the consumer.
CRIUTimeout string
// InitContainerImage is the image used for init containers (e.g., signal file cleanup).
// Documented default is "busybox:latest" if not specified; this struct does not apply the
// default itself, so it has to be handled by whatever reads this field.
InitContainerImage string
}
// Checkpoint storage type constants.
// These are the values documented for CheckpointStorageConfig.Type ("pvc, s3, or oci").
// They are untyped string constants.
const (
// CheckpointStorageTypePVC is the PVC storage backend type.
CheckpointStorageTypePVC = "pvc"
// CheckpointStorageTypeS3 is the S3 storage backend type.
CheckpointStorageTypeS3 = "s3"
// CheckpointStorageTypeOCI is the OCI registry storage backend type.
CheckpointStorageTypeOCI = "oci"
)
// CheckpointStorageConfig holds storage backend configuration for checkpoints.
// Type names the backend; the PVC, S3 and OCI fields each carry the settings for one backend
// and are documented as being used only when Type has the matching value.
type CheckpointStorageConfig struct {
// Type is the storage backend type: pvc, s3, or oci
// (see the CheckpointStorageType* constants).
Type string
// SignalHostPath is the host path for signal files (used for checkpoint job coordination).
SignalHostPath string
// PVC configuration (used when Type=pvc)
PVC CheckpointPVCConfig
// S3 configuration (used when Type=s3)
S3 CheckpointS3Config
// OCI configuration (used when Type=oci)
OCI CheckpointOCIConfig
}
// CheckpointPVCConfig holds PVC storage configuration.
// It is the PVC member of CheckpointStorageConfig.
type CheckpointPVCConfig struct {
// PVCName is the name of the PVC.
PVCName string
// BasePath is the base directory within the PVC.
BasePath string
}
// CheckpointS3Config holds S3 storage configuration.
// It is the S3 member of CheckpointStorageConfig.
type CheckpointS3Config struct {
// URI is the S3 URI (s3://[endpoint/]bucket/prefix); the endpoint segment is optional.
URI string
// CredentialsSecretRef is the name of the credentials secret
// (should contain AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and optionally AWS_REGION)
CredentialsSecretRef string
}
// CheckpointOCIConfig holds OCI registry storage configuration.
// It is the OCI member of CheckpointStorageConfig.
type CheckpointOCIConfig struct {
// URI is the OCI URI (oci://registry/repository)
URI string
// CredentialsSecretRef is the name of the docker config secret
// used for the registry credentials.
CredentialsSecretRef string
}
type IngressConfig struct {
VirtualServiceGateway string
IngressControllerClassName string
......
......@@ -96,7 +96,7 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
err = r.Get(ctx, types.NamespacedName{Name: resourceName, Namespace: resourceNamespace}, oldResource)
oldResourceIsNotFound := errors.IsNotFound(err)
if err != nil && !oldResourceIsNotFound {
r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("Get%s", resourceType), "Failed to get %s %s: %s", resourceType, resourceNamespace, err)
r.GetRecorder().Eventf(resource, corev1.EventTypeWarning, fmt.Sprintf("Get%s", resourceType), "Failed to get %s %s: %s", resourceType, resourceNamespace, err)
logs.Error(err, "Failed to get resource.")
return
}
......@@ -109,33 +109,39 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
}
logs.Info("Resource not found. Creating a new one.")
// Only set controller reference if parentResource is provided
// Passing nil as parentResource creates an independent resource (no owner reference)
if parentResource != nil {
err = ctrl.SetControllerReference(parentResource, resource, r.Scheme())
if err != nil {
logs.Error(err, "Failed to set controller reference.")
r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, "SetControllerReference", "Failed to set controller reference for %s %s: %s", resourceType, resourceNamespace, err)
r.GetRecorder().Eventf(resource, corev1.EventTypeWarning, "SetControllerReference", "Failed to set controller reference for %s %s: %s", resourceType, resourceNamespace, err)
return
}
} else {
logs.Info("No parent resource provided, creating resource without owner reference (independent lifecycle)")
}
var hash string
hash, err = GetSpecHash(resource)
if err != nil {
logs.Error(err, "Failed to get spec hash.")
r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, "GetSpecHash", "Failed to get spec hash for %s %s: %s", resourceType, resourceNamespace, err)
r.GetRecorder().Eventf(resource, corev1.EventTypeWarning, "GetSpecHash", "Failed to get spec hash for %s %s: %s", resourceType, resourceNamespace, err)
return
}
// On create, set generation to 1 (new resources start at generation 1)
updateAnnotations(resource, hash, 1)
r.GetRecorder().Eventf(parentResource, corev1.EventTypeNormal, fmt.Sprintf("Create%s", resourceType), "Creating a new %s %s", resourceType, resourceNamespace)
r.GetRecorder().Eventf(resource, corev1.EventTypeNormal, fmt.Sprintf("Create%s", resourceType), "Creating a new %s %s", resourceType, resourceNamespace)
err = r.Create(ctx, resource)
if err != nil {
logs.Error(err, "Failed to create Resource.")
r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("Create%s", resourceType), "Failed to create %s %s: %s", resourceType, resourceNamespace, err)
r.GetRecorder().Eventf(resource, corev1.EventTypeWarning, fmt.Sprintf("Create%s", resourceType), "Failed to create %s %s: %s", resourceType, resourceNamespace, err)
return
}
logs.Info(fmt.Sprintf("%s created.", resourceType))
r.GetRecorder().Eventf(parentResource, corev1.EventTypeNormal, fmt.Sprintf("Create%s", resourceType), "Created %s %s", resourceType, resourceNamespace)
r.GetRecorder().Eventf(resource, corev1.EventTypeNormal, fmt.Sprintf("Create%s", resourceType), "Created %s %s", resourceType, resourceNamespace)
modified = true
res = resource
} else {
......@@ -145,11 +151,11 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
err = r.Delete(ctx, oldResource)
if err != nil {
logs.Error(err, fmt.Sprintf("Failed to delete %s.", resourceType))
r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("Delete%s", resourceType), "Failed to delete %s %s: %s", resourceType, resourceNamespace, err)
r.GetRecorder().Eventf(oldResource, corev1.EventTypeWarning, fmt.Sprintf("Delete%s", resourceType), "Failed to delete %s %s: %s", resourceType, resourceNamespace, err)
return
}
logs.Info(fmt.Sprintf("%s deleted.", resourceType))
r.GetRecorder().Eventf(parentResource, corev1.EventTypeNormal, fmt.Sprintf("Delete%s", resourceType), "Deleted %s %s", resourceType, resourceNamespace)
r.GetRecorder().Eventf(oldResource, corev1.EventTypeNormal, fmt.Sprintf("Delete%s", resourceType), "Deleted %s %s", resourceType, resourceNamespace)
modified = true
return
}
......@@ -158,13 +164,13 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
var changeResult SpecChangeResult
changeResult, err = GetSpecChangeResult(oldResource, resource)
if err != nil {
r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("CalculatePatch%s", resourceType), "Failed to calculate patch for %s %s: %s", resourceType, resourceNamespace, err)
r.GetRecorder().Eventf(resource, corev1.EventTypeWarning, fmt.Sprintf("CalculatePatch%s", resourceType), "Failed to calculate patch for %s %s: %s", resourceType, resourceNamespace, err)
return false, resource, fmt.Errorf("failed to check if spec has changed: %w", err)
}
if !changeResult.NeedsUpdate {
logs.Info(fmt.Sprintf("%s spec is the same. Skipping update.", resourceType))
r.GetRecorder().Eventf(parentResource, corev1.EventTypeNormal, fmt.Sprintf("Update%s", resourceType), "Skipping update %s %s", resourceType, resourceNamespace)
r.GetRecorder().Eventf(oldResource, corev1.EventTypeNormal, fmt.Sprintf("Update%s", resourceType), "Skipping update %s %s", resourceType, resourceNamespace)
res = oldResource
return
}
......@@ -188,7 +194,7 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
err = CopySpec(resource, oldResource)
if err != nil {
logs.Error(err, fmt.Sprintf("Failed to copy spec for %s.", resourceType))
r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("CopySpec%s", resourceType), "Failed to copy spec for %s %s: %s", resourceType, resourceNamespace, err)
r.GetRecorder().Eventf(oldResource, corev1.EventTypeWarning, fmt.Sprintf("CopySpec%s", resourceType), "Failed to copy spec for %s %s: %s", resourceType, resourceNamespace, err)
return
}
......@@ -197,11 +203,11 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
err = r.Update(ctx, oldResource)
if err != nil {
logs.Error(err, fmt.Sprintf("Failed to update %s.", resourceType))
r.GetRecorder().Eventf(parentResource, corev1.EventTypeWarning, fmt.Sprintf("Update%s", resourceType), "Failed to update %s %s: %s", resourceType, resourceNamespace, err)
r.GetRecorder().Eventf(oldResource, corev1.EventTypeWarning, fmt.Sprintf("Update%s", resourceType), "Failed to update %s %s: %s", resourceType, resourceNamespace, err)
return
}
logs.Info(fmt.Sprintf("%s updated.", resourceType))
r.GetRecorder().Eventf(parentResource, corev1.EventTypeNormal, fmt.Sprintf("Update%s", resourceType), "Updated %s %s", resourceType, resourceNamespace)
r.GetRecorder().Eventf(oldResource, corev1.EventTypeNormal, fmt.Sprintf("Update%s", resourceType), "Updated %s %s", resourceType, resourceNamespace)
modified = true
res = oldResource
}
......
......@@ -35,6 +35,7 @@ import (
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/discovery"
......@@ -252,7 +253,7 @@ func ParseDynDeploymentConfig(ctx context.Context, jsonContent []byte) (DynDeplo
func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphDeployment *v1alpha1.DynamoGraphDeployment, defaultIngressSpec *v1alpha1.IngressSpec, restartState *RestartState, existingRestartAnnotations map[string]string) (map[string]*v1alpha1.DynamoComponentDeployment, error) {
deployments := make(map[string]*v1alpha1.DynamoComponentDeployment)
for componentName, component := range parentDynamoGraphDeployment.Spec.Services {
dynamoNamespace := getDynamoNamespace(parentDynamoGraphDeployment, component)
dynamoNamespace := GetDynamoNamespace(parentDynamoGraphDeployment, component)
deployment := &v1alpha1.DynamoComponentDeployment{}
deployment.Spec.DynamoComponentDeploymentSharedSpec = *component
deployment.Name = GetDynamoComponentName(parentDynamoGraphDeployment, componentName)
......@@ -336,7 +337,7 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD
return deployments, nil
}
func getDynamoNamespace(object metav1.Object, service *v1alpha1.DynamoComponentDeploymentSharedSpec) string {
// GetDynamoNamespace returns the Dynamo namespace for a service.
// The value is computed by v1alpha1.ComputeDynamoNamespace from the service's
// GlobalDynamoNamespace setting and the namespace and name of the owning object.
func GetDynamoNamespace(object metav1.Object, service *v1alpha1.DynamoComponentDeploymentSharedSpec) string {
	useGlobal := service.GlobalDynamoNamespace
	ownerNamespace := object.GetNamespace()
	ownerName := object.GetName()
	return v1alpha1.ComputeDynamoNamespace(useGlobal, ownerNamespace, ownerName)
}
......@@ -736,6 +737,7 @@ const (
RoleLeader Role = "leader"
RoleWorker Role = "worker"
RoleMain Role = "main"
RoleCheckpoint Role = "checkpoint"
)
// Update ServiceRole struct for expandRolesForService
......@@ -766,8 +768,21 @@ const (
BackendFrameworkSGLang BackendFramework = "sglang"
BackendFrameworkVLLM BackendFramework = "vllm"
BackendFrameworkTRTLLM BackendFramework = "trtllm"
BackendFrameworkNoop BackendFramework = "noop"
)
// ParseBackendFramework converts a string to the BackendFramework type.
//
// The string is accepted when it equals one of BackendFrameworkVLLM, BackendFrameworkSGLang,
// BackendFrameworkTRTLLM or BackendFrameworkNoop; matching is exact (no case folding or
// trimming). Any other input yields an empty BackendFramework and an error.
//
// Note that the error text lists only vllm, sglang and trtllm as valid values even though
// the noop framework is also accepted.
func ParseBackendFramework(framework string) (BackendFramework, error) {
	candidate := BackendFramework(framework)

	recognized := candidate == BackendFrameworkVLLM ||
		candidate == BackendFrameworkSGLang ||
		candidate == BackendFrameworkTRTLLM ||
		candidate == BackendFrameworkNoop
	if !recognized {
		return "", fmt.Errorf("unsupported backend framework: %s (valid values: vllm, sglang, trtllm)", framework)
	}
	return candidate, nil
}
// Backend interface for modular backend logic
// Each backend (SGLang, VLLM, etc.) implements this interface
type Backend interface {
......@@ -897,6 +912,7 @@ func GenerateBasePodSpec(
controllerConfig controller_common.Config,
multinodeDeploymentType commonconsts.MultinodeDeploymentType,
serviceName string,
checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by ResolveCheckpointForService)
) (*corev1.PodSpec, error) {
// Start with base container generated per component type
componentContext := generateComponentContext(component, parentGraphDeploymentName, namespace, numberOfNodes, controllerConfig.GetDiscoveryBackend(component.Annotations))
......@@ -1071,6 +1087,23 @@ func GenerateBasePodSpec(
podSpec.ImagePullSecrets = controller_common.AppendUniqueImagePullSecrets(podSpec.ImagePullSecrets, imagePullSecrets)
backend.UpdatePodSpec(&podSpec, numberOfNodes, role, component, serviceName)
// Inject checkpoint configuration if enabled
// This handles ALL checkpoint-related modifications:
// - Command/Args transformation (moves Command to Args to respect image ENTRYPOINT)
// - Security context (hostIPC, privileged mode)
// - Environment variables (checkpoint path, hash, CRIU settings)
// - Storage configuration (volumes, mounts)
// CheckpointInfo should have been resolved by ResolveCheckpointForService before calling this function
// Checkpoint config comes from the operator's controller config (Helm values)
var checkpointConfig *controller_common.CheckpointConfig
if controllerConfig.Checkpoint.Enabled {
checkpointConfig = &controllerConfig.Checkpoint
}
if err := checkpoint.InjectCheckpointIntoPodSpec(&podSpec, checkpointInfo, checkpointConfig); err != nil {
return nil, fmt.Errorf("failed to inject checkpoint config: %w", err)
}
return &podSpec, nil
}
......@@ -1111,11 +1144,12 @@ func GeneratePodSpecForComponent(
controllerConfig controller_common.Config,
multinodeDeploymentType commonconsts.MultinodeDeploymentType,
serviceName string,
checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info
) (*corev1.PodSpec, error) {
if len(dynamoDeployment.Spec.Envs) > 0 {
component.Envs = MergeEnvs(dynamoDeployment.Spec.Envs, component.Envs)
}
podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, controllerConfig, multinodeDeploymentType, serviceName)
podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, controllerConfig, multinodeDeploymentType, serviceName, checkpointInfo)
if err != nil {
return nil, err
}
......@@ -1130,6 +1164,7 @@ func GenerateGrovePodCliqueSet(
secretsRetriever SecretsRetriever,
restartState *RestartState,
existingRestartAnnotations map[string]string,
checkpointInfoByService map[string]*checkpoint.CheckpointInfo, // Optional checkpoint info per service
) (*grovev1alpha1.PodCliqueSet, error) {
gangSet := &grovev1alpha1.PodCliqueSet{}
gangSet.Name = dynamoDeployment.Name
......@@ -1157,7 +1192,7 @@ func GenerateGrovePodCliqueSet(
var scalingGroups []grovev1alpha1.PodCliqueScalingGroupConfig
for serviceName, component := range dynamoDeployment.Spec.Services {
dynamoNamespace := getDynamoNamespace(dynamoDeployment, component)
dynamoNamespace := GetDynamoNamespace(dynamoDeployment, component)
component.DynamoNamespace = &dynamoNamespace
// Determine backend framework using hybrid approach
backendFramework, err := getBackendFrameworkFromComponent(component, dynamoDeployment)
......@@ -1172,6 +1207,12 @@ func GenerateGrovePodCliqueSet(
component.Annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend] = discoveryBackend
}
// Get checkpoint info for this service if available
var checkpointInfo *checkpoint.CheckpointInfo
if checkpointInfoByService != nil {
checkpointInfo = checkpointInfoByService[serviceName]
}
numberOfNodes := component.GetNumberOfNodes()
isMultinode := numberOfNodes > 1
roles := expandRolesForService(serviceName, component.Replicas, numberOfNodes)
......@@ -1188,6 +1229,7 @@ func GenerateGrovePodCliqueSet(
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
serviceName,
checkpointInfo,
)
if err != nil {
return nil, fmt.Errorf("failed to generate podSpec for role %s: %w", r.Name, err)
......@@ -1272,15 +1314,21 @@ func generateLabels(component *v1alpha1.DynamoComponentDeploymentSharedSpec, dyn
}
// Add base model label if modelRef is specified
AddBaseModelLabel(labels, component.ModelRef)
// Add checkpoint labels if checkpointing is enabled
var err error
labels, err = checkpoint.InjectCheckpointLabelsFromConfig(labels, component.Checkpoint)
if err != nil {
return nil, fmt.Errorf("failed to inject checkpoint labels: %w", err)
}
setMetricsLabels(labels, dynamoDeployment)
if component.Labels != nil {
err := mergo.Merge(&labels, component.Labels, mergo.WithOverride)
err = mergo.Merge(&labels, component.Labels, mergo.WithOverride)
if err != nil {
return nil, fmt.Errorf("failed to merge labels: %w", err)
}
}
if component.ExtraPodMetadata != nil {
err := mergo.Merge(&labels, component.ExtraPodMetadata.Labels, mergo.WithOverride)
err = mergo.Merge(&labels, component.ExtraPodMetadata.Labels, mergo.WithOverride)
if err != nil {
return nil, fmt.Errorf("failed to merge extraPodMetadata labels: %w", err)
}
......@@ -1336,9 +1384,6 @@ func detectBackendFrameworkFromArgs(command []string, args []string) (BackendFra
return detected[0], nil
}
// BackendFrameworkNoop represents no backend processing needed
const BackendFrameworkNoop BackendFramework = "noop"
// determineBackendFramework is the core logic for hybrid backend framework detection
// Takes extracted parameters and applies the detection logic
func determineBackendFramework(
......@@ -1457,6 +1502,7 @@ func GenerateBasePodSpecForController(
controllerConfig controller_common.Config,
role Role,
multinodeDeploymentType commonconsts.MultinodeDeploymentType,
checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by caller)
) (*corev1.PodSpec, error) {
// Convert to our interface
componentSpec := ConvertDynamoComponentDeploymentToSpec(dynComponent)
......@@ -1483,6 +1529,7 @@ func GenerateBasePodSpecForController(
controllerConfig,
multinodeDeploymentType,
serviceName,
checkpointInfo,
)
if err != nil {
return nil, err
......
......@@ -3665,7 +3665,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := GenerateGrovePodCliqueSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, nil, nil, nil)
got, err := GenerateGrovePodCliqueSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, nil, nil, nil, nil)
if (err != nil) != tt.wantErr {
t.Errorf("GenerateGrovePodCliqueSet() error = %v, wantErr %v", err, tt.wantErr)
return
......@@ -3717,7 +3717,7 @@ func Test_GeneratePodCliqueSetGlobalDynamoNamespace(t *testing.T) {
},
}
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controller_common.Config{}, nil, nil, nil)
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controller_common.Config{}, nil, nil, nil, nil)
if !assert.NoError(t, err) {
return
}
......@@ -3880,6 +3880,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"worker",
nil, // No checkpoint info in tests
)
if tt.expectError {
......@@ -4037,6 +4038,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"worker",
nil, // No checkpoint info in tests
)
if tt.expectError {
......@@ -4123,6 +4125,7 @@ func TestGeneratePodSpecForComponent_UnsupportedBackend(t *testing.T) {
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"worker",
nil, // No checkpoint info in tests
)
if tt.expectError {
......@@ -4800,7 +4803,7 @@ func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
NatsAddress: "nats-address",
}
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controllerConfig, secretsRetriever, nil, nil)
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controllerConfig, secretsRetriever, nil, nil, nil)
if err != nil {
t.Errorf("GenerateGrovePodCliqueSet() error = %v", err)
return
......@@ -4909,6 +4912,7 @@ func TestGenerateBasePodSpec_Frontend(t *testing.T) {
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
)
if (err != nil) != tt.wantErr {
......@@ -4984,6 +4988,7 @@ func TestGenerateBasePodSpec_PlannerServiceAccount(t *testing.T) {
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
)
if err != nil {
......@@ -5106,6 +5111,7 @@ func TestGenerateBasePodSpec_DisableImagePullSecretDiscovery(t *testing.T) {
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
)
if err != nil {
......@@ -5201,6 +5207,7 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
tt.controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
)
if !assert.NoError(t, err) {
return
......@@ -5360,6 +5367,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
)
if err != nil {
......@@ -5456,6 +5464,7 @@ func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) {
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
)
if tt.expectError {
......@@ -5691,6 +5700,7 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
)
if tt.expectError {
......@@ -5902,6 +5912,7 @@ func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T)
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
)
if tt.expectError {
......@@ -6087,6 +6098,7 @@ func TestGenerateBasePodSpec_SecurityContext(t *testing.T) {
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
)
if err != nil {
......@@ -6581,7 +6593,7 @@ func TestGenerateGrovePodCliqueSet_RestartAnnotations(t *testing.T) {
NatsAddress: "nats-address",
}
got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, controllerConfig, nil, tt.restartState, nil)
got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, controllerConfig, nil, tt.restartState, nil, nil)
if err != nil {
t.Fatalf("GenerateGrovePodCliqueSet() error = %v", err)
}
......
......@@ -12,3 +12,4 @@ Deployment Guide
Minikube Setup <../kubernetes/deployment/minikube>
Managing Models with DynamoModel <../kubernetes/deployment/dynamomodel-guide>
Autoscaling <../kubernetes/autoscaling>
Checkpointing <../kubernetes/chrek/README>
......@@ -27,6 +27,9 @@
kubernetes/api_reference.md
kubernetes/deployment/create_deployment.md
kubernetes/deployment/dynamomodel-guide.md
kubernetes/chrek/README.md
kubernetes/chrek/dynamo.md
kubernetes/chrek/standalone.md
kubernetes/fluxcd.md
kubernetes/grove.md
......
......@@ -244,6 +244,7 @@ Key customization points include:
- **[Operator Documentation](/docs/kubernetes/dynamo_operator.md)** - How the platform works
- **[Service Discovery](/docs/kubernetes/service_discovery.md)** - Discovery backends and configuration
- **[Helm Charts](/deploy/helm/README.md)** - For advanced users
- **[Checkpointing](/docs/kubernetes/chrek/README.md)** - Fast pod startup with checkpoint/restore
- **[GitOps Deployment with FluxCD](/docs/kubernetes/fluxcd.md)** - For advanced users
- **[Logging](/docs/kubernetes/observability/logging.md)** - For logging setup
- **[Multinode Deployment](/docs/kubernetes/deployment/multinode-deployment.md)** - For multinode deployment
......
......@@ -34,6 +34,7 @@ a high-level, SLA-driven interface for deploying machine learning models on Dyna
Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group.
### Resource Types
- [DynamoCheckpoint](#dynamocheckpoint)
- [DynamoComponentDeployment](#dynamocomponentdeployment)
- [DynamoGraphDeployment](#dynamographdeployment)
- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
......@@ -67,6 +68,24 @@ _Appears in:_
#### CheckpointMode
_Underlying type:_ _string_
CheckpointMode defines how checkpoint creation is handled
_Validation:_
- Enum: [Auto Manual]
_Appears in:_
- [ServiceCheckpointConfig](#servicecheckpointconfig)
| Field | Description |
| --- | --- |
| `Auto` | CheckpointModeAuto means the DGD controller will automatically create a Checkpoint CR<br /> |
| `Manual` | CheckpointModeManual means the user must create the Checkpoint CR themselves<br /> |
#### ComponentKind
_Underlying type:_ _string_
......@@ -148,6 +167,146 @@ _Appears in:_
#### DynamoCheckpoint
DynamoCheckpoint is the Schema for the dynamocheckpoints API
It represents a container checkpoint that can be used to restore pods to a warm state
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
| `kind` _string_ | `DynamoCheckpoint` | | |
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
| `spec` _[DynamoCheckpointSpec](#dynamocheckpointspec)_ | | | |
| `status` _[DynamoCheckpointStatus](#dynamocheckpointstatus)_ | | | |
#### DynamoCheckpointIdentity
DynamoCheckpointIdentity defines the inputs that determine checkpoint equivalence
Two checkpoints with the same identity hash are considered equivalent
_Appears in:_
- [DynamoCheckpointSpec](#dynamocheckpointspec)
- [ServiceCheckpointConfig](#servicecheckpointconfig)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `model` _string_ | Model is the model identifier (e.g., "meta-llama/Llama-3-70B") | | Required: \{\} <br /> |
| `backendFramework` _string_ | BackendFramework is the runtime framework (vllm, sglang, trtllm) | | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
| `dynamoVersion` _string_ | DynamoVersion is the Dynamo platform version (optional)<br />If not specified, version is not included in identity hash<br />This ensures checkpoint compatibility across Dynamo releases | | Optional: \{\} <br /> |
| `tensorParallelSize` _integer_ | TensorParallelSize is the tensor parallel configuration | 1 | Minimum: 1 <br />Optional: \{\} <br /> |
| `pipelineParallelSize` _integer_ | PipelineParallelSize is the pipeline parallel configuration | 1 | Minimum: 1 <br />Optional: \{\} <br /> |
| `dtype` _string_ | Dtype is the data type (fp16, bf16, fp8, etc.) | | Optional: \{\} <br /> |
| `maxModelLen` _integer_ | MaxModelLen is the maximum sequence length | | Minimum: 1 <br />Optional: \{\} <br /> |
| `extraParameters` _object (keys:string, values:string)_ | ExtraParameters are additional parameters that affect the checkpoint hash<br />Use for any framework-specific or custom parameters not covered above | | Optional: \{\} <br /> |
#### DynamoCheckpointJobConfig
DynamoCheckpointJobConfig defines the configuration for the checkpoint creation Job
_Appears in:_
- [DynamoCheckpointSpec](#dynamocheckpointspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `podTemplateSpec` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | PodTemplateSpec allows customizing the checkpoint Job pod<br />This should include the container that runs the workload to be checkpointed | | Required: \{\} <br /> |
| `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Optional: \{\} <br /> |
| `backoffLimit` _integer_ | BackoffLimit specifies the number of retries before marking the Job failed | 3 | Optional: \{\} <br /> |
| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Optional: \{\} <br /> |
#### DynamoCheckpointPhase
_Underlying type:_ _string_
DynamoCheckpointPhase represents the current phase of the checkpoint lifecycle
_Validation:_
- Enum: [Pending Creating Ready Failed]
_Appears in:_
- [DynamoCheckpointStatus](#dynamocheckpointstatus)
| Field | Description |
| --- | --- |
| `Pending` | DynamoCheckpointPhasePending indicates the checkpoint CR has been created but the Job has not started<br /> |
| `Creating` | DynamoCheckpointPhaseCreating indicates the checkpoint Job is running<br /> |
| `Ready` | DynamoCheckpointPhaseReady indicates the checkpoint tar file is available on the PVC<br /> |
| `Failed` | DynamoCheckpointPhaseFailed indicates the checkpoint creation failed<br /> |
#### DynamoCheckpointSpec
DynamoCheckpointSpec defines the desired state of DynamoCheckpoint
_Appears in:_
- [DynamoCheckpoint](#dynamocheckpoint)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the inputs that determine checkpoint equivalence | | Required: \{\} <br /> |
| `job` _[DynamoCheckpointJobConfig](#dynamocheckpointjobconfig)_ | Job defines the configuration for the checkpoint creation Job | | Required: \{\} <br /> |
#### DynamoCheckpointStatus
DynamoCheckpointStatus defines the observed state of DynamoCheckpoint
_Appears in:_
- [DynamoCheckpoint](#dynamocheckpoint)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `phase` _[DynamoCheckpointPhase](#dynamocheckpointphase)_ | Phase represents the current phase of the checkpoint lifecycle | | Enum: [Pending Creating Ready Failed] <br />Optional: \{\} <br /> |
| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity<br />This hash is used to identify equivalent checkpoints | | Optional: \{\} <br /> |
| `location` _string_ | Location is the full URI/path to the checkpoint in the storage backend<br />For PVC: same as TarPath (e.g., /checkpoints/\{hash\}.tar)<br />For S3: s3://bucket/prefix/\{hash\}.tar<br />For OCI: oci://registry/repo:\{hash\} | | Optional: \{\} <br /> |
| `storageType` _[DynamoCheckpointStorageType](#dynamocheckpointstoragetype)_ | StorageType indicates the storage backend type used for this checkpoint | | Enum: [pvc s3 oci] <br />Optional: \{\} <br /> |
| `jobName` _string_ | JobName is the name of the checkpoint creation Job | | Optional: \{\} <br /> |
| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint tar was created | | Optional: \{\} <br /> |
| `message` _string_ | Message provides additional information about the current state | | Optional: \{\} <br /> |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represent the latest available observations of the checkpoint's state | | Optional: \{\} <br /> |
#### DynamoCheckpointStorageType
_Underlying type:_ _string_
DynamoCheckpointStorageType defines the supported storage backends for checkpoints
_Validation:_
- Enum: [pvc s3 oci]
_Appears in:_
- [DynamoCheckpointStatus](#dynamocheckpointstatus)
#### DynamoComponentDeployment
......@@ -203,6 +362,7 @@ _Appears in:_
| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | |
| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled, replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. | | Optional: \{\} <br /> |
| `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br />Only applicable when ComponentType is "epp". | | Optional: \{\} <br /> |
| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from checkpoint files for faster cold start. | | Optional: \{\} <br /> |
#### DynamoComponentDeploymentSpec
......@@ -242,6 +402,7 @@ _Appears in:_
| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | |
| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled, replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. | | Optional: \{\} <br /> |
| `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br />Only applicable when ComponentType is "epp". | | Optional: \{\} <br /> |
| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from checkpoint files for faster cold start. | | Optional: \{\} <br /> |
#### DynamoGraphDeployment
......@@ -456,6 +617,7 @@ _Appears in:_
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the graph deployment.<br />The slice is merged by type on patch updates. | | |
| `services` _object (keys:string, values:[ServiceReplicaStatus](#servicereplicastatus))_ | Services contains per-service replica status information.<br />The map key is the service name from spec.services. | | Optional: \{\} <br /> |
| `restart` _[RestartStatus](#restartstatus)_ | Restart contains the status of the restart of the graph deployment. | | Optional: \{\} <br /> |
| `checkpoints` _object (keys:string, values:[ServiceCheckpointStatus](#servicecheckpointstatus))_ | Checkpoints contains per-service checkpoint status information.<br />The map key is the service name from spec.services. | | Optional: \{\} <br /> |
#### DynamoModel
......@@ -872,6 +1034,44 @@ _Appears in:_
| `enabled` _boolean_ | Enabled indicates whether the ScalingAdapter should be enabled for this service.<br />When true, a DGDSA is created and owns the replicas field.<br />When false (default), no DGDSA is created and replicas can be modified directly in the DGD. | false | Optional: \{\} <br /> |
#### ServiceCheckpointConfig
ServiceCheckpointConfig configures checkpointing for a DGD service
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled indicates whether checkpointing is enabled for this service | false | Optional: \{\} <br /> |
| `mode` _[CheckpointMode](#checkpointmode)_ | Mode defines how checkpoint creation is handled<br />- Auto: DGD controller creates Checkpoint CR automatically<br />- Manual: User must create Checkpoint CR | Auto | Enum: [Auto Manual] <br />Optional: \{\} <br /> |
| `checkpointRef` _string_ | CheckpointRef references an existing Checkpoint CR to use<br />If specified, Identity is ignored and this checkpoint is used directly | | Optional: \{\} <br /> |
| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the checkpoint identity for hash computation<br />Used when Mode is Auto or when looking up existing checkpoints<br />Required when checkpointRef is not specified | | Optional: \{\} <br /> |
#### ServiceCheckpointStatus
ServiceCheckpointStatus contains checkpoint information for a single service.
_Appears in:_
- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `checkpointName` _string_ | CheckpointName is the name of the associated Checkpoint CR | | Optional: \{\} <br /> |
| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity | | Optional: \{\} <br /> |
| `ready` _boolean_ | Ready indicates if the checkpoint is ready for use | | Optional: \{\} <br /> |
#### ServiceReplicaStatus
......
# ChReK: Checkpoint/Restore in Kubernetes
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. See [Limitations](#limitations) for details.
**ChReK** (Checkpoint/Restore in Kubernetes) is experimental infrastructure for fast-starting GPU applications using CRIU (Checkpoint/Restore In Userspace). ChReK dramatically reduces cold-start times for large models from minutes to seconds by capturing initialized application state and restoring it on demand.
## What is ChReK?
ChReK provides:
- **Fast cold starts**: Restore GPU-accelerated applications in seconds instead of minutes
- **CUDA state preservation**: Checkpoint and restore GPU memory and CUDA contexts
- **Kubernetes-native**: Integrates seamlessly with Kubernetes primitives
- **Storage flexibility**: PVC-based storage (S3/OCI planned for future releases)
- **Namespace isolation**: Each namespace gets its own checkpoint infrastructure
## Use Cases
### 1. With NVIDIA Dynamo Platform (Recommended)
Use ChReK as part of the Dynamo platform for automatic checkpoint management:
- Automatic checkpoint creation and lifecycle management
- Seamless integration with DynamoGraphDeployment CRDs
- Built-in autoscaling with fast restore
📖 **[Read the Dynamo Integration Guide →](dynamo.md)**
### 2. Standalone (Without Dynamo)
Use ChReK independently in your own Kubernetes applications:
- Manual checkpoint job creation
- Build your own restore-enabled container images
- Full control over checkpoint lifecycle
📖 **[Read the Standalone Usage Guide →](standalone.md)**
## Architecture
ChReK consists of two main components:
### 1. ChReK Helm Chart
Deploys the checkpoint/restore infrastructure:
- **DaemonSet**: Runs on GPU nodes to perform CRIU checkpoint operations
- **PVC**: Stores checkpoint data (rootfs diffs, CUDA memory state)
- **RBAC**: Namespace-scoped or cluster-wide permissions
- **Seccomp Profile**: Security policies for CRIU syscalls
### 2. Smart Entrypoint
A wrapper script that intelligently decides between:
- **Cold start**: Normal application startup (when no checkpoint exists)
- **Restore**: CRIU restore from checkpoint (when checkpoint available)
## Quick Start
### Install ChReK Infrastructure
```bash
helm install chrek nvidia/chrek \
--namespace my-team \
--create-namespace \
--set storage.pvc.size=100Gi
```
### Choose Your Integration Path
- **Using Dynamo Platform?** → Follow the [Dynamo Integration Guide](dynamo.md)
- **Using standalone?** → Follow the [Standalone Usage Guide](standalone.md)
## Key Features
### ✅ Currently Supported
- ✅ **vLLM backend only** (SGLang and TensorRT-LLM planned)
- ✅ Single-node, single-GPU checkpoints
- ✅ PVC storage backend (RWX for multi-node)
- ✅ CUDA checkpoint/restore
- ✅ PyTorch distributed state (with `GLOO_SOCKET_IFNAME=lo`)
- ✅ Namespace-scoped and cluster-wide RBAC
- ✅ Idempotent checkpoint creation
- ✅ Automatic signal-based checkpoint coordination
### 🚧 Planned Features
- 🚧 SGLang backend support
- 🚧 TensorRT-LLM backend support
- 🚧 S3/MinIO storage backend
- 🚧 OCI registry storage backend
- 🚧 Multi-GPU checkpoints
- 🚧 Multi-node distributed checkpoints
## Limitations
⚠️ **Important**: ChReK has significant limitations that may impact production readiness:
### Security Considerations
- **🔴 Privileged mode required**: Restore pods **must run in privileged mode** for CRIU to function. This grants containers elevated host access and may violate security policies in many production environments.
- **Security Impact**: Privileged containers can:
- Access all host devices
- Bypass most security restrictions
- Potentially compromise node security if the container is exploited
### Technical Limitations
- **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
- **Single-node only**: Checkpoints must be created and restored on the same node
- **Single-GPU only**: Multi-GPU configurations not yet supported
- **Network state limitations**: Active TCP connections are closed during restore (use `tcp-close` CRIU option)
- **Storage**: Only PVC storage is currently implemented (S3/OCI planned)
### Recommendation
ChReK is best suited for:
- ✅ Development and testing environments
- ✅ Research and experimentation
- ✅ Controlled production environments with appropriate security controls
- ❌ Security-sensitive production workloads without proper risk assessment
## Documentation
### Getting Started
- [Dynamo Integration Guide](dynamo.md) - Using ChReK with Dynamo Platform
- [Standalone Usage Guide](standalone.md) - Using ChReK independently
- [ChReK Helm Chart README](../../../deploy/helm/charts/chrek/README.md) - Helm chart configuration
### Related Documentation
- [CRIU Documentation](https://criu.org/Main_Page) - Upstream CRIU docs
## Prerequisites
- Kubernetes 1.21+
- GPU nodes with NVIDIA runtime (`nvidia` runtime class)
- CRIU support in container runtime (containerd with CRIU plugin)
- RWX storage class (for multi-node deployments)
- **Security clearance for privileged pods** (required for restore operations)
## Troubleshooting
### Common Issues
**DaemonSet not starting?**
- Check GPU node labels: `kubectl get nodes -l nvidia.com/gpu.present=true`
- Verify NVIDIA runtime is available
**Checkpoint fails?**
- Check DaemonSet logs: `kubectl logs -l app.kubernetes.io/name=chrek -n <namespace>`
- Ensure application properly signals readiness
- Verify CRIU is installed in the runtime
**Restore fails?**
- Ensure restore pod uses the same volumes as checkpoint job
- Verify `hostIPC: true` is set (required for CUDA)
- Check for `PSM3_DISABLED=1` and `GLOO_SOCKET_IFNAME=lo` environment variables
For detailed troubleshooting, see:
- [Dynamo Integration Guide - Troubleshooting](dynamo.md#troubleshooting)
- [Standalone Guide - Troubleshooting](standalone.md#troubleshooting)
## Contributing
ChReK is part of the NVIDIA Dynamo project. Contributions are welcome!
## License
Apache License 2.0
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Checkpoint/Restore for Fast Pod Startup
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations. See [Limitations](#limitations) for details.
Reduce cold start times for LLM inference workers from ~3 minutes to ~30 seconds using container checkpointing.
## Overview
Checkpointing captures the complete state of a running worker pod (including GPU memory) and saves it to storage. New pods can restore from this checkpoint instead of performing a full cold start.
| Startup Type | Time | What Happens |
|--------------|------|--------------|
| **Cold Start** | ~3 min | Download model, load to GPU, initialize engine |
| **Warm Start** (checkpoint) | ~30 sec | Restore from checkpoint tar |
## Prerequisites
- Dynamo Platform installed (v0.4.0+)
- ChReK Helm chart installed (separate from platform)
- GPU nodes with CRIU support
- RWX PVC storage (PVC is currently the only supported backend)
## Quick Start
### 1. Install ChReK Infrastructure
First, install the ChReK Helm chart in each namespace where you need checkpointing:
```bash
# Install ChReK infrastructure
helm install chrek nvidia/chrek \
--namespace my-team \
--create-namespace \
--set storage.pvc.size=100Gi
```
This creates:
- A PVC for checkpoint storage (`chrek-pvc`)
- A DaemonSet for CRIU operations (`chrek-agent`)
### 2. Configure Operator Values
Update your Helm values to point to the ChReK infrastructure:
```yaml
# values.yaml
dynamo-operator:
checkpoint:
enabled: true
storage:
type: pvc # Only PVC is currently supported (S3/OCI planned)
pvc:
pvcName: "chrek-pvc" # Must match ChReK chart
basePath: "/checkpoints"
signalHostPath: "/var/lib/chrek/signals" # Must match ChReK chart
```
### 3. Configure Your DGD
Add checkpoint configuration to your service:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: my-llm
spec:
services:
VllmWorker:
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
args:
- python3 -m dynamo.vllm --model meta-llama/Llama-3-8B
resources:
limits:
nvidia.com/gpu: "1"
# Checkpoint configuration
checkpoint:
enabled: true
mode: Auto # Automatically create checkpoint if not found
identity:
model: "meta-llama/Llama-3-8B"
backendFramework: "vllm"
tensorParallelSize: 1
dtype: "bfloat16"
```
### 4. Deploy
```bash
kubectl apply -f my-llm.yaml -n dynamo-system
```
On first deployment:
1. A checkpoint job runs to create the checkpoint
2. Worker pods start with cold start (checkpoint not ready yet)
3. Once checkpoint is ready, new pods (scale-up, restarts) restore from checkpoint
## Storage Backends
### PVC (Currently Supported)
Use when you have RWX storage available (e.g., NFS, EFS, Filestore).
```yaml
checkpoint:
storage:
type: pvc
pvc:
pvcName: "chrek-pvc"
basePath: "/checkpoints"
```
**Requirements:**
- RWX (ReadWriteMany) PVC for multi-node access
- Sufficient storage (checkpoints are ~10-50GB per model)
### S3 / MinIO (Planned - Not Yet Implemented)
> ⚠️ **Note:** S3 storage backend is defined in the API but not yet fully implemented.
Object storage support is planned for a future release. The configuration will look like:
```yaml
checkpoint:
storage:
type: s3 # Not yet supported
s3:
# AWS S3
uri: "s3://my-bucket/checkpoints"
# Or MinIO / custom S3 (use instead of the AWS S3 URI above)
# uri: "s3://minio.example.com/my-bucket/checkpoints"
# Optional: credentials secret
credentialsSecretRef: "s3-creds"
```
### OCI Registry (Planned - Not Yet Implemented)
> ⚠️ **Note:** OCI registry storage backend is defined in the API but not yet fully implemented.
Container registry storage support is planned for a future release. The configuration will look like:
```yaml
checkpoint:
storage:
type: oci # Not yet supported
oci:
uri: "oci://myregistry.io/checkpoints"
credentialsSecretRef: "registry-creds" # Docker config secret
```
## Checkpoint Modes
### Auto Mode (Recommended)
The operator automatically creates a `DynamoCheckpoint` CR if one doesn't exist:
```yaml
checkpoint:
enabled: true
mode: Auto
identity:
model: "meta-llama/Llama-3-8B"
backendFramework: "vllm"
tensorParallelSize: 1
```
### Reference Mode
Reference an existing `DynamoCheckpoint` CR by its 16-character hash using `checkpointRef`:
```yaml
checkpoint:
enabled: true
checkpointRef: "e5962d34ba272638" # 16-char hash of DynamoCheckpoint CR
```
This is useful when:
- You want to **pre-warm checkpoints** before creating DGDs
- You want **explicit control** over which checkpoint to use
**Flow:**
1. Create a `DynamoCheckpoint` CR (see the [DynamoCheckpoint CRD](#dynamocheckpoint-crd) section)
2. Wait for it to become `Ready`
3. Reference it in your DGD using `checkpointRef` with the hash
```bash
# Check checkpoint status (using 16-char hash name)
kubectl get dynamocheckpoint e5962d34ba272638 -n dynamo-system
NAME MODEL BACKEND PHASE HASH AGE
e5962d34ba272638 meta-llama/Llama-3-8B vllm Ready e5962d34ba272638 5m
# Now create DGD referencing it
kubectl apply -f my-dgd.yaml
```
## Checkpoint Identity
Checkpoints are uniquely identified by a **16-character SHA256 hash** (64 bits) of configuration that affects runtime state:
| Field | Required | Affects Hash | Example |
|-------|----------|-------------|---------|
| `model` | ✓ | ✓ | `meta-llama/Llama-3-8B` |
| `backendFramework` | ✓ | ✓ | `vllm`, `sglang`, `trtllm` |
| `dynamoVersion` | | ✓ | `0.9.0`, `1.0.0` |
| `tensorParallelSize` | | ✓ | `1`, `2`, `4`, `8` (default: 1) |
| `pipelineParallelSize` | | ✓ | `1`, `2` (default: 1) |
| `dtype` | | ✓ | `float16`, `bfloat16`, `fp8` |
| `maxModelLen` | | ✓ | `4096`, `8192` |
| `extraParameters` | | ✓ | Custom key-value pairs |
**Not included in hash** (don't invalidate checkpoint):
- `replicas`
- `nodeSelector`, `affinity`, `tolerations`
- `resources` (requests/limits)
- Logging/observability config
**Example with all fields:**
```yaml
checkpoint:
enabled: true
mode: Auto
identity:
model: "meta-llama/Llama-3-8B"
backendFramework: "vllm"
dynamoVersion: "0.9.0"
tensorParallelSize: 1
pipelineParallelSize: 1
dtype: "bfloat16"
maxModelLen: 8192
extraParameters:
enableChunkedPrefill: "true"
quantization: "awq"
```
**Checkpoint Naming:** The `DynamoCheckpoint` CR is automatically named using the 16-character identity hash (e.g., `e5962d34ba272638`).
**Checkpoint Sharing:** Multiple DGDs with the same identity automatically share the same checkpoint.
## DynamoCheckpoint CRD
The `DynamoCheckpoint` (shortname: `dckpt`) is a Kubernetes Custom Resource that manages checkpoint lifecycle.
**When to create a DynamoCheckpoint directly:**
- **Pre-warming:** Create checkpoints before deploying DGDs for instant startup
- **Explicit control:** Manage checkpoint lifecycle independently from DGDs
**Note:** With the new hash-based naming, checkpoint names are automatically generated (16-character hash). The operator handles checkpoint discovery and reuse automatically in `auto` mode.
**Create a checkpoint:**
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoCheckpoint
metadata:
name: e5962d34ba272638 # Use the computed 16-char hash
spec:
identity:
model: "meta-llama/Llama-3-8B"
backendFramework: "vllm"
tensorParallelSize: 1
dtype: "bfloat16"
job:
activeDeadlineSeconds: 3600
podTemplateSpec:
spec:
containers:
- name: main
image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
command: ["python3", "-m", "dynamo.vllm"]
args: ["--model", "meta-llama/Llama-3-8B"]
resources:
limits:
nvidia.com/gpu: "1"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: HF_TOKEN
```
**Note:** You can compute the hash yourself, or use `auto` mode to let the operator create it.
**Check status:**
```bash
# List all checkpoints
kubectl get dynamocheckpoint -n dynamo-system
# Or use shortname
kubectl get dckpt -n dynamo-system
NAME MODEL BACKEND PHASE HASH AGE
e5962d34ba272638 meta-llama/Llama-3-8B vllm Ready e5962d34ba272638 5m
a7b4f89c12de3456 meta-llama/Llama-3-70B vllm Creating a7b4f89c12de3456 2m
```
**Phases:**
| Phase | Description |
|-------|-------------|
| `Pending` | CR created, waiting for job to start |
| `Creating` | Checkpoint job is running |
| `Ready` | Checkpoint available for use |
| `Failed` | Checkpoint creation failed |
**Detailed status:**
```bash
kubectl describe dckpt e5962d34ba272638 -n dynamo-system
```
```yaml
Status:
Phase: Ready
IdentityHash: e5962d34ba272638
Location: /checkpoints/e5962d34ba272638
StorageType: pvc
CreatedAt: 2026-01-29T10:05:00Z
```
**Reference from DGD:**
Once the checkpoint is `Ready`, you can reference it by hash:
```yaml
spec:
services:
VllmWorker:
checkpoint:
enabled: true
checkpointRef: "e5962d34ba272638" # 16-char hash
```
Or use `auto` mode and the operator will find/create it automatically.
## Limitations
⚠️ **Important**: ChReK has significant limitations that impact production readiness:
### Security Considerations
- **🔴 Privileged mode required**: Restore pods **must run in privileged mode** for CRIU to function
- Privileged containers have elevated host access, which may violate security policies in many production environments
- This requirement applies to all worker pods that restore from checkpoints
### Technical Limitations
- **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
- **Single-node only**: Checkpoints must be created and restored on the same node
- **Single-GPU only**: Multi-GPU configurations are not yet supported
- **Network state**: Active TCP connections are closed during restore (handled with `tcp-close` CRIU option)
- **Storage**: Only PVC backend currently implemented (S3/OCI planned)
### Recommendation
ChReK is **experimental/beta** and best suited for:
- ✅ Development and testing environments
- ✅ Research and experimentation
- ✅ Controlled production environments with appropriate security controls
- ❌ Security-sensitive production workloads without proper risk assessment
## Troubleshooting
### Checkpoint Not Creating
1. Check the checkpoint job:
```bash
kubectl get jobs -l nvidia.com/checkpoint-source=true -n dynamo-system
kubectl logs job/checkpoint-<name> -n dynamo-system
```
2. Check the DaemonSet:
```bash
kubectl logs daemonset/chrek-agent -n dynamo-system
```
3. Verify storage access:
```bash
kubectl exec -it <checkpoint-agent-pod> -- ls -la /checkpoints
```
### Restore Failing
1. Check pod logs:
```bash
kubectl logs <worker-pod> -n dynamo-system
```
2. Verify checkpoint file exists:
```bash
# For PVC
kubectl exec -it <any-pod-with-pvc> -- ls -la /checkpoints/
# For S3
aws s3 ls s3://my-bucket/checkpoints/
```
3. Check environment variables:
```bash
kubectl exec <worker-pod> -- env | grep DYN_CHECKPOINT
```
### Cold Start Despite Checkpoint
Pods fall back to cold start if:
- Checkpoint file doesn't exist yet (still being created)
- Checkpoint file is corrupted
- CRIU restore fails
Check logs for "Falling back to cold start" message.
## Best Practices
1. **Use RWX PVCs** for multi-node deployments (currently the only supported backend)
2. **Pre-warm checkpoints** before scaling up
3. **Monitor checkpoint size** - large models create large checkpoints
4. **Clean up old checkpoints** to save storage
## Environment Variables
| Variable | Description |
|----------|-------------|
| `DYN_CHECKPOINT_STORAGE_TYPE` | Backend: `pvc`, `s3`, `oci` |
| `DYN_CHECKPOINT_LOCATION` | Source location (URI) |
| `DYN_CHECKPOINT_PATH` | Local path to tar file |
| `DYN_CHECKPOINT_HASH` | Identity hash (debugging) |
| `DYN_CHECKPOINT_SIGNAL_FILE` | Signal file (creation mode only) |
## Complete Example
Create a checkpoint and use it in a DGD:
```yaml
# 1. Create the DynamoCheckpoint CR
apiVersion: nvidia.com/v1alpha1
kind: DynamoCheckpoint
metadata:
name: e5962d34ba272638 # 16-char hash (computed from identity)
namespace: dynamo-system
spec:
identity:
model: "meta-llama/Meta-Llama-3-8B-Instruct"
backendFramework: "vllm"
tensorParallelSize: 1
dtype: "bfloat16"
job:
activeDeadlineSeconds: 3600
backoffLimit: 3
podTemplateSpec:
spec:
containers:
- name: main
image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
command: ["python3", "-m", "dynamo.vllm"]
args:
- "--model"
- "meta-llama/Meta-Llama-3-8B-Instruct"
- "--tensor-parallel-size"
- "1"
- "--dtype"
- "bfloat16"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: HF_TOKEN
resources:
limits:
nvidia.com/gpu: "1"
restartPolicy: Never
---
# 2. Wait for Ready: kubectl get dckpt e5962d34ba272638 -n dynamo-system -w
---
# 3. Reference the checkpoint in your DGD
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: my-llm
namespace: dynamo-system
spec:
services:
VllmWorker:
replicas: 2
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
resources:
limits:
nvidia.com/gpu: "1"
checkpoint:
enabled: true
checkpointRef: "e5962d34ba272638" # Reference by hash
```
## Related Documentation
- [ChReK Overview](README.md) - ChReK architecture and use cases
- [ChReK Standalone Usage Guide](standalone.md) - Use ChReK without Dynamo Platform
- [ChReK Helm Chart README](../../../deploy/helm/charts/chrek/README.md) - Chart configuration
- [Installation Guide](../installation_guide.md) - Platform installation
- [API Reference](../api_reference.md) - Complete CRD specifications
# ChReK Standalone Usage Guide
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. It requires privileged mode for restore operations, which may not be suitable for all production environments. Review the [security implications](#security-considerations) before deploying.
This guide explains how to use **ChReK** (Checkpoint/Restore in Kubernetes) as a standalone component without deploying the full Dynamo platform. This is useful if you want to add checkpoint/restore capabilities to your own GPU workloads.
## Table of Contents
- [Overview](#overview)
- [Prerequisites](#prerequisites)
- [Step 1: Deploy ChReK](#step-1-deploy-chrek)
- [Step 2: Build Checkpoint-Enabled Images](#step-2-build-checkpoint-enabled-images)
- [Step 3: Create Checkpoint Jobs](#step-3-create-checkpoint-jobs)
- [Step 4: Restore from Checkpoints](#step-4-restore-from-checkpoints)
- [Environment Variables Reference](#environment-variables-reference)
- [Checkpoint Flow Explained](#checkpoint-flow-explained)
- [Troubleshooting](#troubleshooting)
---
## Overview
When using ChReK standalone, you are responsible for:
1. **Deploying the ChReK Helm chart** (DaemonSet + PVC)
2. **Building checkpoint-enabled container images** with the restore entrypoint
3. **Creating checkpoint jobs** with the correct environment variables
4. **Creating restore pods** that detect and use the checkpoints
The ChReK DaemonSet handles the actual CRIU checkpoint/restore operations automatically once your pods are configured correctly.
---
## Prerequisites
- Kubernetes cluster with:
- NVIDIA GPUs with checkpoint support
- **Privileged security context allowed** (⚠️ required for CRIU - see [Security Considerations](#security-considerations))
- PVC storage (ReadWriteMany recommended for multi-node)
- Docker or compatible container runtime for building images
- Access to the ChReK source code: `deploy/chrek/`
### Security Considerations
⚠️ **Important**: ChReK restore operations **require privileged mode**, which has significant security implications:
- **Privileged containers** can access all host devices and bypass most security restrictions
- This may violate security policies in production environments
- Privileged containers, if compromised, can potentially compromise node security
**Recommended for:**
- ✅ Development and testing environments
- ✅ Research and experimentation
- ✅ Controlled production environments with appropriate security controls
**Not recommended for:**
- ❌ Multi-tenant clusters without proper isolation
- ❌ Security-sensitive production workloads without risk assessment
- ❌ Environments with strict security compliance requirements
### Technical Limitations
⚠️ **Current Restrictions:**
- **vLLM backend only**: Currently only the vLLM backend supports checkpoint/restore. SGLang and TensorRT-LLM support is planned.
- **Single-node only**: Checkpoints must be created and restored on the same node
- **Single-GPU only**: Multi-GPU configurations are not yet supported
- **Network state**: Active TCP connections are closed during restore
- **Storage**: Only PVC backend currently implemented (S3/OCI planned)
---
## Step 1: Deploy ChReK
### Install the Helm Chart
```bash
# Clone the repository
git clone https://github.com/ai-dynamo/dynamo.git
cd dynamo
# Install ChReK in your namespace
helm install chrek ./deploy/helm/charts/chrek \
--namespace my-app \
--create-namespace \
--set storage.pvc.size=100Gi \
--set storage.pvc.storageClass=your-storage-class
```
### Verify Installation
```bash
# Check the DaemonSet is running
kubectl get daemonset -n my-app
# NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE
# chrek-agent 3 3 3 3 3
# Check the PVC is bound
kubectl get pvc -n my-app
# NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS
# chrek-pvc Bound pvc-xyz 100Gi RWX your-storage-class
```
---
## Step 2: Build Checkpoint-Enabled Images
ChReK provides a convenient `placeholder` target in its Dockerfile that automatically injects checkpoint/restore capabilities into your existing container images.
### Quick Start: Using the Placeholder Target (Recommended)
```bash
cd deploy/chrek
# Define your images
export BASE_IMAGE="your-app:latest" # Your existing application image
export RESTORE_IMAGE="your-app:checkpoint-enabled" # Output checkpoint-enabled image
# Build using the placeholder target
docker build \
--target placeholder \
--build-arg BASE_IMAGE="$BASE_IMAGE" \
-t "$RESTORE_IMAGE" \
.
# Push to your registry
docker push "$RESTORE_IMAGE"
```
**Example with a Dynamo vLLM image:**
```bash
cd deploy/chrek
export DYNAMO_IMAGE="nvidia/dynamo-vllm:v1.2.0"
export RESTORE_IMAGE="nvidia/dynamo-vllm:v1.2.0-checkpoint"
docker build \
--target placeholder \
--build-arg BASE_IMAGE="$DYNAMO_IMAGE" \
-t "$RESTORE_IMAGE" \
.
```
### What the Placeholder Target Does
The ChReK Dockerfile's `placeholder` stage automatically:
- ✅ Builds the restore-entrypoint binary
- ✅ Injects it into `/usr/local/bin/restore-entrypoint`
- ✅ Adds `smart-entrypoint.sh` to `/usr/local/bin/`
- ✅ Sets executable permissions
- ✅ Configures the entrypoint to detect and restore checkpoints
- ✅ Preserves your original application CMD
### Alternative: Manual Multi-Stage Build
If you need more control, you can create your own Dockerfile:
```dockerfile
# Stage 1: Build restore-entrypoint
FROM golang:1.23-alpine AS restore-builder
WORKDIR /build
COPY deploy/chrek/cmd/restore-entrypoint ./cmd/restore-entrypoint
COPY deploy/chrek/pkg ./pkg
COPY deploy/chrek/go.mod deploy/chrek/go.sum ./
RUN go build -o /restore-entrypoint ./cmd/restore-entrypoint
# Stage 2: Your application image
FROM your-base-image:latest
# Copy restore-entrypoint
COPY --from=restore-builder /restore-entrypoint /usr/local/bin/restore-entrypoint
# Copy smart-entrypoint.sh
COPY deploy/chrek/scripts/smart-entrypoint.sh /usr/local/bin/smart-entrypoint.sh
RUN chmod +x /usr/local/bin/smart-entrypoint.sh /usr/local/bin/restore-entrypoint
# Set smart-entrypoint as the default entrypoint
ENTRYPOINT ["/usr/local/bin/smart-entrypoint.sh"]
# Your application command (becomes CMD, can be overridden)
CMD ["python", "your_app.py"]
```
> **💡 Tip**: Using the `placeholder` target is the recommended approach as it's maintained with the ChReK codebase and ensures compatibility.
---
## Step 3: Create Checkpoint Jobs
A checkpoint job loads your application, waits for the ChReK DaemonSet to checkpoint it, and then exits.
### Required Environment Variables
Your checkpoint job MUST set these environment variables:
| Variable | Description | Example |
|----------|-------------|---------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Path where DaemonSet writes completion signal | `/checkpoint-signal/my-checkpoint.done` |
| `DYN_CHECKPOINT_READY_FILE` | Path where your app signals it's ready | `/tmp/checkpoint-ready` |
| `DYN_CHECKPOINT_HASH` | Unique identifier for this checkpoint | `abc123def456` |
| `DYN_CHECKPOINT_LOCATION` | Directory where checkpoint is stored | `/checkpoints/abc123def456` |
| `DYN_CHECKPOINT_STORAGE_TYPE` | Storage backend type | `pvc` |
### Required Labels
Add this label to enable DaemonSet checkpoint detection:
```yaml
labels:
nvidia.com/checkpoint-source: "true"
```
### Example Checkpoint Job
```yaml
apiVersion: batch/v1
kind: Job
metadata:
name: checkpoint-my-model
namespace: my-app
spec:
template:
metadata:
labels:
nvidia.com/checkpoint-source: "true" # Required for DaemonSet detection
spec:
restartPolicy: Never
# Init container to clean up stale signal files
initContainers:
- name: cleanup-signal-file
image: busybox:latest
command:
- sh
- -c
- |
rm -f /checkpoint-signal/my-checkpoint.done || true
echo "Signal file cleanup complete"
volumeMounts:
- name: checkpoint-signal
mountPath: /checkpoint-signal
containers:
- name: main
image: my-app:checkpoint-enabled
# Security context required for CRIU
securityContext:
privileged: true
capabilities:
add: ["SYS_ADMIN", "SYS_PTRACE", "SYS_CHROOT"]
# Readiness probe: Pod becomes Ready when model is loaded
# This is what triggers the DaemonSet to start checkpointing
readinessProbe:
exec:
command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"]
initialDelaySeconds: 15
periodSeconds: 2
# Remove liveness/startup probes for checkpoint jobs
# Model loading can take several minutes
livenessProbe: null
startupProbe: null
# Checkpoint-related environment variables
env:
- name: DYN_CHECKPOINT_SIGNAL_FILE
value: "/checkpoint-signal/my-checkpoint.done"
- name: DYN_CHECKPOINT_READY_FILE
value: "/tmp/checkpoint-ready"
- name: DYN_CHECKPOINT_HASH
value: "abc123def456"
- name: DYN_CHECKPOINT_LOCATION
value: "/checkpoints/abc123def456"
- name: DYN_CHECKPOINT_STORAGE_TYPE
value: "pvc"
# GPU request
resources:
limits:
nvidia.com/gpu: 1
# Required volume mounts
volumeMounts:
- name: checkpoint-storage
mountPath: /checkpoints
- name: checkpoint-signal
mountPath: /checkpoint-signal
- name: tmp
mountPath: /tmp
volumes:
- name: checkpoint-storage
persistentVolumeClaim:
claimName: chrek-pvc
- name: checkpoint-signal
hostPath:
path: /var/lib/chrek/signals
type: DirectoryOrCreate
- name: tmp
emptyDir: {}
```
### Application Code Requirements
Your application must implement the checkpoint flow. Here's the pattern used by Dynamo vLLM:
```python
import os
import time
def main():
# 1. Check for checkpoint mode
signal_file = os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE")
ready_file = os.environ.get("DYN_CHECKPOINT_READY_FILE")
restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored")
is_checkpoint_mode = signal_file is not None
if is_checkpoint_mode:
print("Checkpoint mode detected")
# 2. Load your model/application
model = load_model()
# 3. Optional: Put model to sleep to reduce memory footprint
# model.sleep()
# 4. Write ready file (read by the pod's readiness probe; the pod turning Ready is what triggers the DaemonSet checkpoint)
if ready_file:
with open(ready_file, "w") as f:
f.write("ready")
print(f"Wrote checkpoint ready file: {ready_file}")
# 5. Log readiness messages (helps debugging)
print("CHECKPOINT_READY: Model loaded, ready for container checkpoint")
print(f"CHECKPOINT_READY: Waiting for signal file: {signal_file}")
print(f"CHECKPOINT_READY: Or restore marker file: {restore_marker}")
# 6. Wait for checkpoint completion OR restore detection
while True:
# Check if we've been restored (marker file created by restore entrypoint)
if os.path.exists(restore_marker):
print(f"Detected restore from checkpoint (marker: {restore_marker})")
# Continue with normal application flow
break
# Check if checkpoint is complete (signal file created by DaemonSet)
if os.path.exists(signal_file):
print(f"Checkpoint signal file detected: {signal_file}")
print("Checkpoint complete, exiting")
return # Exit gracefully
time.sleep(1)
# Normal application flow (or post-restore flow)
run_application()
```
**Important Notes:**
1. **Ready File & Readiness Probe**: The checkpoint job must have a readiness probe that checks for the ready file:
```yaml
readinessProbe:
exec:
command: ["sh", "-c", "cat ${DYN_CHECKPOINT_READY_FILE}"]
initialDelaySeconds: 15
periodSeconds: 2
```
The ChReK DaemonSet triggers checkpointing when:
- Pod has `nvidia.com/checkpoint-source: "true"` label
- Pod status is `Ready` (readiness probe passes = ready file exists)
2. **Restore Marker**: Created by `restore-entrypoint` before the CRIU restore; it lets the restored process detect that it was restored from a checkpoint
3. **Two Ways Out of the Wait Loop**:
- **Signal file found**: Checkpoint complete, exit gracefully
- **Restore marker found**: Process was restored, continue running
---
## Step 4: Restore from Checkpoints
Restore pods automatically detect and restore from checkpoints if they exist.
### Example Restore Pod
```yaml
apiVersion: v1
kind: Pod
metadata:
name: my-app-restored
namespace: my-app
spec:
restartPolicy: Never
containers:
- name: main
image: my-app:checkpoint-enabled
# Security context required for CRIU restore
securityContext:
privileged: true
capabilities:
add: ["SYS_ADMIN", "SYS_PTRACE", "SYS_CHROOT"]
# Set checkpoint environment variables
env:
- name: DYN_CHECKPOINT_HASH
value: "abc123def456" # Must match checkpoint job
- name: DYN_CHECKPOINT_PATH
value: "/checkpoints" # Base path (hash appended automatically)
# Optional: Customize restore marker file path
# - name: DYN_RESTORE_MARKER_FILE
# value: "/tmp/dynamo-restored"
# GPU request
resources:
limits:
nvidia.com/gpu: 1
# Mount checkpoint storage (READ-ONLY is fine for restore)
volumeMounts:
- name: checkpoint-storage
mountPath: /checkpoints
readOnly: true
- name: checkpoint-signal
mountPath: /checkpoint-signal
volumes:
- name: checkpoint-storage
persistentVolumeClaim:
claimName: chrek-pvc
- name: checkpoint-signal
hostPath:
path: /var/lib/chrek/signals
type: DirectoryOrCreate
```
### How Restore Works
1. **Smart Entrypoint Detects Checkpoint**: The `smart-entrypoint.sh` script checks whether a checkpoint exists under `${DYN_CHECKPOINT_PATH}/${DYN_CHECKPOINT_HASH}/` (`/checkpoints/abc123def456/` in the example above)
2. **Calls Restore Entrypoint**: If found, calls `/usr/local/bin/restore-entrypoint` which invokes CRIU
3. **CRIU Restores Process**: The entire process tree is restored from the checkpoint, including GPU state
4. **Application Continues**: Your application resumes exactly where it was checkpointed
---
## Environment Variables Reference
### Checkpoint Jobs
| Variable | Required | Description |
|----------|----------|-------------|
| `DYN_CHECKPOINT_SIGNAL_FILE` | Yes | Full path to signal file (e.g., `/checkpoint-signal/my-checkpoint.done`) |
| `DYN_CHECKPOINT_READY_FILE` | Yes | Full path where app signals readiness (e.g., `/tmp/checkpoint-ready`) |
| `DYN_CHECKPOINT_HASH` | Yes | Unique checkpoint identifier (alphanumeric string) |
| `DYN_CHECKPOINT_LOCATION` | Yes | Directory where checkpoint is stored (e.g., `/checkpoints/abc123`) |
| `DYN_CHECKPOINT_STORAGE_TYPE` | Yes | Storage backend type. Only `pvc` is currently implemented; `s3` and `oci` are planned |
### Restore Pods
| Variable | Required | Description |
|----------|----------|-------------|
| `DYN_CHECKPOINT_HASH` | Yes | Checkpoint identifier (must match checkpoint job) |
| `DYN_CHECKPOINT_PATH` | Yes | Base checkpoint directory (hash appended automatically) |
| `DYN_RESTORE_MARKER_FILE` | No | Path for restore marker file (default: `/tmp/dynamo-restored`) |
### Optional CRIU Tuning (Advanced)
| Variable | Default | Description |
|----------|---------|-------------|
| `CRIU_TIMEOUT` | `0` (unlimited) | CRIU operation timeout in seconds |
| `CRIU_LOG_LEVEL` | `4` | CRIU log verbosity (0-4) |
| `CRIU_WORK_DIR` | `/tmp` | CRIU working directory |
| `CUDA_PLUGIN_DIR` | `/usr/local/lib/criu` | Path to CRIU CUDA plugin |
| `CRIU_SKIP_IN_FLIGHT` | `false` | Skip in-flight TCP connections |
| `CRIU_AUTO_DEDUP` | `false` | Enable auto-deduplication |
| `CRIU_LAZY_PAGES` | `false` | Enable lazy page migration (experimental) |
| `WAIT_FOR_CHECKPOINT` | `false` | Wait for checkpoint to appear before starting |
| `RESTORE_WAIT_TIMEOUT` | `300` | Max seconds to wait for checkpoint |
| `DEBUG` | `false` | Enable debug mode (sleeps 300s on error) |
---
## Checkpoint Flow Explained
### 1. Checkpoint Creation Flow
```
┌─────────────────────────────────────────────────────────────┐
│ 1. Pod starts with nvidia.com/checkpoint-source=true label │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 2. Application loads model and creates ready file │
│ /tmp/checkpoint-ready │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 3. Pod becomes Ready (kubelet readiness probe passes) │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 4. ChReK DaemonSet detects: │
│ - Pod is Ready │
│ - Has checkpoint-source label │
│ - Ready file exists: /tmp/checkpoint-ready │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 5. DaemonSet executes CRIU checkpoint via runc: │
│ - Freezes container process │
│ - Dumps memory (CPU + GPU) │
│ - Saves to /checkpoints/${HASH}/ │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 6. DaemonSet writes signal file: │
│ /checkpoint-signal/${HASH}.done │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 7. Application detects signal file and exits gracefully │
└─────────────────────────────────────────────────────────────┘
```
### 2. Restore Flow
```
┌─────────────────────────────────────────────────────────────┐
│ 1. Pod starts with DYN_CHECKPOINT_HASH set │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 2. smart-entrypoint.sh checks for checkpoint: │
│ /checkpoints/${DYN_CHECKPOINT_HASH}/checkpoint.done │
└──────────────────────┬──────────────────────────────────────┘
├─ Not Found ─────────────────┐
│ │
▼ ▼
┌───────────────────────┐ ┌──────────────────────┐
│ Checkpoint exists │ │ Cold start │
└──────────┬────────────┘ │ Run original CMD │
│ └──────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 3. Call restore-entrypoint with checkpoint path │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 4. restore-entrypoint extracts checkpoint and calls CRIU: │
│ criu restore --images-dir /checkpoints/${HASH}/images │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 5. CRIU restores process from checkpoint │
│ - Restores memory (CPU + GPU) │
│ - Restores file descriptors │
│ - Resumes process execution │
└──────────────────────┬──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ 6. Application continues from checkpointed state │
│ (Model already loaded, GPU memory initialized) │
└─────────────────────────────────────────────────────────────┘
```
---
## Troubleshooting
### Checkpoint Not Created
**Symptom**: Job runs but no checkpoint appears in `/checkpoints/`
**Checks**:
1. Verify the pod has the label:
```bash
kubectl get pod <pod-name> -o jsonpath='{.metadata.labels.nvidia\.com/checkpoint-source}'
```
2. Check pod readiness:
```bash
kubectl get pod <pod-name> -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
```
3. Check ready file was created:
```bash
kubectl exec <pod-name> -- ls -la /tmp/checkpoint-ready
```
4. Check DaemonSet logs:
```bash
kubectl logs -n my-app daemonset/chrek-agent --all-containers
```
### Restore Fails
**Symptom**: Pod fails to restore from checkpoint
**Checks**:
1. Verify checkpoint files exist:
```bash
kubectl exec <pod-name> -- ls -la /checkpoints/${DYN_CHECKPOINT_HASH}/
```
2. Check privileged mode is enabled:
```bash
kubectl get pod <pod-name> -o jsonpath='{.spec.containers[0].securityContext.privileged}'
```
3. Check CRIU logs in `/tmp/criu-restore.log`:
```bash
kubectl exec <pod-name> -- cat /tmp/criu-restore.log
```
4. Ensure the checkpoint job and the restore pod use the same:
- Container image
- GPU count
- Volume mounts
- Environment variables (except POD_NAME, POD_IP, etc.)
### Permission Denied Errors
**Symptom**: `CRIU: Permission denied` or `Operation not permitted`
**Solution**: Ensure the container spec (under `spec.containers[]`) includes:
```yaml
securityContext:
privileged: true
capabilities:
add:
- SYS_ADMIN
- SYS_PTRACE
- SYS_CHROOT
```
### Signal File Not Appearing
**Symptom**: Application waits forever for signal file
**Checks**:
1. Verify hostPath mount is correct:
```bash
kubectl get pod <pod-name> -o jsonpath='{.spec.volumes[?(@.name=="checkpoint-signal")]}'
```
2. Check DaemonSet has access to the same path:
```bash
kubectl get daemonset -n my-app chrek-agent -o jsonpath='{.spec.template.spec.volumes[?(@.name=="signal-dir")]}'
```
3. Verify paths match exactly:
- Pod: `/var/lib/chrek/signals`
- DaemonSet: `/var/lib/chrek/signals`
---
## Additional Resources
- [ChReK Helm Chart Values](../../deploy/helm/charts/chrek/values.yaml)
- [Smart Entrypoint Script](../../deploy/chrek/scripts/smart-entrypoint.sh)
- [CRIU Documentation](https://criu.org/Main_Page)
- [CUDA Checkpoint Plugin](https://docs.nvidia.com/cuda/cuda-checkpoint-plugin/)
---
## Getting Help
If you encounter issues:
1. Check the [Troubleshooting](#troubleshooting) section
2. Review DaemonSet logs: `kubectl logs -n <namespace> daemonset/chrek-agent`
3. Open an issue on [GitHub](https://github.com/ai-dynamo/dynamo/issues)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment