# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Chrek - Checkpoint/Restore Infrastructure
# This chart deploys the checkpoint storage (PVC) and CRIU agent (DaemonSet)
# in a single namespace. Install this chart in each namespace where you want
# to enable checkpoint/restore functionality for DynamoGraphDeployments.

# Storage configuration for checkpoints
storage:
  # Storage type: pvc (default), s3, or oci
  type: pvc

  # PVC configuration (when type=pvc)
  pvc:
    # Create a new PVC (set to false if using existing PVC)
    create: true
    # PVC name - must match operator configuration
    name: chrek-pvc
    # PVC size
    size: 100Gi
    # Storage class (leave empty for default)
    storageClass: ""
    # Access mode - ReadWriteMany required for multi-pod access
    accessMode: ReadWriteMany
    # Base path for checkpoints (mounted in pods)
    basePath: /checkpoints

  # S3 configuration (when type=s3)
  s3:
    # S3 URI (e.g., s3://my-bucket/checkpoints)
    # Credentials are expected via IRSA or mounted secrets
    uri: ""

  # OCI configuration (when type=oci)
  oci:
    # OCI URI (e.g., oci://registry.io/repo/checkpoints)
    uri: ""

# DaemonSet configuration for chrek (checkpoint/restore) agent
daemonset:
  # Container image
  image:
    repository: nvcr.io/nvidia/ai-dynamo/chrek-agent
    # Quoted so that a version-like tag is always read as a string
    tag: "1.0.0"
    pullPolicy: Always

  # Image pull secrets
  imagePullSecrets:
    - name: ngc-secret

  # Resource limits and requests
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 1Gi

  # Node selector - target GPU nodes
  nodeSelector:
    nvidia.com/gpu.present: "true"

  # Tolerations for GPU nodes
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    - key: dedicated
      operator: Exists
      effect: NoSchedule

  # Pod labels
  podLabels: {}

  # Pod annotations
  podAnnotations: {}

  # Affinity rules
  affinity: {}

# Seccomp profile configuration
seccomp:
  # Deploy seccomp profile for blocking io_uring (required for CRIU)
  deploy: true

# Service account configuration
serviceAccount:
  # Create service account
  create: true
  # Service account name (generated if not set)
  name: ""
  # Annotations for service account (e.g., for IRSA)
  annotations: {}

# RBAC configuration
rbac:
  # Create RBAC resources
  create: true
  # Namespace-scoped RBAC (recommended, required for PVC storage)
  # - true (default): Creates Role/RoleBinding, agent watches pods in chart's namespace only
  # - false: Creates ClusterRole/ClusterRoleBinding, agent watches all pods on assigned nodes
  # Note: PVC storage requires namespace-scoped mode (true) as PVCs are namespace-scoped
  namespaceRestricted: true

# Static configuration (loaded from ConfigMap)
# Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) come from environment variables
config:
  overlay:
    # Virtual FS dirs are COW artifacts in the overlay upperdir.
    systemDirs:
      - /proc
      - /sys
      - /dev
    # Cache directories to exclude (reduces checkpoint size)
    cacheDirs:
      - /.cache/huggingface
    # Python bytecode is already loaded in memory at restore time and
    # regenerated automatically on cold start.
    additionalExclusions:
      - "*/__pycache__"
      - "*.pyc"

  restore:
    # Path to the nsrestore binary in the placeholder image
    nsRestorePath: /usr/local/bin/nsrestore
    # Maximum seconds to wait for a restored pod to become Ready (0 = no timeout)
    restoreReadyTimeoutSeconds: 0

  criu:
    # Path to the criu binary
    binaryPath: /usr/local/sbin/criu

    # Ghost file size limit in bytes. Deleted-but-open files smaller than this
    # are saved inline in the checkpoint image as ghost files.
    ghostLimit: 536870912

    # CRIU logging verbosity (0-4)
    logLevel: 4

    # CRIU work directory for temporary files
    workDir: /var/criu-work

    # K8s-specific options (recommended defaults for containers)
    leaveRunning: true  # Keep process running after checkpoint
    shellJob: true  # Containers are often session leaders
    tcpClose: true  # Pod IPs change on restore/migration
    fileLocks: true  # Applications use file locks
    orphanPtsMaster: true  # Containers with TTYs
    extUnixSk: true  # External Unix sockets
    linkRemap: true  # Required for deleted-but-open files (e.g. POSIX semaphores in /dev/shm)
    extMasters: true  # External bind mount masters
    manageCgroupsMode: soft  # CRIU cgroup management mode (ignore/soft/full/strict)

    # Restore-specific options (only apply during CRIU restore, not dump)
    rstSibling: true  # Restore as sibling process (required for go-criu swrk mode)
    mntnsCompatMode: false  # Mount namespace compatibility mode
    evasiveDevices: true  # Use any device path if original is inaccessible
    forceIrmap: true  # Force resolving inotify/fsnotify watch names

    # Advanced options
    autoDedup: false  # Auto-deduplication of memory pages
    lazyPages: false  # Lazy page migration (experimental)

    # Config file options (NOT available via RPC - written to criu.conf)
    libDir: ""  # Keep empty: external CUDA checkpoint/restore only (no CRIU CUDA plugin)
    allowUprobes: true  # Leave enabled for kernel/userspace probe compatibility
    skipInFlight: true  # Skip in-flight TCP connections