# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Chrek - Checkpoint/Restore Infrastructure
# This chart deploys the checkpoint storage (PVC) and CRIU agent (DaemonSet)
# in a single namespace. Install this chart in each namespace where you want
# to enable checkpoint/restore functionality for DynamoGraphDeployments.

# Storage configuration for checkpoints
storage:
  # Storage type: pvc (default), s3, or oci
  type: pvc

  # PVC configuration (when type=pvc)
  pvc:
    # Create a new PVC (set to false if using existing PVC)
    create: true
    # PVC name - must match operator configuration
    name: chrek-pvc
    # PVC size
    size: 100Gi
    # Storage class (leave empty for default)
    storageClass: ""
    # Access mode - ReadWriteMany required for multi-pod access
    accessMode: ReadWriteMany
    # Base path for checkpoints (mounted in pods)
    basePath: /checkpoints

  # S3 configuration (when type=s3)
  s3:
    # S3 URI (e.g., s3://my-bucket/checkpoints)
    uri: ""
    # Credentials are expected via IRSA or mounted secrets

  # OCI configuration (when type=oci)
  oci:
    # OCI URI (e.g., oci://registry.io/repo/checkpoints)
    uri: ""

# DaemonSet configuration for chrek (checkpoint/restore) agent
daemonset:
  # Container image
  image:
    repository: nvcr.io/nvidian/dynamo-dev/chrek-agent
    tag: latest
    pullPolicy: Always

  # Image pull secrets
  imagePullSecrets:
    - name: ngc-secret

  # Resource limits and requests
  resources:
    limits:
      cpu: 2
      memory: 4Gi
    requests:
      cpu: 500m
      memory: 1Gi

  # Node selector - target GPU nodes
  nodeSelector:
    nvidia.com/gpu.present: "true"

  # Tolerations for GPU nodes
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    - key: dedicated
      operator: Exists
      effect: NoSchedule

  # Runtime class name for GPU access
  runtimeClassName: nvidia

  # Pod labels
  podLabels: {}

  # Pod annotations
  podAnnotations: {}

  # Affinity rules
  affinity: {}

# Seccomp profile configuration
seccomp:
  # Deploy seccomp profile for blocking io_uring (required for CRIU)
  deploy: true

# Service account configuration
serviceAccount:
  # Create service account
  create: true
  # Service account name (generated if not set)
  name: ""
  # Annotations for service account (e.g., for IRSA)
  annotations: {}

# RBAC configuration
rbac:
  # Create RBAC resources
  create: true
  # Namespace-scoped RBAC (recommended, required for PVC storage)
  # - true (default): Creates Role/RoleBinding, agent watches pods in chart's namespace only
  # - false: Creates ClusterRole/ClusterRoleBinding, agent watches all pods on assigned nodes
  # Note: PVC storage requires namespace-scoped mode (true) as PVCs are namespace-scoped
  namespaceRestricted: true

# Static configuration (loaded from ConfigMap)
# Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) still come from
# environment variables
config:
  agent:
    # How checkpoints are triggered: "http" or "watcher"
    signalSource: "watcher"
    # HTTP server address for health checks and API
    listenAddr: ":8080"

  checkpoint:
    criu:
      # Ghost file size limit in bytes (512MB recommended for GPU workloads)
      ghostLimit: 536870912
      # CRIU timeout in seconds (6 hours for large GPU checkpoints)
      timeout: 21600
      # CRIU logging verbosity (0-4)
      logLevel: 4
      # CRIU work directory for temporary files
      workDir: "/var/criu-work"

      # K8s-specific options (recommended defaults for containers)
      leaveRunning: true  # Keep process running after checkpoint
      shellJob: true  # Containers are often session leaders
      tcpClose: true  # Pod IPs change on restore/migration
      fileLocks: true  # Applications use file locks
      orphanPtsMaster: true  # Containers with TTYs
      extUnixSk: true  # External Unix sockets
      linkRemap: true  # Handle deleted-but-open files
      extMasters: true  # External bind mount masters
      manageCgroupsMode: "ignore"  # Let K8s manage cgroups (ignore/soft/full/strict)

      # Advanced options
      autoDedup: false  # Auto-deduplication of memory pages
      lazyPages: false  # Lazy page migration (experimental)

      # Config file options (NOT available via RPC - written to criu.conf)
      libDir: "/usr/local/lib/criu"  # Plugin directory (required for GPU checkpoints)
      allowUprobes: true  # Required for CUDA
      skipInFlight: true  # Skip in-flight TCP connections

    rootfsExclusions:
      # System directories excluded from rootfs diff capture
      # These are injected by NVIDIA GPU Operator and cause conflicts during restore
      systemDirs:
        - "./usr"
        - "./etc"
        - "./opt"
        - "./var"
        - "./run"
      # Cache directories to exclude (reduces checkpoint size)
      cacheDirs:
        - "./.cache/huggingface"
      # Additional custom exclusions (application-specific)
      additionalExclusions: []

# NOTE: Restore configuration is NOT in this ConfigMap.
# Placeholder containers do not mount it. Restore defaults are hardcoded in Go.
# CRIU options for restore come from the saved checkpoint manifest (manifest.yaml).