# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Chrek - Checkpoint/Restore Infrastructure
# This chart deploys the checkpoint storage (PVC) and CRIU agent (DaemonSet)
# in a single namespace. Install this chart in each namespace where you want
# to enable checkpoint/restore functionality for DynamoGraphDeployments.

# Where checkpoint data is stored
storage:
  # Backend selector, one of: pvc (default), s3, oci
  type: 'pvc'

  # Settings that apply when type=pvc
  pvc:
    # true creates a new PVC; set to false to use an existing PVC
    create: true
    # Name of the PVC - must match the operator configuration
    name: 'chrek-pvc'
    # Size of the PVC
    size: '100Gi'
    # Storage class; leave empty for the default
    storageClass: ''
    # Access mode - ReadWriteMany is required for multi-pod access
    accessMode: 'ReadWriteMany'
    # Base path for checkpoints (this is where they are mounted in pods)
    basePath: '/checkpoints'

  # Settings that apply when type=s3
  s3:
    # Credentials are not configured here; they are expected via IRSA or mounted secrets
    # S3 URI, e.g. s3://my-bucket/checkpoints
    uri: ''

  # Settings that apply when type=oci
  oci:
    # OCI URI, e.g. oci://registry.io/repo/checkpoints
    uri: ''

# Settings for the chrek (checkpoint/restore) agent DaemonSet
daemonset:
  # Agent container image
  image:
    repository: 'nvcr.io/nvidian/dynamo-dev/chrek-agent'
    tag: 'latest'
    pullPolicy: 'Always'

  # Secrets used when pulling the image
  imagePullSecrets:
    - name: 'ngc-secret'

  # Resource requests and limits
  resources:
    requests:
      cpu: '500m'
      memory: '1Gi'
    limits:
      cpu: 2
      memory: '4Gi'

  # Node selector - targets GPU nodes
  nodeSelector:
    nvidia.com/gpu.present: 'true'

  # Tolerations for GPU nodes (NoSchedule taints keyed nvidia.com/gpu and dedicated)
  tolerations:
    - key: 'nvidia.com/gpu'
      operator: 'Exists'
      effect: 'NoSchedule'
    - key: 'dedicated'
      operator: 'Exists'
      effect: 'NoSchedule'

  # Runtime class name, for GPU access
  runtimeClassName: 'nvidia'

  # Extra labels for the pods
  podLabels: {}

  # Extra annotations for the pods
  podAnnotations: {}

  # Pod affinity rules
  affinity: {}

# Seccomp profile configuration
seccomp:
  # When true, deploy the seccomp profile that blocks io_uring.
  # Blocking io_uring is required for CRIU.
  deploy: true

# Settings for the service account
serviceAccount:
  # Whether to create the service account
  create: true
  # Name of the service account; a name is generated when this is left unset
  name: ''
  # Annotations to put on the service account (e.g., for IRSA)
  annotations: {}

# RBAC configuration
rbac:
  # Create RBAC resources
  create: true

  # Scope of the RBAC resources. Namespace-scoped is recommended, and required for PVC storage.
  # - true (default): creates a Role/RoleBinding; the agent watches pods in the chart's namespace only
  # - false: creates a ClusterRole/ClusterRoleBinding; the agent watches all pods on its assigned nodes
  # Note: PVC storage requires namespace-scoped mode (true) because PVCs are namespace-scoped
  namespaceRestricted: true

# Static configuration (loaded from ConfigMap)
# Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) still come from environment variables
config:
  agent:
    # Checkpoint trigger mechanism: "http" or "watcher"
    signalSource: 'watcher'
    # Listen address of the HTTP server used for health checks and the API
    listenAddr: ':8080'

  checkpoint:
    criu:
      # Limit on ghost file size, in bytes (512MB is recommended for GPU workloads)
      ghostLimit: 536870912
      # Timeout for CRIU, in seconds (6 hours, to accommodate large GPU checkpoints)
      timeout: 21600
      # Verbosity of CRIU logging (0-4)
      logLevel: 4
      # Work directory CRIU uses for temporary files
      workDir: '/var/criu-work'

      # K8s-specific options (recommended defaults for containers)
      # Keep process running after checkpoint
      leaveRunning: true
      # Containers are often session leaders
      shellJob: true
      # Pod IPs change on restore/migration
      tcpClose: true
      # Applications use file locks
      fileLocks: true
      # Containers with TTYs
      orphanPtsMaster: true
      # External Unix sockets
      extUnixSk: true
      # Handle deleted-but-open files
      linkRemap: true
      # External bind mount masters
      extMasters: true
      # Cgroup handling mode (ignore/soft/full/strict); "ignore" lets K8s manage cgroups
      manageCgroupsMode: 'ignore'

      # Advanced options
      # Auto-deduplication of memory pages
      autoDedup: false
      # Lazy page migration (experimental)
      lazyPages: false

      # Config file options (NOT available via RPC - written to criu.conf)
      # Plugin directory (required for GPU checkpoints)
      libDir: '/usr/local/lib/criu'
      # Required for CUDA
      allowUprobes: true
      # Skip in-flight TCP connections
      skipInFlight: true

    rootfsExclusions:
      # System directories left out of the rootfs diff capture.
      # They are injected by the NVIDIA GPU Operator and cause conflicts during restore.
      systemDirs:
        - './usr'
        - './etc'
        - './opt'
        - './var'
        - './run'
      # Cache directories left out of the capture (reduces checkpoint size)
      cacheDirs:
        - './.cache/huggingface'
      # Further application-specific exclusions
      additionalExclusions: []

  # NOTE: Restore configuration is NOT part of this ConfigMap.
  # Placeholder containers do not mount it; restore defaults are hardcoded in Go.
  # CRIU options for restore are taken from the saved checkpoint manifest (manifest.yaml).