# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Dynamo Snapshot - Checkpoint/Restore Infrastructure
# This chart deploys the checkpoint storage (PVC) and CRIU agent (DaemonSet)
# in a single namespace. Install this chart in each namespace where you want
# to enable checkpoint/restore functionality for DynamoGraphDeployments.

# Storage configuration for checkpoints.
# `type` selects the backend; each backend has its own sub-section below
# (pvc, s3, oci), annotated with the type it applies to.
storage:
  # Storage type: pvc (default), s3, or oci
  type: pvc

  # PVC configuration (when type=pvc)
  pvc:
    # Create a new PVC (set to false if using existing PVC)
    create: true
    # PVC name - must match operator configuration
    name: snapshot-pvc
    # PVC size
    size: 1Ti
    # Storage class (leave empty for default)
    storageClass: ""
    # Access mode - ReadWriteMany required for multi-pod access
    accessMode: ReadWriteMany
    # Base path for checkpoints (mounted in pods)
    basePath: /checkpoints

  # S3 configuration (when type=s3)
  s3:
    # S3 URI (e.g., s3://my-bucket/checkpoints).
    # Credentials are not configured here; they are expected via IRSA or
    # mounted secrets.
    uri: ""

  # OCI configuration (when type=oci)
  oci:
    # OCI URI (e.g., oci://registry.io/repo/checkpoints)
    uri: ""

# DaemonSet configuration for snapshot (checkpoint/restore) agent
daemonset:
  # Container image
  image:
    repository: nvcr.io/nvidia/ai-dynamo/snapshot-agent
    # Quoted so the tag is always read as a string, even if it is later
    # changed to a number-looking value such as 1.0.
    tag: "1.0.0"
    pullPolicy: Always

  # Snapshot agent and nsrestore log level (trace, debug, info, warn, error)
  snapshotLogLevel: info

  # Image pull secrets
  imagePullSecrets:
    - name: ngc-secret

  # Resource limits and requests
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 1Gi

  # Node selector - target GPU nodes
  nodeSelector:
    nvidia.com/gpu.present: "true"

  # Tolerations for GPU nodes
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    - key: dedicated
      operator: Exists
      effect: NoSchedule

  # Pod labels
  podLabels: {}

  # Pod annotations
  podAnnotations: {}

  # Affinity rules
  affinity: {}

# Seccomp profile configuration
seccomp:
  # When true, the chart deploys a seccomp profile that blocks io_uring.
  # Blocking io_uring is required for CRIU.
  deploy: true

# Service account configuration
serviceAccount:
  # Whether the chart creates the service account
  create: true
  # Service account name; a name is generated when this is left empty
  name: ""
  # Annotations to add to the service account (e.g., for IRSA)
  annotations: {}

# RBAC configuration
rbac:
  # Whether the chart creates RBAC resources
  create: true

  # Namespace-scoped RBAC (recommended, required for PVC storage)
  # - true (default): Creates Role/RoleBinding; the agent watches pods in the
  #   chart's namespace only
  # - false: Creates ClusterRole/ClusterRoleBinding; the agent watches all pods
  #   on its assigned nodes
  # Note: PVC storage requires namespace-scoped mode (true) because PVCs are
  # namespace-scoped.
  namespaceRestricted: true

# Static configuration (loaded from ConfigMap)
# Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) come from environment variables
config:
  overlay:
    # Virtual FS dirs are COW artifacts in the overlay upperdir.
    systemDirs:
      - /proc
      - /sys
      - /dev
    # Cache directories to exclude (reduces checkpoint size)
    cacheDirs:
      - /.cache/huggingface
    # Python bytecode is already loaded in memory at restore time and
    # regenerated automatically on cold start.
    additionalExclusions:
      - "*/__pycache__"
      - "*.pyc"

  restore:
    # Path to the nsrestore binary in the placeholder image
    nsRestorePath: /usr/local/bin/nsrestore
    # Maximum seconds to wait for a restored pod to become Ready (0 = no timeout)
    restoreReadyTimeoutSeconds: 0

  criu:
    # Path to the criu binary
    binaryPath: /usr/local/sbin/criu
    # Ghost file size limit in bytes (536870912 = 512 MiB). Deleted-but-open
    # files smaller than this are saved inline in the checkpoint image as
    # ghost files.
    ghostLimit: 536870912
    # CRIU logging verbosity (0-4)
    logLevel: 4
    # CRIU work directory for temporary files
    workDir: /var/criu-work

    # K8s-specific options (recommended defaults for containers)
    leaveRunning: true      # Keep process running after checkpoint
    shellJob: true          # Containers are often session leaders
    tcpClose: true          # Pod IPs change on restore/migration
    fileLocks: true         # Applications use file locks
    orphanPtsMaster: true   # Containers with TTYs
    extUnixSk: true         # External Unix sockets
    linkRemap: true         # Required for deleted-but-open files (e.g. POSIX semaphores in /dev/shm)
    extMasters: true        # External bind mount masters
    manageCgroupsMode: soft # CRIU cgroup management mode (ignore/soft/full/strict)

    # Restore-specific options (only apply during CRIU restore, not dump)
    rstSibling: true         # Restore as sibling process (required for go-criu swrk mode)
    mntnsCompatMode: false   # Mount namespace compatibility mode
    evasiveDevices: true     # Use any device path if original is inaccessible
    forceIrmap: true         # Force resolving inotify/fsnotify watch names

    # Advanced options
    autoDedup: false        # Auto-deduplication of memory pages
    lazyPages: false        # Lazy page migration (experimental)

    # Config file options (NOT available via RPC - written to criu.conf)
    libDir: ""                     # Keep empty: external CUDA checkpoint/restore only (no CRIU CUDA plugin)
    allowUprobes: true             # Leave enabled for kernel/userspace probe compatibility
    skipInFlight: true             # Skip in-flight TCP connections