# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Dynamo Snapshot - Checkpoint/Restore Infrastructure
#
# This chart deploys the checkpoint storage (PVC) and CRIU agent (DaemonSet)
# in a single namespace. Install this chart in each namespace where you want
# to enable checkpoint/restore functionality for DynamoGraphDeployments.

# Storage configuration for checkpoints
storage:
  # Storage type: pvc (default), s3, or oci
  type: pvc

  # PVC configuration (when type=pvc)
  pvc:
    # Create a new PVC (set to false if using an existing PVC)
    create: true
    # PVC name - must match operator configuration
    name: snapshot-pvc
    # PVC size
    size: 1Ti
    # Storage class (leave empty for default)
    storageClass: ""
    # Access mode - ReadWriteMany required for multi-pod access
    accessMode: ReadWriteMany
    # PVC mount path inside the snapshot-agent pod.
    # This must match the operator checkpoint.storage.pvc.basePath setting.
    basePath: /checkpoints

  # S3 configuration (when type=s3)
  s3:
    # S3 URI (e.g., s3://my-bucket/checkpoints)
    uri: ""
    # Credentials are expected via IRSA or mounted secrets

  # OCI configuration (when type=oci)
  oci:
    # OCI URI (e.g., oci://registry.io/repo/checkpoints)
    uri: ""

# DaemonSet configuration for snapshot (checkpoint/restore) agent
daemonset:
  # Container image
  image:
    repository: nvcr.io/nvidia/ai-dynamo/snapshot-agent
    # Quoted so a version-like tag is always read as a string, never a number
    tag: "1.0.0"
    pullPolicy: Always

  # Snapshot agent and nsrestore log level (trace, debug, info, warn, error)
  snapshotLogLevel: info

  # Image pull secrets
  imagePullSecrets:
    - name: ngc-secret

  # Resource limits and requests
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 1Gi

  # Node selector - target GPU nodes
  nodeSelector:
    nvidia.com/gpu.present: "true"

  # Tolerations for GPU nodes
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    - key: dedicated
      operator: Exists
      effect: NoSchedule

  # Pod labels
  podLabels: {}

  # Pod annotations
  podAnnotations: {}

  # Affinity rules
  affinity: {}

# Seccomp profile configuration
seccomp:
  # Whether the chart deploys the seccomp profile that blocks io_uring
  # (required for CRIU)
  deploy: true

# Service account configuration
serviceAccount:
  # Whether to create the service account
  create: true
  # Service account name (a name is generated when left empty)
  name: ""
  # Annotations to add to the service account (e.g., for IRSA)
  annotations: {}

# RBAC configuration
rbac:
  # Whether to create RBAC resources
  create: true

  # Namespace-scoped RBAC (recommended, required for PVC storage)
  # - true (default): creates Role/RoleBinding; the agent watches pods in the
  #   chart's namespace only
  # - false: creates ClusterRole/ClusterRoleBinding; the agent watches all pods
  #   on its assigned nodes
  # Note: PVC storage requires namespace-scoped mode (true) because PVCs are
  # namespace-scoped
  namespaceRestricted: true

# Static configuration (loaded from ConfigMap)
# Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) come from environment variables
config:
  overlay:
    # Rootfs diff tar exclusions. Absolute-looking paths are normalized
    # relative to the tar root, and patterns starting with * are passed
    # through as tar globs unchanged.
    # Glob entries are quoted because a leading * would otherwise be read
    # as a YAML alias.
    exclusions:
      - /proc
      - /sys
      - /dev
      - "*/.cache/huggingface"
      - "*/.cache/vllm/torch_compile_cache"
      - "*/__pycache__"
      - "*.pyc"

  restore:
    # Path to the nsrestore binary in the placeholder image
    nsRestorePath: /usr/local/bin/nsrestore
    # Maximum seconds to wait for a restored pod to become Ready (0 = no timeout)
    restoreReadyTimeoutSeconds: 0

  criu:
    # Path to the criu binary
    binaryPath: /usr/local/sbin/criu
    # Ghost file size limit in bytes (536870912 = 512 MiB). Deleted-but-open
    # files smaller than this are saved inline in the checkpoint image as
    # ghost files.
    ghostLimit: 536870912
    # CRIU logging verbosity (0-4)
    logLevel: 4
    # CRIU work directory for temporary files
    workDir: /var/criu-work

    # K8s-specific options (recommended defaults for containers)
    leaveRunning: true      # Keep process running after checkpoint
    shellJob: true          # Containers are often session leaders
    tcpClose: true          # Close non-listening TCP sockets on restore
    tcpEstablished: false   # Preserve established TCP sockets during restore
    fileLocks: true         # Applications use file locks
    orphanPtsMaster: true   # Containers with TTYs
    extUnixSk: true         # External Unix sockets
    linkRemap: true         # Required for deleted-but-open files (e.g. POSIX semaphores in /dev/shm)
    extMasters: true        # External bind mount masters
    manageCgroupsMode: soft # CRIU cgroup management mode (ignore/soft/full/strict)

    # Restore-specific options (only apply during CRIU restore, not dump)
    rstSibling: true         # Restore as sibling process (required for go-criu swrk mode)
    mntnsCompatMode: false   # Mount namespace compatibility mode
    evasiveDevices: true     # Use any device path if original is inaccessible
    forceIrmap: true         # Force resolving inotify/fsnotify watch names

    # Advanced options
    autoDedup: false        # Auto-deduplication of memory pages
    lazyPages: false        # Lazy page migration (experimental)

    # Config file options (NOT available via RPC - written to criu.conf)
    libDir: ""                     # Keep empty: external CUDA checkpoint/restore only (no CRIU CUDA plugin)
    allowUprobes: true             # Leave enabled for kernel/userspace probe compatibility
    skipInFlight: true             # Skip in-flight TCP connections