# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Dynamo Snapshot - Checkpoint/Restore Infrastructure
# This chart deploys the checkpoint storage (PVC) and CRIU agent (DaemonSet)
# in a single namespace. Install this chart in each namespace where you want
# to enable checkpoint/restore functionality for DynamoGraphDeployments.

# Storage configuration for checkpoints
storage:
  # Checkpoint storage backend. Storage type stays snapshot-owned for future
  # backend expansion. Only pvc is implemented today; the s3 and oci sections
  # below are reserved placeholders.
  type: pvc

  # PVC configuration (when type=pvc)
  pvc:
    # Create a new PVC (set to false if using existing PVC)
    create: true
    # PVC name used by the snapshot-agent checkpoint store
    name: snapshot-pvc
    # PVC size
    size: 1Ti
    # Storage class (leave empty for default)
    storageClass: ""
    # Access mode - ReadWriteMany required for multi-pod access
    accessMode: ReadWriteMany
    # PVC mount path inside the snapshot-agent pod.
    # Restore targets derive checkpoint paths from this mount.
    basePath: /checkpoints

  # Reserved for future snapshot-owned backends. Unsupported today.
  s3:
    uri: ""

  # Reserved for future snapshot-owned backends. Unsupported today.
  oci:
    uri: ""

# DaemonSet configuration for snapshot (checkpoint/restore) agent
daemonset:
  # Container image
  image:
    repository: nvcr.io/nvidia/ai-dynamo/snapshot-agent
    # Quoted so the tag is always a string, even when overridden with a
    # numeric-looking value such as "1.0".
    tag: "1.0.0"
    pullPolicy: Always

  # Snapshot agent and nsrestore log level (trace, debug, info, warn, error)
  snapshotLogLevel: info

  # Image pull secrets
  imagePullSecrets:
    - name: ngc-secret

  # Resource limits and requests
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 1Gi

  # Node selector - target GPU nodes
  nodeSelector:
    nvidia.com/gpu.present: "true"

  # Tolerations for GPU nodes
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    - key: dedicated
      operator: Exists
      effect: NoSchedule

  # Pod labels
  podLabels: {}

  # Pod annotations
  podAnnotations: {}

  # Affinity rules
  affinity: {}

# Seccomp profile configuration
seccomp:
  # When true, deploy the seccomp profile that blocks io_uring
  # (required for CRIU).
  deploy: true

# Service account configuration
serviceAccount:
  # Create the service account
  create: true
  # Service account name (a name is generated when left empty)
  name: ""
  # Annotations to add to the service account (e.g., for IRSA)
  annotations: {}

# RBAC configuration
rbac:
  # Create RBAC resources
  create: true

  # Namespace-scoped RBAC (recommended, required for PVC storage)
  # - true (default): creates a Role/RoleBinding; the agent watches pods in
  #   the chart's namespace only
  # - false: creates a ClusterRole/ClusterRoleBinding; the agent watches all
  #   pods on its assigned nodes
  # Note: PVC storage requires namespace-scoped mode (true) because PVCs are
  # namespace-scoped.
  namespaceRestricted: true

# Static configuration (loaded from ConfigMap)
# Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) come from environment variables
config:
  overlay:
    # Rootfs diff tar exclusions. Absolute-looking paths are normalized
    # relative to the tar root, and patterns starting with * are passed
    # through as tar globs unchanged.
    exclusions:
      - /proc
      - /sys
      - /dev
      - "*/.cache/huggingface"
      - "*/.cache/vllm/torch_compile_cache"
      - "*/__pycache__"
      - "*.pyc"

  restore:
    # Path to the nsrestore binary in the placeholder image
    nsRestorePath: /usr/local/bin/nsrestore
    # Maximum seconds to allow a restore attempt before snapshot-agent marks it failed
    restoreTimeoutSeconds: 7200

  criu:
    # Path to the criu binary
    binaryPath: /usr/local/sbin/criu
    # Ghost file size limit in bytes (536870912 = 512 MiB). Deleted-but-open
    # files smaller than this are saved inline in the checkpoint image as
    # ghost files.
    ghostLimit: 536870912
    # CRIU logging verbosity (0-4)
    logLevel: 4
    # CRIU work directory for temporary files
    workDir: /var/criu-work

    # K8s-specific options (recommended defaults for containers)
    leaveRunning: true       # Keep process running after checkpoint
    shellJob: true           # Containers are often session leaders
    tcpClose: true           # Close non-listening TCP sockets on restore
    tcpEstablished: false    # Preserve established TCP sockets during restore
    fileLocks: true          # Applications use file locks
    orphanPtsMaster: true    # Containers with TTYs
    extUnixSk: true          # External Unix sockets
    linkRemap: true          # Required for deleted-but-open files (e.g. POSIX semaphores in /dev/shm)
    extMasters: true         # External bind mount masters
    manageCgroupsMode: soft  # CRIU cgroup management mode (ignore/soft/full/strict)

    # Restore-specific options (only apply during CRIU restore, not dump)
    rstSibling: true         # Restore as sibling process (required for go-criu swrk mode)
    mntnsCompatMode: false   # Mount namespace compatibility mode
    evasiveDevices: true     # Use any device path if original is inaccessible
    forceIrmap: true         # Force resolving inotify/fsnotify watch names

    # Advanced options
    autoDedup: false         # Auto-deduplication of memory pages
    lazyPages: false         # Lazy page migration (experimental)

    # Config file options (NOT available via RPC - written to criu.conf)
    libDir: ""               # Keep empty: external CUDA checkpoint/restore only (no CRIU CUDA plugin)
    allowUprobes: true       # Leave enabled for kernel/userspace probe compatibility
    skipInFlight: true       # Skip in-flight TCP connections