@@ -17,7 +17,9 @@ This Helm chart deploys the checkpoint/restore infrastructure for NVIDIA Dynamo,
⚠️ **Security Warning**: The Dynamo Snapshot DaemonSet runs in **privileged mode** with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU checkpoint/restore operations. Workload pods do not need privileged mode. Only deploy in environments where a privileged DaemonSet is acceptable.
- Kubernetes 1.21+
- **x86_64 (amd64) nodes only** for the snapshot agent and placeholder images
- GPU nodes with NVIDIA runtime (`nvidia` runtime class)
- NVIDIA driver 580.xx or newer on the target GPU nodes
- containerd runtime (for container inspection; CRIU is bundled in Dynamo Snapshot images)
- NVIDIA Dynamo operator installed (cluster-wide or namespace-scoped)
- RWX (ReadWriteMany) storage class for multi-node deployments
...
...
@@ -35,9 +37,9 @@ export NAMESPACE=my-team # Your target namespace
export DOCKER_SERVER=your-registry.com/ # Your container registry
{{- /*
Render-time guard: the chart always emits its own `nodeAffinity` key below, and
the user-supplied daemonset.affinity map is appended as sibling keys of it.
A `nodeAffinity` entry in values would therefore collide with the chart's own,
so rendering is aborted with an explanatory message instead.

The check is nested (`with` then `if`) rather than written as a single
`and ... (hasKey ...)` so that `hasKey` is never evaluated against an unset
affinity value on template engines whose `and` does not short-circuit.
*/}}
{{- with .Values.daemonset.affinity }}
{{- if hasKey . "nodeAffinity" }}
{{- fail "daemonset.affinity.nodeAffinity is not supported because the chart already enforces kubernetes.io/arch=amd64; use daemonset.nodeSelector or daemonset.affinity.podAffinity/podAntiAffinity instead" }}
{{- end }}
{{- end }}
      affinity:
        # cuda-checkpoint only supports x86_64 — never schedule on arm64 nodes
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values:
                      - amd64
        # Remaining user-supplied affinity keys (e.g. podAffinity / podAntiAffinity)
        # are rendered as siblings of nodeAffinity, hence the 8-space indent.
        {{- with .Values.daemonset.affinity }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
{{- end}}
# CUDA checkpoint/restore requires the nvidia container runtime
$(error PLACEHOLDER_BASE_IMG is required. Example: make docker-build-placeholder PLACEHOLDER_BASE_IMG=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1-cuda13)