Unverified commit 43e810a4, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
...@@ -148,6 +148,7 @@ jobs: ...@@ -148,6 +148,7 @@ jobs:
docker buildx build --push \ docker buildx build --push \
--platform linux/amd64,linux/arm64 \ --platform linux/amd64,linux/arm64 \
--build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \ --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
--build-context snapshot=../snapshot \
${TAGGING_FLAGS} -f Dockerfile . ${TAGGING_FLAGS} -f Dockerfile .
echo "### Operator Container Images" >> $GITHUB_STEP_SUMMARY echo "### Operator Container Images" >> $GITHUB_STEP_SUMMARY
......
...@@ -234,12 +234,12 @@ jobs: ...@@ -234,12 +234,12 @@ jobs:
shell: bash shell: bash
working-directory: ./deploy/operator working-directory: ./deploy/operator
run: | run: |
docker buildx build --platform linux/arm64 --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ . docker buildx build --platform linux/arm64 --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ --build-context snapshot=../snapshot .
- name: Tester - name: Tester
shell: bash shell: bash
working-directory: ./deploy/operator working-directory: ./deploy/operator
run: | run: |
docker buildx build --platform linux/arm64 --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ . docker buildx build --platform linux/arm64 --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ --build-context snapshot=../snapshot .
- name: Set up Go - name: Set up Go
uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0 uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
with: with:
...@@ -291,6 +291,7 @@ jobs: ...@@ -291,6 +291,7 @@ jobs:
docker buildx build --push ${NO_CACHE_FLAG} \ docker buildx build --push ${NO_CACHE_FLAG} \
--platform linux/amd64,linux/arm64 \ --platform linux/amd64,linux/arm64 \
--build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \ --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
--build-context snapshot=../snapshot \
${TAGGING_FLAGS} -f Dockerfile . ${TAGGING_FLAGS} -f Dockerfile .
echo "### 🐳 Operator Container Images" >> $GITHUB_STEP_SUMMARY echo "### 🐳 Operator Container Images" >> $GITHUB_STEP_SUMMARY
......
...@@ -113,12 +113,12 @@ jobs: ...@@ -113,12 +113,12 @@ jobs:
shell: bash shell: bash
working-directory: ./deploy/operator working-directory: ./deploy/operator
run: | run: |
docker buildx build --platform linux/arm64 --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ . docker buildx build --platform linux/arm64 --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ --build-context snapshot=../snapshot .
- name: Tester - name: Tester
shell: bash shell: bash
working-directory: ./deploy/operator working-directory: ./deploy/operator
run: | run: |
docker buildx build --platform linux/arm64 --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ . docker buildx build --platform linux/arm64 --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ --build-context snapshot=../snapshot .
- name: Set up Go - name: Set up Go
uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0 uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
with: with:
...@@ -163,6 +163,7 @@ jobs: ...@@ -163,6 +163,7 @@ jobs:
docker buildx build --push ${NO_CACHE_FLAG} \ docker buildx build --push ${NO_CACHE_FLAG} \
--platform linux/amd64,linux/arm64 \ --platform linux/amd64,linux/arm64 \
--build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \ --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
--build-context snapshot=../snapshot \
${TAGGING_FLAGS} -f Dockerfile . ${TAGGING_FLAGS} -f Dockerfile .
echo "### 🐳 Operator Container Images" >> $GITHUB_STEP_SUMMARY echo "### 🐳 Operator Container Images" >> $GITHUB_STEP_SUMMARY
......
...@@ -162,13 +162,6 @@ The chart includes built-in validation to prevent all operator conflicts: ...@@ -162,13 +162,6 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.webhook.certManager.certificate.rootCA.renewBefore | string | `"720h"` | Time before root CA expiration to trigger renewal (e.g., "720h" for 30 days). Renewing a CA can be disruptive as all signed certificates must be reissued. | | dynamo-operator.webhook.certManager.certificate.rootCA.renewBefore | string | `"720h"` | Time before root CA expiration to trigger renewal (e.g., "720h" for 30 days). Renewing a CA can be disruptive as all signed certificates must be reissued. |
| dynamo-operator.checkpoint.enabled | bool | `false` | Whether to enable checkpoint/restore functionality | | dynamo-operator.checkpoint.enabled | bool | `false` | Whether to enable checkpoint/restore functionality |
| dynamo-operator.checkpoint.readyForCheckpointFilePath | string | `"/tmp/ready-for-checkpoint"` | Path written by worker when model is loaded and ready for checkpointing | | dynamo-operator.checkpoint.readyForCheckpointFilePath | string | `"/tmp/ready-for-checkpoint"` | Path written by worker when model is loaded and ready for checkpointing |
| dynamo-operator.checkpoint.storage.type | string | `"pvc"` | Storage backend type: pvc, s3, or oci |
| dynamo-operator.checkpoint.storage.pvc.pvcName | string | `"snapshot-pvc"` | Name of the PVC created by the snapshot chart |
| dynamo-operator.checkpoint.storage.pvc.basePath | string | `"/checkpoints"` | Base path within the PVC for storing checkpoints |
| dynamo-operator.checkpoint.storage.s3.uri | string | `""` | S3 URI in format: s3://[endpoint/]bucket/prefix |
| dynamo-operator.checkpoint.storage.s3.credentialsSecretRef | string | `""` | Reference to a secret containing AWS credentials |
| dynamo-operator.checkpoint.storage.oci.uri | string | `""` | OCI URI in format: oci://registry/repository |
| dynamo-operator.checkpoint.storage.oci.credentialsSecretRef | string | `""` | Reference to a docker config secret for registry authentication |
| grove.tolerations | list | `[]` | Node tolerations for Grove pods | | grove.tolerations | list | `[]` | Node tolerations for Grove pods |
| grove.affinity | object | `{}` | Affinity for Grove pods | | grove.affinity | object | `{}` | Affinity for Grove pods |
| kai-scheduler.global.tolerations | list | `[]` | Node tolerations for kai-scheduler pods | | kai-scheduler.global.tolerations | list | `[]` | Node tolerations for kai-scheduler pods |
......
...@@ -8173,8 +8173,9 @@ spec: ...@@ -8173,8 +8173,9 @@ spec:
- message: sharedMemory.size must not be set when sharedMemory.disabled is true - message: sharedMemory.size must not be set when sharedMemory.disabled is true
rule: '!(has(self.disabled) && self.disabled && has(self.size))' rule: '!(has(self.disabled) && self.disabled && has(self.size))'
ttlSecondsAfterFinished: ttlSecondsAfterFinished:
default: 300 description: |-
description: TTLSecondsAfterFinished specifies how long to keep the Job after completion Deprecated: TTLSecondsAfterFinished is ignored. Checkpoint Jobs use a fixed
300 second TTL.
format: int32 format: int32
minimum: 0 minimum: 0
type: integer type: integer
...@@ -8245,7 +8246,7 @@ spec: ...@@ -8245,7 +8246,7 @@ spec:
type: object type: object
type: array type: array
createdAt: createdAt:
description: CreatedAt is the timestamp when the checkpoint tar was created description: CreatedAt is the timestamp when the checkpoint became ready
format: date-time format: date-time
type: string type: string
identityHash: identityHash:
...@@ -8258,10 +8259,8 @@ spec: ...@@ -8258,10 +8259,8 @@ spec:
type: string type: string
location: location:
description: |- description: |-
Location is the full URI/path to the checkpoint in the storage backend Deprecated: Location is ignored and no longer populated. It is retained
For PVC: same as TarPath (e.g., /checkpoints/{hash}.tar) only so older objects continue to validate.
For S3: s3://bucket/prefix/{hash}.tar
For OCI: oci://registry/repo:{hash}
type: string type: string
message: message:
description: Message provides additional information about the current state description: Message provides additional information about the current state
...@@ -8275,7 +8274,9 @@ spec: ...@@ -8275,7 +8274,9 @@ spec:
- Failed - Failed
type: string type: string
storageType: storageType:
description: StorageType indicates the storage backend type used for this checkpoint description: |-
Deprecated: StorageType is ignored and no longer populated. It is retained
only so older objects continue to validate.
enum: enum:
- pvc - pvc
- s3 - s3
......
...@@ -135,29 +135,6 @@ data: ...@@ -135,29 +135,6 @@ data:
{{- if ne (.Values.checkpoint.readyForCheckpointFilePath | toString) "/tmp/ready-for-checkpoint" }} {{- if ne (.Values.checkpoint.readyForCheckpointFilePath | toString) "/tmp/ready-for-checkpoint" }}
readyForCheckpointFilePath: {{ .Values.checkpoint.readyForCheckpointFilePath | quote }} readyForCheckpointFilePath: {{ .Values.checkpoint.readyForCheckpointFilePath | quote }}
{{- end }} {{- end }}
storage:
{{- if and .Values.checkpoint.storage.type (ne (.Values.checkpoint.storage.type | toString) "pvc") }}
type: {{ .Values.checkpoint.storage.type | quote }}
{{- end }}
{{- if or (eq (.Values.checkpoint.storage.type | toString) "pvc") (not .Values.checkpoint.storage.type) }}
pvc:
pvcName: {{ (.Values.checkpoint.storage.pvc.pvcName | default "snapshot-pvc") | quote }}
basePath: {{ (.Values.checkpoint.storage.pvc.basePath | default "/checkpoints") | quote }}
{{- end }}
{{- if eq .Values.checkpoint.storage.type "s3" }}
s3:
uri: {{ .Values.checkpoint.storage.s3.uri | quote }}
{{- if .Values.checkpoint.storage.s3.credentialsSecretRef }}
credentialsSecretRef: {{ .Values.checkpoint.storage.s3.credentialsSecretRef | quote }}
{{- end }}
{{- end }}
{{- if eq .Values.checkpoint.storage.type "oci" }}
oci:
uri: {{ .Values.checkpoint.storage.oci.uri | quote }}
{{- if .Values.checkpoint.storage.oci.credentialsSecretRef }}
credentialsSecretRef: {{ .Values.checkpoint.storage.oci.credentialsSecretRef | quote }}
{{- end }}
{{- end }}
{{- end }} {{- end }}
{{- if and .Values.discoveryBackend (ne (.Values.discoveryBackend | toString) "kubernetes") }} {{- if and .Values.discoveryBackend (ne (.Values.discoveryBackend | toString) "kubernetes") }}
discovery: discovery:
......
...@@ -149,42 +149,6 @@ checkpoint: ...@@ -149,42 +149,6 @@ checkpoint:
# Must match the path expected by checkpoint-enabled runtime images # Must match the path expected by checkpoint-enabled runtime images
readyForCheckpointFilePath: "/tmp/ready-for-checkpoint" readyForCheckpointFilePath: "/tmp/ready-for-checkpoint"
# Storage configuration
# These settings tell the operator where to find checkpoint storage
# Must match the configuration in the snapshot chart
storage:
# Storage backend type: pvc, s3, or oci
type: pvc
# PVC configuration (used when type=pvc)
pvc:
# Name of the PVC created by the snapshot chart
# Must match the PVC name in the snapshot chart
pvcName: "snapshot-pvc"
# Base path within the PVC for storing checkpoints
basePath: "/checkpoints"
# S3 configuration (used when type=s3)
s3:
# S3 URI in format: s3://[endpoint/]bucket/prefix
# Examples:
# - s3://my-bucket/checkpoints (AWS S3)
# - s3://minio.example.com/my-bucket/checkpoints (MinIO/custom endpoint)
uri: ""
# Reference to a secret containing AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and optionally AWS_REGION
# If not provided, uses IRSA/Workload Identity for authentication
credentialsSecretRef: ""
# OCI registry configuration (used when type=oci)
oci:
# OCI URI in format: oci://registry/repository
# Examples:
# - oci://myregistry.io/checkpoints
# - oci://ghcr.io/myorg/checkpoints
uri: ""
# Reference to a docker config secret for registry authentication
credentialsSecretRef: ""
# Webhook configuration # Webhook configuration
webhook: webhook:
# Certificate configuration # Certificate configuration
......
...@@ -233,34 +233,6 @@ dynamo-operator: ...@@ -233,34 +233,6 @@ dynamo-operator:
# -- Path written by worker when model is loaded and ready for checkpointing # -- Path written by worker when model is loaded and ready for checkpointing
readyForCheckpointFilePath: "/tmp/ready-for-checkpoint" readyForCheckpointFilePath: "/tmp/ready-for-checkpoint"
# Storage configuration
# These settings tell the operator where to find checkpoint storage
# Must match the configuration in the snapshot chart
storage:
# -- Storage backend type: pvc, s3, or oci
type: pvc
# PVC storage configuration (used when type=pvc)
pvc:
# -- Name of the PVC created by the snapshot chart
pvcName: "snapshot-pvc"
# -- Base path within the PVC for storing checkpoints
basePath: "/checkpoints"
# S3 storage configuration (used when type=s3)
s3:
# -- S3 URI in format: s3://[endpoint/]bucket/prefix
uri: ""
# -- Reference to a secret containing AWS credentials
credentialsSecretRef: ""
# OCI registry storage configuration (used when type=oci)
oci:
# -- OCI URI in format: oci://registry/repository
uri: ""
# -- Reference to a docker config secret for registry authentication
credentialsSecretRef: ""
# Grove component - distributed inference orchestration # Grove component - distributed inference orchestration
# Installation is controlled by global.grove.install above. # Installation is controlled by global.grove.install above.
grove: grove:
......
# Dynamo Snapshot Helm Chart # Dynamo Snapshot Helm Chart
> ⚠️ **Experimental Feature**: Dynamo Snapshot is currently in beta/preview. The DaemonSet runs in privileged mode to perform CRIU checkpoint and restore operations. > Experimental feature. `snapshot-agent` runs as a privileged DaemonSet to
> perform CRIU checkpoint and restore operations.
This chart installs the namespace-scoped checkpoint/restore infrastructure used by Dynamo: This chart installs the namespace-scoped snapshot infrastructure used by Dynamo:
- `snapshot-agent` DaemonSet on GPU nodes - `snapshot-agent` DaemonSet on eligible GPU nodes
- `snapshot-pvc` checkpoint storage, or wiring to an existing PVC - `snapshot-pvc`, or wiring to an existing PVC
- namespace-scoped RBAC - namespace-scoped RBAC
- the seccomp profile required by CRIU - the seccomp profile CRIU needs
Snapshot storage is namespace-local. Install this chart in every namespace where you want checkpoint and restore. Install the chart in each namespace where you want checkpoint and restore.
## Prerequisites ## Prerequisites
- Kubernetes 1.21+ - Kubernetes cluster with x86_64 GPU nodes
- x86_64 GPU nodes
- NVIDIA driver 580.xx or newer - NVIDIA driver 580.xx or newer
- containerd runtime - containerd runtime
- Dynamo Platform already installed with `dynamo-operator.checkpoint.enabled=true`
- a cluster where a privileged DaemonSet with `hostPID`, `hostIPC`, and `hostNetwork` is acceptable - a cluster where a privileged DaemonSet with `hostPID`, `hostIPC`, and `hostNetwork` is acceptable
- Dynamo Platform already installed, with operator checkpointing enabled
The platform/operator configuration must point at the same checkpoint storage that this chart installs:
```yaml
dynamo-operator:
checkpoint:
enabled: true
storage:
type: pvc
pvc:
pvcName: snapshot-pvc
basePath: /checkpoints
```
The snapshot-agent no longer reads `basePath` from its ConfigMap, but the operator still uses its configured PVC base path when it annotates checkpoint and restore pods. That path must match `storage.pvc.basePath` here so the mounted checkpoint location is valid inside the agent pod.
Cross-node restore requires a shared `ReadWriteMany` storage class. The chart defaults to `storage.pvc.accessMode=ReadWriteMany`. Cross-node restore requires shared `ReadWriteMany` storage. The chart defaults to
that mode.
For better restore times, use a fast `ReadWriteMany` StorageClass for the checkpoint PVC. ## Minimal install
## Minimal Install Create the checkpoint PVC and the agent:
This is the smallest Helm install that creates the checkpoint PVC and the DaemonSet:
```bash ```bash
helm upgrade --install snapshot ./deploy/helm/charts/snapshot \ helm upgrade --install snapshot ./deploy/helm/charts/snapshot \
...@@ -50,13 +34,10 @@ helm upgrade --install snapshot ./deploy/helm/charts/snapshot \ ...@@ -50,13 +34,10 @@ helm upgrade --install snapshot ./deploy/helm/charts/snapshot \
--set storage.pvc.create=true --set storage.pvc.create=true
``` ```
If your cluster does not use a default storage class, also set `storage.pvc.storageClass`. If your cluster does not use a default storage class, also set
`storage.pvc.storageClass`.
Keep `storage.pvc.accessMode=ReadWriteMany` for this chart layout. The DaemonSet mounts the same PVC on each eligible node, so a shared `ReadWriteOnce` claim only works when the agent runs on one node.
If you already have a PVC, keep the chart in "use existing PVC" mode: Reuse an existing PVC instead:
Do not set `storage.pvc.create=true` when reusing an existing checkpoint PVC.
```bash ```bash
helm upgrade --install snapshot ./deploy/helm/charts/snapshot \ helm upgrade --install snapshot ./deploy/helm/charts/snapshot \
...@@ -74,27 +55,32 @@ kubectl rollout status daemonset/snapshot-agent -n ${NAMESPACE} ...@@ -74,27 +55,32 @@ kubectl rollout status daemonset/snapshot-agent -n ${NAMESPACE}
kubectl get pods -n ${NAMESPACE} -l app.kubernetes.io/name=snapshot -o wide kubectl get pods -n ${NAMESPACE} -l app.kubernetes.io/name=snapshot -o wide
``` ```
## Important Values ## Important values
| Parameter | Meaning | Default | | Parameter | Meaning | Default |
|-----------|---------|---------| |-----------|---------|---------|
| `storage.type` | Snapshot-owned storage backend | `pvc` |
| `storage.pvc.create` | Create `snapshot-pvc` instead of using an existing PVC | `true` | | `storage.pvc.create` | Create `snapshot-pvc` instead of using an existing PVC | `true` |
| `storage.pvc.name` | PVC name used by the agent and by the operator config | `snapshot-pvc` | | `storage.pvc.name` | PVC mounted by the snapshot-agent | `snapshot-pvc` |
| `storage.pvc.size` | Requested PVC size | `1Ti` | | `storage.pvc.size` | Requested PVC size | `1Ti` |
| `storage.pvc.storageClass` | Storage class name | `""` | | `storage.pvc.storageClass` | Storage class name | `""` |
| `storage.pvc.accessMode` | Access mode for the checkpoint PVC | `ReadWriteMany` | | `storage.pvc.accessMode` | Access mode for the checkpoint PVC | `ReadWriteMany` |
| `storage.pvc.basePath` | PVC mount path inside the snapshot-agent pod | `/checkpoints` | | `storage.pvc.basePath` | Mount path inside the snapshot-agent pod | `/checkpoints` |
| `daemonset.image.repository` | Snapshot agent image repository | `nvcr.io/nvidia/ai-dynamo/snapshot-agent` | | `daemonset.image.repository` | Snapshot-agent image repository | `nvcr.io/nvidia/ai-dynamo/snapshot-agent` |
| `daemonset.image.tag` | Snapshot agent image tag | `1.0.0` | | `daemonset.image.tag` | Snapshot-agent image tag | `1.0.0` |
| `daemonset.imagePullSecrets` | Image pull secrets for the agent | `[{name: ngc-secret}]` | | `daemonset.imagePullSecrets` | Image pull secrets for the agent | `[{name: ngc-secret}]` |
See [values.yaml](./values.yaml) for the complete configuration surface. Reserved `s3` and `oci` values remain chart-owned placeholders for future
snapshot backends, but only `pvc` is implemented today.
See [values.yaml](./values.yaml) for the full configuration surface.
## End To End ## Next steps
Once the chart is installed, use the snapshot guide to deploy a snapshot-capable `DynamoGraphDeployment`, wait for the checkpoint to become ready, and then scale the worker to verify restore: Once the chart is installed, use the snapshot guide to create a checkpoint or
exercise the lower-level `snapshotctl` flow:
- [Snapshot](../../../../docs/kubernetes/snapshot.md) - [Snapshot guide](../../../../docs/kubernetes/snapshot.md)
## Uninstall ## Uninstall
...@@ -102,24 +88,9 @@ Once the chart is installed, use the snapshot guide to deploy a snapshot-capable ...@@ -102,24 +88,9 @@ Once the chart is installed, use the snapshot guide to deploy a snapshot-capable
helm uninstall snapshot -n ${NAMESPACE} helm uninstall snapshot -n ${NAMESPACE}
``` ```
The chart does not remove checkpoint data automatically. Delete the PVC yourself if you want to remove stored checkpoints: The chart does not delete checkpoint data automatically. Remove the PVC
yourself if you want to clear stored checkpoints:
```bash ```bash
kubectl delete pvc snapshot-pvc -n ${NAMESPACE} kubectl delete pvc snapshot-pvc -n ${NAMESPACE}
``` ```
## Troubleshooting
If `snapshot-agent` does not schedule:
```bash
kubectl get nodes -l nvidia.com/gpu.present=true
kubectl describe daemonset snapshot-agent -n ${NAMESPACE}
kubectl logs -n ${NAMESPACE} -l app.kubernetes.io/name=snapshot --all-containers
```
If checkpoint creation never becomes ready, verify all three pieces line up:
- the operator has `dynamo-operator.checkpoint.enabled=true`
- the operator PVC name and base path match the snapshot chart values
- the workload uses a snapshot-capable worker image and command
...@@ -52,7 +52,7 @@ helm.sh/chart: {{ include "snapshot.chart" . }} ...@@ -52,7 +52,7 @@ helm.sh/chart: {{ include "snapshot.chart" . }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }} {{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }} app.kubernetes.io/managed-by: {{ .Release.Service }}
app.kubernetes.io/component: checkpoint-agent app.kubernetes.io/component: snapshot-agent
{{- end }} {{- end }}
{{/* {{/*
......
...@@ -3,6 +3,9 @@ ...@@ -3,6 +3,9 @@
apiVersion: v1 apiVersion: v1
kind: ConfigMap kind: ConfigMap
{{- if ne .Values.storage.type "pvc" }}
{{- fail (printf "snapshot.storage.type=%q is not supported yet; only pvc is currently implemented" .Values.storage.type) }}
{{- end }}
metadata: metadata:
name: {{ include "snapshot.fullname" . }}-config name: {{ include "snapshot.fullname" . }}-config
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
...@@ -10,6 +13,12 @@ metadata: ...@@ -10,6 +13,12 @@ metadata:
{{- include "snapshot.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
data: data:
config.yaml: | config.yaml: |
storage:
type: {{ .Values.storage.type | quote }}
{{- if eq .Values.storage.type "pvc" }}
basePath: {{ .Values.storage.pvc.basePath | quote }}
{{- end }}
overlay: overlay:
exclusions: {{ toYaml .Values.config.overlay.exclusions | nindent 8 }} exclusions: {{ toYaml .Values.config.overlay.exclusions | nindent 8 }}
......
...@@ -16,6 +16,7 @@ spec: ...@@ -16,6 +16,7 @@ spec:
metadata: metadata:
labels: labels:
{{- include "snapshot.selectorLabels" . | nindent 8 }} {{- include "snapshot.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: snapshot-agent
{{- with .Values.daemonset.podLabels }} {{- with .Values.daemonset.podLabels }}
{{- toYaml . | nindent 8 }} {{- toYaml . | nindent 8 }}
{{- end }} {{- end }}
......
...@@ -10,7 +10,6 @@ metadata: ...@@ -10,7 +10,6 @@ metadata:
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
labels: labels:
{{- include "snapshot.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
rules: rules:
# Watch and annotate pods in this namespace to drive checkpoint/restore lifecycle # Watch and annotate pods in this namespace to drive checkpoint/restore lifecycle
- apiGroups: [""] - apiGroups: [""]
...@@ -39,7 +38,6 @@ metadata: ...@@ -39,7 +38,6 @@ metadata:
name: {{ include "snapshot.fullname" . }}-agent name: {{ include "snapshot.fullname" . }}-agent
labels: labels:
{{- include "snapshot.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
rules: rules:
# Watch and annotate pods cluster-wide on assigned nodes # Watch and annotate pods cluster-wide on assigned nodes
- apiGroups: [""] - apiGroups: [""]
...@@ -73,7 +71,6 @@ metadata: ...@@ -73,7 +71,6 @@ metadata:
name: {{ include "snapshot.fullname" . }}-agent-resourceslices name: {{ include "snapshot.fullname" . }}-agent-resourceslices
labels: labels:
{{- include "snapshot.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
rules: rules:
- apiGroups: ["resource.k8s.io"] - apiGroups: ["resource.k8s.io"]
resources: ["resourceslices"] resources: ["resourceslices"]
......
...@@ -10,7 +10,6 @@ metadata: ...@@ -10,7 +10,6 @@ metadata:
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
labels: labels:
{{- include "snapshot.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
kind: Role kind: Role
...@@ -27,7 +26,6 @@ metadata: ...@@ -27,7 +26,6 @@ metadata:
name: {{ include "snapshot.fullname" . }}-agent-resourceslices name: {{ include "snapshot.fullname" . }}-agent-resourceslices
labels: labels:
{{- include "snapshot.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
kind: ClusterRole kind: ClusterRole
...@@ -43,7 +41,6 @@ metadata: ...@@ -43,7 +41,6 @@ metadata:
name: {{ include "snapshot.fullname" . }}-agent name: {{ include "snapshot.fullname" . }}-agent
labels: labels:
{{- include "snapshot.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
kind: ClusterRole kind: ClusterRole
......
...@@ -9,7 +9,6 @@ metadata: ...@@ -9,7 +9,6 @@ metadata:
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
labels: labels:
{{- include "snapshot.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: seccomp
data: data:
block-iouring.json: | block-iouring.json: |
{ {
...@@ -24,4 +23,3 @@ data: ...@@ -24,4 +23,3 @@ data:
] ]
} }
{{- end }} {{- end }}
...@@ -9,10 +9,8 @@ metadata: ...@@ -9,10 +9,8 @@ metadata:
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
labels: labels:
{{- include "snapshot.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
{{- with .Values.serviceAccount.annotations }} {{- with .Values.serviceAccount.annotations }}
annotations: annotations:
{{- toYaml . | nindent 4 }} {{- toYaml . | nindent 4 }}
{{- end }} {{- end }}
{{- end }} {{- end }}
...@@ -19,14 +19,15 @@ ...@@ -19,14 +19,15 @@
# Storage configuration for checkpoints # Storage configuration for checkpoints
storage: storage:
# Storage type: pvc (default), s3, or oci # Storage type stays snapshot-owned for future backend expansion.
# Only pvc is implemented today.
type: pvc type: pvc
# PVC configuration (when type=pvc) # PVC configuration (when type=pvc)
pvc: pvc:
# Create a new PVC (set to false if using existing PVC) # Create a new PVC (set to false if using existing PVC)
create: true create: true
# PVC name - must match operator configuration # PVC name used by the snapshot-agent checkpoint store
name: snapshot-pvc name: snapshot-pvc
# PVC size # PVC size
size: 1Ti size: 1Ti
...@@ -35,18 +36,15 @@ storage: ...@@ -35,18 +36,15 @@ storage:
# Access mode - ReadWriteMany required for multi-pod access # Access mode - ReadWriteMany required for multi-pod access
accessMode: ReadWriteMany accessMode: ReadWriteMany
# PVC mount path inside the snapshot-agent pod. # PVC mount path inside the snapshot-agent pod.
# This must match the operator checkpoint.storage.pvc.basePath setting. # Restore targets derive checkpoint paths from this mount.
basePath: /checkpoints basePath: /checkpoints
# S3 configuration (when type=s3) # Reserved for future snapshot-owned backends. Unsupported today.
s3: s3:
# S3 URI (e.g., s3://my-bucket/checkpoints)
uri: "" uri: ""
# Credentials are expected via IRSA or mounted secrets
# OCI configuration (when type=oci) # Reserved for future snapshot-owned backends. Unsupported today.
oci: oci:
# OCI URI (e.g., oci://registry.io/repo/checkpoints)
uri: "" uri: ""
# DaemonSet configuration for snapshot (checkpoint/restore) agent # DaemonSet configuration for snapshot (checkpoint/restore) agent
......
...@@ -21,10 +21,12 @@ WORKDIR /workspace ...@@ -21,10 +21,12 @@ WORKDIR /workspace
# Copy go mod and sum files first for better layer caching # Copy go mod and sum files first for better layer caching
COPY go.mod go.sum ./ COPY go.mod go.sum ./
COPY --from=snapshot go.mod go.sum /snapshot/
RUN go mod download RUN go mod download
# Copy source code # Copy source code
COPY . . COPY . .
COPY --from=snapshot . /snapshot/
# Lint stage # Lint stage
FROM base AS linter FROM base AS linter
......
...@@ -175,7 +175,7 @@ run: manifests generate fmt vet ## Run a controller from your host. ...@@ -175,7 +175,7 @@ run: manifests generate fmt vet ## Run a controller from your host.
# More info: https://docs.docker.com/develop/develop-images/build_enhancements/ # More info: https://docs.docker.com/develop/develop-images/build_enhancements/
.PHONY: docker-build .PHONY: docker-build
docker-build: ## Build docker image with the manager. docker-build: ## Build docker image with the manager.
$(CONTAINER_TOOL) build -t ${IMG} . $(CONTAINER_TOOL) build --build-context snapshot=../snapshot -t ${IMG} .
.PHONY: docker-push .PHONY: docker-push
docker-push: ## Push docker image with the manager. docker-push: ## Push docker image with the manager.
...@@ -194,7 +194,7 @@ docker-buildx: ## Build and push docker image for the manager for cross-platform ...@@ -194,7 +194,7 @@ docker-buildx: ## Build and push docker image for the manager for cross-platform
sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross
- $(CONTAINER_TOOL) buildx create --name project-v3-builder - $(CONTAINER_TOOL) buildx create --name project-v3-builder
$(CONTAINER_TOOL) buildx use project-v3-builder $(CONTAINER_TOOL) buildx use project-v3-builder
- $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross . - $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --build-context snapshot=../snapshot --tag ${IMG} -f Dockerfile.cross .
- $(CONTAINER_TOOL) buildx rm project-v3-builder - $(CONTAINER_TOOL) buildx rm project-v3-builder
rm Dockerfile.cross rm Dockerfile.cross
......
...@@ -89,15 +89,6 @@ func SetDefaultsOperatorConfiguration(obj *OperatorConfiguration) { ...@@ -89,15 +89,6 @@ func SetDefaultsOperatorConfiguration(obj *OperatorConfiguration) {
if obj.Checkpoint.ReadyForCheckpointFilePath == "" { if obj.Checkpoint.ReadyForCheckpointFilePath == "" {
obj.Checkpoint.ReadyForCheckpointFilePath = "/tmp/ready-for-checkpoint" obj.Checkpoint.ReadyForCheckpointFilePath = "/tmp/ready-for-checkpoint"
} }
if obj.Checkpoint.Storage.Type == "" {
obj.Checkpoint.Storage.Type = CheckpointStorageTypePVC
}
if obj.Checkpoint.Storage.PVC.PVCName == "" {
obj.Checkpoint.Storage.PVC.PVCName = "snapshot-pvc"
}
if obj.Checkpoint.Storage.PVC.BasePath == "" {
obj.Checkpoint.Storage.PVC.BasePath = "/checkpoints"
}
// Logging defaults // Logging defaults
if obj.Logging.Level == "" { if obj.Logging.Level == "" {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment