Unverified Commit bb8fc8a4 authored by Schwinn Saereesitthipitak's avatar Schwinn Saereesitthipitak Committed by GitHub
Browse files

feat(chrek): external restore, signal-based IPC, and package refactor (#6286)


Co-authored-by: default avatarDan Feigin <dfeigin@nvidia.com>
parent c8423b57
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -165,7 +165,6 @@ The chart includes built-in validation to prevent all operator conflicts: ...@@ -165,7 +165,6 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.checkpoint.readyForCheckpointFilePath | string | `"/tmp/ready-for-checkpoint"` | Path written by worker when model is loaded and ready for checkpointing | | dynamo-operator.checkpoint.readyForCheckpointFilePath | string | `"/tmp/ready-for-checkpoint"` | Path written by worker when model is loaded and ready for checkpointing |
| dynamo-operator.checkpoint.restoreMarkerFilePath | string | `"/tmp/dynamo-restored"` | Path written by restore-entrypoint after successful CRIU restore | | dynamo-operator.checkpoint.restoreMarkerFilePath | string | `"/tmp/dynamo-restored"` | Path written by restore-entrypoint after successful CRIU restore |
| dynamo-operator.checkpoint.storage.type | string | `"pvc"` | Storage backend type: pvc, s3, or oci | | dynamo-operator.checkpoint.storage.type | string | `"pvc"` | Storage backend type: pvc, s3, or oci |
| dynamo-operator.checkpoint.storage.signalHostPath | string | `"/var/lib/chrek/signals"` | Host path for signal files (communication between checkpoint pod and DaemonSet) |
| dynamo-operator.checkpoint.storage.pvc.pvcName | string | `"chrek-pvc"` | Name of the PVC created by the chrek chart | | dynamo-operator.checkpoint.storage.pvc.pvcName | string | `"chrek-pvc"` | Name of the PVC created by the chrek chart |
| dynamo-operator.checkpoint.storage.pvc.basePath | string | `"/checkpoints"` | Base path within the PVC for storing checkpoints | | dynamo-operator.checkpoint.storage.pvc.basePath | string | `"/checkpoints"` | Base path within the PVC for storing checkpoints |
| dynamo-operator.checkpoint.storage.s3.uri | string | `""` | S3 URI in format: s3://[endpoint/]bucket/prefix | | dynamo-operator.checkpoint.storage.s3.uri | string | `""` | S3 URI in format: s3://[endpoint/]bucket/prefix |
......
...@@ -148,10 +148,7 @@ spec: ...@@ -148,10 +148,7 @@ spec:
{{- if .Values.checkpoint.enabled }} {{- if .Values.checkpoint.enabled }}
- --checkpoint-enabled=true - --checkpoint-enabled=true
- --checkpoint-storage-type={{ .Values.checkpoint.storage.type }} - --checkpoint-storage-type={{ .Values.checkpoint.storage.type }}
- --checkpoint-signal-host-path={{ .Values.checkpoint.storage.signalHostPath }}
- --checkpoint-init-container-image={{ .Values.checkpoint.initContainerImage }}
- --checkpoint-ready-for-checkpoint-file-path={{ .Values.checkpoint.readyForCheckpointFilePath }} - --checkpoint-ready-for-checkpoint-file-path={{ .Values.checkpoint.readyForCheckpointFilePath }}
- --checkpoint-restore-marker-file-path={{ .Values.checkpoint.restoreMarkerFilePath }}
{{- if eq .Values.checkpoint.storage.type "pvc" }} {{- if eq .Values.checkpoint.storage.type "pvc" }}
- --checkpoint-pvc-name={{ .Values.checkpoint.storage.pvc.pvcName }} - --checkpoint-pvc-name={{ .Values.checkpoint.storage.pvc.pvcName }}
- --checkpoint-pvc-base-path={{ .Values.checkpoint.storage.pvc.basePath }} - --checkpoint-pvc-base-path={{ .Values.checkpoint.storage.pvc.basePath }}
......
This diff is collapsed.
...@@ -128,8 +128,6 @@ const ( ...@@ -128,8 +128,6 @@ const (
DynamoCheckpointConditionJobCreated DynamoCheckpointConditionType = "JobCreated" DynamoCheckpointConditionJobCreated DynamoCheckpointConditionType = "JobCreated"
// DynamoCheckpointConditionJobCompleted indicates whether the checkpoint Job has completed // DynamoCheckpointConditionJobCompleted indicates whether the checkpoint Job has completed
DynamoCheckpointConditionJobCompleted DynamoCheckpointConditionType = "JobCompleted" DynamoCheckpointConditionJobCompleted DynamoCheckpointConditionType = "JobCompleted"
// DynamoCheckpointConditionTarAvailable indicates whether the checkpoint tar file exists
DynamoCheckpointConditionTarAvailable DynamoCheckpointConditionType = "TarAvailable"
) )
// DynamoCheckpointStatus defines the observed state of DynamoCheckpoint // DynamoCheckpointStatus defines the observed state of DynamoCheckpoint
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment