Unverified Commit d381e6ff authored by Schwinn Saereesitthipitak's avatar Schwinn Saereesitthipitak Committed by GitHub
Browse files

feat(chrek): config refactor, /dev/shm support, and mount-policy rewrite (#5946)

parent b6824ae0
This diff is collapsed.
...@@ -151,8 +151,9 @@ spec: ...@@ -151,8 +151,9 @@ spec:
- --checkpoint-enabled=true - --checkpoint-enabled=true
- --checkpoint-storage-type={{ .Values.checkpoint.storage.type }} - --checkpoint-storage-type={{ .Values.checkpoint.storage.type }}
- --checkpoint-signal-host-path={{ .Values.checkpoint.storage.signalHostPath }} - --checkpoint-signal-host-path={{ .Values.checkpoint.storage.signalHostPath }}
- --checkpoint-criu-timeout={{ .Values.checkpoint.criu.timeout }}
- --checkpoint-init-container-image={{ .Values.checkpoint.initContainerImage }} - --checkpoint-init-container-image={{ .Values.checkpoint.initContainerImage }}
- --checkpoint-ready-for-checkpoint-file-path={{ .Values.checkpoint.readyForCheckpointFilePath }}
- --checkpoint-restore-marker-file-path={{ .Values.checkpoint.restoreMarkerFilePath }}
{{- if eq .Values.checkpoint.storage.type "pvc" }} {{- if eq .Values.checkpoint.storage.type "pvc" }}
- --checkpoint-pvc-name={{ .Values.checkpoint.storage.pvc.pvcName }} - --checkpoint-pvc-name={{ .Values.checkpoint.storage.pvc.pvcName }}
- --checkpoint-pvc-base-path={{ .Values.checkpoint.storage.pvc.basePath }} - --checkpoint-pvc-base-path={{ .Values.checkpoint.storage.pvc.basePath }}
......
...@@ -216,6 +216,15 @@ dynamo-operator: ...@@ -216,6 +216,15 @@ dynamo-operator:
# -- Whether to enable checkpoint/restore functionality # -- Whether to enable checkpoint/restore functionality
enabled: false enabled: false
# -- Image used for init containers in checkpoint jobs (e.g., signal file cleanup)
initContainerImage: "busybox:latest"
# -- Path written by worker when model is loaded and ready for checkpointing
readyForCheckpointFilePath: "/tmp/ready-for-checkpoint"
# -- Path written by restore-entrypoint after successful CRIU restore
restoreMarkerFilePath: "/tmp/dynamo-restored"
# Storage configuration # Storage configuration
# These settings tell the operator where to find checkpoint storage # These settings tell the operator where to find checkpoint storage
# Must match the configuration in the chrek chart # Must match the configuration in the chrek chart
...@@ -247,12 +256,6 @@ dynamo-operator: ...@@ -247,12 +256,6 @@ dynamo-operator:
# -- Reference to a docker config secret for registry authentication # -- Reference to a docker config secret for registry authentication
credentialsSecretRef: "" credentialsSecretRef: ""
# CRIU timeout configuration (shared across checkpoint and restore)
criu:
# -- CRIU operation timeout in seconds. Default: 21600 (6 hours)
timeout: "21600"
# Grove component - distributed inference orchestration # Grove component - distributed inference orchestration
grove: grove:
# -- Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide # -- Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment