Unverified Commit 43e810a4 authored by Schwinn Saereesitthipitak's avatar Schwinn Saereesitthipitak Committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: default avatarSchwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
...@@ -25,7 +25,9 @@ import ( ...@@ -25,7 +25,9 @@ import (
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1" configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint" "github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpointjob"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
batchv1 "k8s.io/api/batch/v1" batchv1 "k8s.io/api/batch/v1"
...@@ -58,7 +60,9 @@ var testHash = func() string { ...@@ -58,7 +60,9 @@ var testHash = func() string {
return hash return hash
}() }()
var defaultCheckpointJobName = "checkpoint-job-" + testHash + "-" + consts.DefaultCheckpointArtifactVersion var defaultCheckpointJobName = checkpointjob.DesiredCheckpointJobName(testHash, map[string]string{
snapshotprotocol.CheckpointArtifactVersionAnnotation: snapshotprotocol.DefaultCheckpointArtifactVersion,
})
func checkpointTestScheme() *runtime.Scheme { func checkpointTestScheme() *runtime.Scheme {
s := runtime.NewScheme() s := runtime.NewScheme()
...@@ -74,13 +78,6 @@ func checkpointTestConfig() *configv1alpha1.OperatorConfiguration { ...@@ -74,13 +78,6 @@ func checkpointTestConfig() *configv1alpha1.OperatorConfiguration {
Checkpoint: configv1alpha1.CheckpointConfiguration{ Checkpoint: configv1alpha1.CheckpointConfiguration{
Enabled: true, Enabled: true,
ReadyForCheckpointFilePath: "/tmp/ready-for-checkpoint", ReadyForCheckpointFilePath: "/tmp/ready-for-checkpoint",
Storage: configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypePVC,
PVC: configv1alpha1.CheckpointPVCConfig{
PVCName: "snapshot-pvc",
BasePath: "/checkpoints",
},
},
}, },
} }
} }
...@@ -143,14 +140,15 @@ func TestBuildCheckpointJob(t *testing.T) { ...@@ -143,14 +140,15 @@ func TestBuildCheckpointJob(t *testing.T) {
} }
r := makeCheckpointReconciler(s, ckpt) r := makeCheckpointReconciler(s, ckpt)
job := r.buildCheckpointJob(ckpt, defaultCheckpointJobName) job, err := checkpointjob.BuildCheckpointJob(r.Config, ckpt, defaultCheckpointJobName)
require.NoError(t, err)
podSpec := job.Spec.Template.Spec podSpec := job.Spec.Template.Spec
main := podSpec.Containers[0] main := podSpec.Containers[0]
// Job and pod template labels // Job and pod template labels
assert.Equal(t, testHash, job.Labels[consts.KubeLabelCheckpointHash]) assert.Equal(t, testHash, job.Labels[consts.KubeLabelCheckpointID])
assert.Equal(t, "true", job.Spec.Template.Labels[consts.KubeLabelIsCheckpointSource]) assert.Equal(t, "true", job.Spec.Template.Labels[consts.KubeLabelIsCheckpointSource])
assert.Equal(t, testHash, job.Spec.Template.Labels[consts.KubeLabelCheckpointHash]) assert.Equal(t, testHash, job.Spec.Template.Labels[consts.KubeLabelCheckpointID])
// Env vars (checkpoint-specific + user-provided preserved) // Env vars (checkpoint-specific + user-provided preserved)
envMap := make(map[string]string, len(main.Env)) envMap := make(map[string]string, len(main.Env))
...@@ -234,25 +232,26 @@ func TestBuildCheckpointJob(t *testing.T) { ...@@ -234,25 +232,26 @@ func TestBuildCheckpointJob(t *testing.T) {
assert.Equal(t, int32(0), *job.Spec.BackoffLimit) assert.Equal(t, int32(0), *job.Spec.BackoffLimit)
assert.Equal(t, int32(300), *job.Spec.TTLSecondsAfterFinished) assert.Equal(t, int32(300), *job.Spec.TTLSecondsAfterFinished)
// Custom deadlines override defaults, but checkpoint jobs never retry. // Custom active deadlines override defaults, but checkpoint jobs never retry and keep a fixed TTL.
deadline := int64(7200) deadline := int64(7200)
backoff := int32(5) backoff := int32(5)
ttl := int32(600)
ckpt.Spec.Job.ActiveDeadlineSeconds = &deadline ckpt.Spec.Job.ActiveDeadlineSeconds = &deadline
ckpt.Spec.Job.BackoffLimit = &backoff //nolint:staticcheck // Compatibility test: deprecated field must remain ignored by checkpoint Jobs. ckpt.Spec.Job.BackoffLimit = &backoff //nolint:staticcheck // Compatibility test: deprecated field must remain ignored by checkpoint Jobs.
ckpt.Spec.Job.TTLSecondsAfterFinished = &ttl job, err = checkpointjob.BuildCheckpointJob(r.Config, ckpt, defaultCheckpointJobName)
job = r.buildCheckpointJob(ckpt, defaultCheckpointJobName) require.NoError(t, err)
assert.Equal(t, int64(7200), *job.Spec.ActiveDeadlineSeconds) assert.Equal(t, int64(7200), *job.Spec.ActiveDeadlineSeconds)
assert.Equal(t, int32(0), *job.Spec.BackoffLimit) assert.Equal(t, int32(0), *job.Spec.BackoffLimit)
assert.Equal(t, int32(600), *job.Spec.TTLSecondsAfterFinished) assert.Equal(t, int32(300), *job.Spec.TTLSecondsAfterFinished)
ckpt.Spec.Job.PodTemplateSpec.Spec.Containers[0].Resources = corev1.ResourceRequirements{ ckpt.Spec.Job.PodTemplateSpec.Spec.Containers[0].Resources = corev1.ResourceRequirements{
Limits: corev1.ResourceList{ Limits: corev1.ResourceList{
corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"), corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"),
}, },
} }
job = r.buildCheckpointJob(ckpt, defaultCheckpointJobName) job, err = checkpointjob.BuildCheckpointJob(r.Config, ckpt, defaultCheckpointJobName)
assert.Equal(t, []string{"cuda-checkpoint", "--launch-job", "python3", "-m", "dynamo.vllm"}, job.Spec.Template.Spec.Containers[0].Command) require.NoError(t, err)
assert.Equal(t, []string{"cuda-checkpoint"}, job.Spec.Template.Spec.Containers[0].Command)
assert.Equal(t, []string{"--launch-job", "python3", "-m", "dynamo.vllm"}, job.Spec.Template.Spec.Containers[0].Args)
} }
func TestBuildCheckpointJobInjectsStandardEnvVars(t *testing.T) { func TestBuildCheckpointJobInjectsStandardEnvVars(t *testing.T) {
...@@ -274,7 +273,8 @@ func TestBuildCheckpointJobInjectsStandardEnvVars(t *testing.T) { ...@@ -274,7 +273,8 @@ func TestBuildCheckpointJobInjectsStandardEnvVars(t *testing.T) {
customShmSize := resource.MustParse("16Gi") customShmSize := resource.MustParse("16Gi")
ckpt.Spec.Job.SharedMemory = &nvidiacomv1alpha1.SharedMemorySpec{Size: customShmSize} ckpt.Spec.Job.SharedMemory = &nvidiacomv1alpha1.SharedMemorySpec{Size: customShmSize}
job := r.buildCheckpointJob(ckpt, defaultCheckpointJobName) job, err := checkpointjob.BuildCheckpointJob(r.Config, ckpt, defaultCheckpointJobName)
require.NoError(t, err)
foundCustomShmVolume := false foundCustomShmVolume := false
for _, v := range job.Spec.Template.Spec.Volumes { for _, v := range job.Spec.Template.Spec.Volumes {
if v.Name == consts.KubeValueNameSharedMemory { if v.Name == consts.KubeValueNameSharedMemory {
...@@ -326,7 +326,7 @@ func TestCheckpointReconciler_Reconcile(t *testing.T) { ...@@ -326,7 +326,7 @@ func TestCheckpointReconciler_Reconcile(t *testing.T) {
assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhasePending, updated.Status.Phase) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhasePending, updated.Status.Phase)
assert.Equal(t, testHash, updated.Status.IdentityHash) assert.Equal(t, testHash, updated.Status.IdentityHash)
assert.Empty(t, updated.Status.Message) assert.Empty(t, updated.Status.Message)
assert.Equal(t, testHash, updated.Labels[consts.KubeLabelCheckpointHash]) assert.Equal(t, testHash, updated.Labels[consts.KubeLabelCheckpointID])
}) })
t.Run("Ready phase is a no-op", func(t *testing.T) { t.Run("Ready phase is a no-op", func(t *testing.T) {
...@@ -352,7 +352,7 @@ func TestCheckpointReconciler_Reconcile(t *testing.T) { ...@@ -352,7 +352,7 @@ func TestCheckpointReconciler_Reconcile(t *testing.T) {
updated := &nvidiacomv1alpha1.DynamoCheckpoint{} updated := &nvidiacomv1alpha1.DynamoCheckpoint{}
require.NoError(t, r.Get(ctx, types.NamespacedName{Name: friendlyCheckpointName, Namespace: testNamespace}, updated)) require.NoError(t, r.Get(ctx, types.NamespacedName{Name: friendlyCheckpointName, Namespace: testNamespace}, updated))
assert.Equal(t, testHash, updated.Labels[consts.KubeLabelCheckpointHash]) assert.Equal(t, testHash, updated.Labels[consts.KubeLabelCheckpointID])
assert.Equal(t, testHash, updated.Status.IdentityHash) assert.Equal(t, testHash, updated.Status.IdentityHash)
}) })
...@@ -375,7 +375,6 @@ func TestCheckpointReconciler_Reconcile(t *testing.T) { ...@@ -375,7 +375,6 @@ func TestCheckpointReconciler_Reconcile(t *testing.T) {
ckpt := makeTestCheckpoint(nvidiacomv1alpha1.DynamoCheckpointPhaseReady) ckpt := makeTestCheckpoint(nvidiacomv1alpha1.DynamoCheckpointPhaseReady)
ckpt.Status.IdentityHash = testHash ckpt.Status.IdentityHash = testHash
ckpt.Status.JobName = defaultCheckpointJobName ckpt.Status.JobName = defaultCheckpointJobName
ckpt.Status.Location = "/checkpoints/" + testHash + "/versions/" + consts.DefaultCheckpointArtifactVersion
ckpt.Annotations = map[string]string{consts.KubeAnnotationCheckpointArtifactVersion: "2"} ckpt.Annotations = map[string]string{consts.KubeAnnotationCheckpointArtifactVersion: "2"}
r := makeCheckpointReconciler(s, ckpt) r := makeCheckpointReconciler(s, ckpt)
...@@ -388,7 +387,6 @@ func TestCheckpointReconciler_Reconcile(t *testing.T) { ...@@ -388,7 +387,6 @@ func TestCheckpointReconciler_Reconcile(t *testing.T) {
require.NoError(t, r.Get(ctx, types.NamespacedName{Name: ckpt.Name, Namespace: testNamespace}, updated)) require.NoError(t, r.Get(ctx, types.NamespacedName{Name: ckpt.Name, Namespace: testNamespace}, updated))
assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseCreating, updated.Status.Phase) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseCreating, updated.Status.Phase)
assert.Equal(t, "checkpoint-job-"+testHash+"-2", updated.Status.JobName) assert.Equal(t, "checkpoint-job-"+testHash+"-2", updated.Status.JobName)
assert.Equal(t, "/checkpoints/"+testHash+"/versions/2", updated.Status.Location)
}) })
t.Run("duplicate identity hash is rejected even with a readable name", func(t *testing.T) { t.Run("duplicate identity hash is rejected even with a readable name", func(t *testing.T) {
...@@ -431,13 +429,11 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) { ...@@ -431,13 +429,11 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) {
t.Run("succeeded job transitions to Ready", func(t *testing.T) { t.Run("succeeded job transitions to Ready", func(t *testing.T) {
ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName)
ckpt.Status.Location = "/checkpoints/" + testHash + "/versions/" + consts.DefaultCheckpointArtifactVersion
ckpt.Status.StorageType = "pvc"
job := &batchv1.Job{ job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: defaultCheckpointJobName, Name: defaultCheckpointJobName,
Namespace: testNamespace, Namespace: testNamespace,
Annotations: map[string]string{checkpointStatusAnnotation: checkpointStatusCompleted}, Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusCompleted},
}, },
Status: batchv1.JobStatus{ Status: batchv1.JobStatus{
Succeeded: 1, Succeeded: 1,
...@@ -454,8 +450,6 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) { ...@@ -454,8 +450,6 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) {
updated := &nvidiacomv1alpha1.DynamoCheckpoint{} updated := &nvidiacomv1alpha1.DynamoCheckpoint{}
require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated))
assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseReady, updated.Status.Phase) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseReady, updated.Status.Phase)
assert.Equal(t, "/checkpoints/"+testHash+"/versions/"+consts.DefaultCheckpointArtifactVersion, updated.Status.Location)
assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointStorageType("pvc"), updated.Status.StorageType)
assert.NotNil(t, updated.Status.CreatedAt) assert.NotNil(t, updated.Status.CreatedAt)
}) })
...@@ -531,7 +525,7 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) { ...@@ -531,7 +525,7 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: "job-agent-failed", Name: "job-agent-failed",
Namespace: testNamespace, Namespace: testNamespace,
Annotations: map[string]string{checkpointStatusAnnotation: checkpointStatusFailed}, Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusFailed},
}, },
Status: batchv1.JobStatus{ Status: batchv1.JobStatus{
Succeeded: 1, Succeeded: 1,
...@@ -557,7 +551,7 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) { ...@@ -557,7 +551,7 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: "job-running-agent-failed", Name: "job-running-agent-failed",
Namespace: testNamespace, Namespace: testNamespace,
Annotations: map[string]string{checkpointStatusAnnotation: checkpointStatusFailed}, Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusFailed},
}, },
Status: batchv1.JobStatus{Active: 1}, Status: batchv1.JobStatus{Active: 1},
} }
...@@ -590,14 +584,12 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) { ...@@ -590,14 +584,12 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) {
t.Run("in-flight version changes do not relabel the running job's artifact", func(t *testing.T) { t.Run("in-flight version changes do not relabel the running job's artifact", func(t *testing.T) {
ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName)
ckpt.Status.Location = "/checkpoints/" + testHash + "/versions/" + consts.DefaultCheckpointArtifactVersion
ckpt.Status.StorageType = "pvc"
ckpt.Annotations = map[string]string{consts.KubeAnnotationCheckpointArtifactVersion: "2"} ckpt.Annotations = map[string]string{consts.KubeAnnotationCheckpointArtifactVersion: "2"}
job := &batchv1.Job{ job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: defaultCheckpointJobName, Name: defaultCheckpointJobName,
Namespace: testNamespace, Namespace: testNamespace,
Annotations: map[string]string{checkpointStatusAnnotation: checkpointStatusCompleted}, Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusCompleted},
}, },
Status: batchv1.JobStatus{ Status: batchv1.JobStatus{
Succeeded: 1, Succeeded: 1,
...@@ -614,7 +606,6 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) { ...@@ -614,7 +606,6 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) {
updated := &nvidiacomv1alpha1.DynamoCheckpoint{} updated := &nvidiacomv1alpha1.DynamoCheckpoint{}
require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated))
assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseReady, updated.Status.Phase) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseReady, updated.Status.Phase)
assert.Equal(t, "/checkpoints/"+testHash+"/versions/"+consts.DefaultCheckpointArtifactVersion, updated.Status.Location)
}) })
t.Run("succeeded count without complete condition keeps Creating phase", func(t *testing.T) { t.Run("succeeded count without complete condition keeps Creating phase", func(t *testing.T) {
......
...@@ -80,6 +80,7 @@ type DynamoComponentDeploymentReconciler struct { ...@@ -80,6 +80,7 @@ type DynamoComponentDeploymentReconciler struct {
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocomponentdeployments/status,verbs=get;update;patch // +kubebuilder:rbac:groups=nvidia.com,resources=dynamocomponentdeployments/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocomponentdeployments/finalizers,verbs=update // +kubebuilder:rbac:groups=nvidia.com,resources=dynamocomponentdeployments/finalizers,verbs=update
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints,verbs=get;list // +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints,verbs=get;list
// +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch
//+kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch //+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
...@@ -1041,7 +1042,9 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex ...@@ -1041,7 +1042,9 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
// Resolve checkpoint for this component // Resolve checkpoint for this component
var checkpointInfo *checkpoint.CheckpointInfo var checkpointInfo *checkpoint.CheckpointInfo
if opt.dynamoComponentDeployment.Spec.Checkpoint != nil && opt.dynamoComponentDeployment.Spec.Checkpoint.Enabled { if r.Config.Checkpoint.Enabled &&
opt.dynamoComponentDeployment.Spec.Checkpoint != nil &&
opt.dynamoComponentDeployment.Spec.Checkpoint.Enabled {
info, err := checkpoint.ResolveCheckpointForService(ctx, r.Client, opt.dynamoComponentDeployment.Namespace, opt.dynamoComponentDeployment.Spec.Checkpoint) info, err := checkpoint.ResolveCheckpointForService(ctx, r.Client, opt.dynamoComponentDeployment.Namespace, opt.dynamoComponentDeployment.Spec.Checkpoint)
if err != nil { if err != nil {
return nil, errors.Wrap(err, "failed to resolve checkpoint") return nil, errors.Wrap(err, "failed to resolve checkpoint")
...@@ -1054,6 +1057,17 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex ...@@ -1054,6 +1057,17 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
err = errors.Wrap(err, "failed to generate base pod spec") err = errors.Wrap(err, "failed to generate base pod spec")
return nil, err return nil, err
} }
if r.Config.Checkpoint.Enabled {
if err := checkpoint.InjectCheckpointIntoPodSpec(
ctx,
r.Client,
opt.dynamoComponentDeployment.Namespace,
podSpec,
checkpointInfo,
); err != nil {
return nil, errors.Wrap(err, "failed to inject checkpoint config")
}
}
// Ensure we have at least one container (the main container should be there from GenerateBasePodSpec) // Ensure we have at least one container (the main container should be there from GenerateBasePodSpec)
if len(podSpec.Containers) == 0 { if len(podSpec.Containers) == 0 {
......
...@@ -29,6 +29,7 @@ import ( ...@@ -29,6 +29,7 @@ import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common" "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo" "github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
"github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp"
"github.com/onsi/gomega" "github.com/onsi/gomega"
"github.com/onsi/gomega/format" "github.com/onsi/gomega/format"
...@@ -1255,6 +1256,40 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe ...@@ -1255,6 +1256,40 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe
if err := corev1.AddToScheme(s); err != nil { if err := corev1.AddToScheme(s); err != nil {
t.Fatalf("Failed to add corev1 to scheme: %v", err) t.Fatalf("Failed to add corev1 to scheme: %v", err)
} }
if err := appsv1.AddToScheme(s); err != nil {
t.Fatalf("Failed to add appsv1 to scheme: %v", err)
}
snapshotAgentDaemonSet := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Name: "snapshot-agent",
Namespace: "default",
Labels: map[string]string{
snapshotprotocol.SnapshotAgentLabelKey: snapshotprotocol.SnapshotAgentLabelValue,
},
},
Spec: appsv1.DaemonSetSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{{
Name: snapshotprotocol.SnapshotAgentContainerName,
VolumeMounts: []corev1.VolumeMount{{
Name: "checkpoints",
MountPath: "/checkpoints",
}},
}},
Volumes: []corev1.Volume{{
Name: "checkpoints",
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "snapshot-pvc",
},
},
}},
},
},
},
}
makeDCD := func(checkpointRef string) *v1alpha1.DynamoComponentDeployment { makeDCD := func(checkpointRef string) *v1alpha1.DynamoComponentDeployment {
return &v1alpha1.DynamoComponentDeployment{ return &v1alpha1.DynamoComponentDeployment{
...@@ -1291,6 +1326,7 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe ...@@ -1291,6 +1326,7 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe
} }
makeReconciler := func(objs ...client.Object) *DynamoComponentDeploymentReconciler { makeReconciler := func(objs ...client.Object) *DynamoComponentDeploymentReconciler {
objs = append(objs, snapshotAgentDaemonSet.DeepCopy())
return &DynamoComponentDeploymentReconciler{ return &DynamoComponentDeploymentReconciler{
Client: fake.NewClientBuilder(). Client: fake.NewClientBuilder().
WithScheme(s). WithScheme(s).
...@@ -1299,13 +1335,6 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe ...@@ -1299,13 +1335,6 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe
Config: &configv1alpha1.OperatorConfiguration{ Config: &configv1alpha1.OperatorConfiguration{
Checkpoint: configv1alpha1.CheckpointConfiguration{ Checkpoint: configv1alpha1.CheckpointConfiguration{
Enabled: true, Enabled: true,
Storage: configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypePVC,
PVC: configv1alpha1.CheckpointPVCConfig{
PVCName: "snapshot-pvc",
BasePath: "/checkpoints",
},
},
}, },
}, },
} }
...@@ -1342,8 +1371,8 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe ...@@ -1342,8 +1371,8 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe
if got := podTemplateSpec.Labels[commonconsts.KubeLabelIsRestoreTarget]; got != commonconsts.KubeLabelValueTrue { if got := podTemplateSpec.Labels[commonconsts.KubeLabelIsRestoreTarget]; got != commonconsts.KubeLabelValueTrue {
t.Fatalf("expected %s label to be true, got %q", commonconsts.KubeLabelIsRestoreTarget, got) t.Fatalf("expected %s label to be true, got %q", commonconsts.KubeLabelIsRestoreTarget, got)
} }
if got := podTemplateSpec.Labels[commonconsts.KubeLabelCheckpointHash]; got != checkpointName { if got := podTemplateSpec.Labels[commonconsts.KubeLabelCheckpointID]; got != checkpointName {
t.Fatalf("expected %s to be checkpoint hash, got %q", commonconsts.KubeLabelCheckpointHash, got) t.Fatalf("expected %s to be checkpoint id, got %q", commonconsts.KubeLabelCheckpointID, got)
} }
}) })
...@@ -1428,8 +1457,8 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe ...@@ -1428,8 +1457,8 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe
if _, ok := podTemplateSpec.Labels[commonconsts.KubeLabelIsRestoreTarget]; ok { if _, ok := podTemplateSpec.Labels[commonconsts.KubeLabelIsRestoreTarget]; ok {
t.Fatalf("did not expect %s label when checkpoint is not ready", commonconsts.KubeLabelIsRestoreTarget) t.Fatalf("did not expect %s label when checkpoint is not ready", commonconsts.KubeLabelIsRestoreTarget)
} }
if _, ok := podTemplateSpec.Labels[commonconsts.KubeLabelCheckpointHash]; ok { if _, ok := podTemplateSpec.Labels[commonconsts.KubeLabelCheckpointID]; ok {
t.Fatalf("did not expect %s label when checkpoint is not ready", commonconsts.KubeLabelCheckpointHash) t.Fatalf("did not expect %s label when checkpoint is not ready", commonconsts.KubeLabelCheckpointID)
} }
}) })
} }
...@@ -1481,6 +1510,36 @@ func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy( ...@@ -1481,6 +1510,36 @@ func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy(
} }
makeReconciler := func(objs ...client.Object) *DynamoComponentDeploymentReconciler { makeReconciler := func(objs ...client.Object) *DynamoComponentDeploymentReconciler {
objs = append(objs, &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Name: "snapshot-agent",
Namespace: "default",
Labels: map[string]string{
snapshotprotocol.SnapshotAgentLabelKey: snapshotprotocol.SnapshotAgentLabelValue,
},
},
Spec: appsv1.DaemonSetSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{{
Name: snapshotprotocol.SnapshotAgentContainerName,
VolumeMounts: []corev1.VolumeMount{{
Name: "checkpoints",
MountPath: "/checkpoints",
}},
}},
Volumes: []corev1.Volume{{
Name: "checkpoints",
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "snapshot-pvc",
},
},
}},
},
},
},
})
return &DynamoComponentDeploymentReconciler{ return &DynamoComponentDeploymentReconciler{
Client: fake.NewClientBuilder(). Client: fake.NewClientBuilder().
WithScheme(s). WithScheme(s).
...@@ -1489,13 +1548,6 @@ func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy( ...@@ -1489,13 +1548,6 @@ func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy(
Config: &configv1alpha1.OperatorConfiguration{ Config: &configv1alpha1.OperatorConfiguration{
Checkpoint: configv1alpha1.CheckpointConfiguration{ Checkpoint: configv1alpha1.CheckpointConfiguration{
Enabled: true, Enabled: true,
Storage: configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypePVC,
PVC: configv1alpha1.CheckpointPVCConfig{
PVCName: "snapshot-pvc",
BasePath: "/checkpoints",
},
},
}, },
}, },
} }
......
...@@ -91,6 +91,7 @@ type DynamoGraphDeploymentReconciler struct { ...@@ -91,6 +91,7 @@ type DynamoGraphDeploymentReconciler struct {
// +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list // +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list
// +kubebuilder:rbac:groups=inference.networking.k8s.io,resources=inferencepools,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=inference.networking.k8s.io,resources=inferencepools,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
// +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch
// Reconcile is part of the main kubernetes reconciliation loop which aims to // Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state. // move the current state of the cluster closer to the desired state.
...@@ -547,7 +548,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGrovePodCliqueSet(ctx context ...@@ -547,7 +548,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGrovePodCliqueSet(ctx context
} }
// generate the dynamoComponentsDeployments from the config // generate the dynamoComponentsDeployments from the config
grovePodCliqueSet, err := dynamo.GenerateGrovePodCliqueSet(ctx, dynamoDeployment, r.Config, r.RuntimeConfig, r.DockerSecretRetriever, restartState, existingRestartAnnotations, checkpointInfos) grovePodCliqueSet, err := dynamo.GenerateGrovePodCliqueSet(ctx, dynamoDeployment, r.Config, r.RuntimeConfig, r.Client, r.DockerSecretRetriever, restartState, existingRestartAnnotations, checkpointInfos)
if err != nil { if err != nil {
logger.Error(err, "failed to generate the Grove GangSet") logger.Error(err, "failed to generate the Grove GangSet")
return nil, fmt.Errorf("failed to generate the Grove GangSet: %w", err) return nil, fmt.Errorf("failed to generate the Grove GangSet: %w", err)
......
...@@ -43,6 +43,7 @@ import ( ...@@ -43,6 +43,7 @@ import (
networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1" networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
networkingv1 "k8s.io/api/networking/v1" networkingv1 "k8s.io/api/networking/v1"
ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
) )
// RestartState holds the restart state for DGD services. // RestartState holds the restart state for DGD services.
...@@ -1165,22 +1166,6 @@ func GenerateBasePodSpec( ...@@ -1165,22 +1166,6 @@ func GenerateBasePodSpec(
backend.UpdatePodSpec(&podSpec, numberOfNodes, role, component, serviceName, multinodeDeployer) backend.UpdatePodSpec(&podSpec, numberOfNodes, role, component, serviceName, multinodeDeployer)
// Inject checkpoint configuration if enabled
// This handles ALL checkpoint-related modifications:
// - Command/Args transformation (moves Command to Args to respect image ENTRYPOINT)
// - Security context (hostIPC, privileged mode)
// - Restore/checkpoint pod metadata (labels/annotations)
// - Storage configuration (volumes, mounts)
// CheckpointInfo should have been resolved by ResolveCheckpointForService before calling this function
// Checkpoint config comes from the operator's controller config (Helm values)
var checkpointConfig *configv1alpha1.CheckpointConfiguration
if operatorConfig.Checkpoint.Enabled {
checkpointConfig = &operatorConfig.Checkpoint
}
if err := checkpoint.InjectCheckpointIntoPodSpec(&podSpec, checkpointInfo, checkpointConfig); err != nil {
return nil, fmt.Errorf("failed to inject checkpoint config: %w", err)
}
// Inject auto-generated frontend sidecar if configured // Inject auto-generated frontend sidecar if configured
if component.FrontendSidecar != nil { if component.FrontendSidecar != nil {
sidecar, err := generateFrontendSidecar(component.FrontendSidecar, componentContext, operatorConfig) sidecar, err := generateFrontendSidecar(component.FrontendSidecar, componentContext, operatorConfig)
...@@ -1359,6 +1344,7 @@ func GenerateGrovePodCliqueSet( ...@@ -1359,6 +1344,7 @@ func GenerateGrovePodCliqueSet(
dynamoDeployment *v1alpha1.DynamoGraphDeployment, dynamoDeployment *v1alpha1.DynamoGraphDeployment,
operatorConfig *configv1alpha1.OperatorConfiguration, operatorConfig *configv1alpha1.OperatorConfiguration,
runtimeConfig *controller_common.RuntimeConfig, runtimeConfig *controller_common.RuntimeConfig,
kubeClient ctrlclient.Reader,
secretsRetriever SecretsRetriever, secretsRetriever SecretsRetriever,
restartState *RestartState, restartState *RestartState,
existingRestartAnnotations map[string]string, existingRestartAnnotations map[string]string,
...@@ -1438,6 +1424,17 @@ func GenerateGrovePodCliqueSet( ...@@ -1438,6 +1424,17 @@ func GenerateGrovePodCliqueSet(
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to generate podSpec for role %s: %w", r.Name, err) return nil, fmt.Errorf("failed to generate podSpec for role %s: %w", r.Name, err)
} }
if operatorConfig.Checkpoint.Enabled {
if err := checkpoint.InjectCheckpointIntoPodSpec(
ctx,
kubeClient,
dynamoDeployment.Namespace,
podSpec,
checkpointInfo,
); err != nil {
return nil, fmt.Errorf("failed to inject checkpoint config for role %s: %w", r.Name, err)
}
}
clique := &grovev1alpha1.PodCliqueTemplateSpec{ clique := &grovev1alpha1.PodCliqueTemplateSpec{
Name: strings.ToLower(r.Name), Name: strings.ToLower(r.Name),
......
...@@ -3787,7 +3787,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -3787,7 +3787,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
got, err := GenerateGrovePodCliqueSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, &controller_common.RuntimeConfig{}, nil, nil, nil, nil) got, err := GenerateGrovePodCliqueSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, &controller_common.RuntimeConfig{}, nil, nil, nil, nil, nil)
if (err != nil) != tt.wantErr { if (err != nil) != tt.wantErr {
t.Errorf("GenerateGrovePodCliqueSet() error = %v, wantErr %v", err, tt.wantErr) t.Errorf("GenerateGrovePodCliqueSet() error = %v, wantErr %v", err, tt.wantErr)
return return
...@@ -3848,7 +3848,7 @@ func Test_GeneratePodCliqueSetGlobalDynamoNamespace(t *testing.T) { ...@@ -3848,7 +3848,7 @@ func Test_GeneratePodCliqueSetGlobalDynamoNamespace(t *testing.T) {
}, },
} }
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, &configv1alpha1.OperatorConfiguration{}, &controller_common.RuntimeConfig{}, nil, nil, nil, nil) got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, &configv1alpha1.OperatorConfiguration{}, &controller_common.RuntimeConfig{}, nil, nil, nil, nil, nil)
if !assert.NoError(t, err) { if !assert.NoError(t, err) {
return return
} }
...@@ -4953,7 +4953,7 @@ func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) { ...@@ -4953,7 +4953,7 @@ func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
}, },
} }
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controllerConfig, &controller_common.RuntimeConfig{}, secretsRetriever, nil, nil, nil) got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controllerConfig, &controller_common.RuntimeConfig{}, nil, secretsRetriever, nil, nil, nil)
if err != nil { if err != nil {
t.Errorf("GenerateGrovePodCliqueSet() error = %v", err) t.Errorf("GenerateGrovePodCliqueSet() error = %v", err)
return return
...@@ -6783,7 +6783,7 @@ func TestGenerateGrovePodCliqueSet_RestartAnnotations(t *testing.T) { ...@@ -6783,7 +6783,7 @@ func TestGenerateGrovePodCliqueSet_RestartAnnotations(t *testing.T) {
}, },
} }
got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, controllerConfig, &controller_common.RuntimeConfig{}, nil, tt.restartState, nil, nil) got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, controllerConfig, &controller_common.RuntimeConfig{}, nil, nil, tt.restartState, nil, nil)
if err != nil { if err != nil {
t.Fatalf("GenerateGrovePodCliqueSet() error = %v", err) t.Fatalf("GenerateGrovePodCliqueSet() error = %v", err)
} }
...@@ -6864,8 +6864,8 @@ func TestGenerateLabels_RemovesStaleRestoreLabelsWhenCheckpointNotReady(t *testi ...@@ -6864,8 +6864,8 @@ func TestGenerateLabels_RemovesStaleRestoreLabelsWhenCheckpointNotReady(t *testi
}, },
ExtraPodMetadata: &v1alpha1.ExtraPodMetadata{ ExtraPodMetadata: &v1alpha1.ExtraPodMetadata{
Labels: map[string]string{ Labels: map[string]string{
"extra-label": "keep-too", "extra-label": "keep-too",
commonconsts.KubeLabelCheckpointHash: "stale-hash", commonconsts.KubeLabelCheckpointID: "stale-hash",
}, },
}, },
}, },
...@@ -6884,7 +6884,7 @@ func TestGenerateLabels_RemovesStaleRestoreLabelsWhenCheckpointNotReady(t *testi ...@@ -6884,7 +6884,7 @@ func TestGenerateLabels_RemovesStaleRestoreLabelsWhenCheckpointNotReady(t *testi
assert.Equal(t, "keep", labels["user-label"]) assert.Equal(t, "keep", labels["user-label"])
assert.Equal(t, "keep-too", labels["extra-label"]) assert.Equal(t, "keep-too", labels["extra-label"])
_, hasRestoreTarget := labels[commonconsts.KubeLabelIsRestoreTarget] _, hasRestoreTarget := labels[commonconsts.KubeLabelIsRestoreTarget]
_, hasCheckpointHash := labels[commonconsts.KubeLabelCheckpointHash] _, hasCheckpointHash := labels[commonconsts.KubeLabelCheckpointID]
assert.False(t, hasRestoreTarget) assert.False(t, hasRestoreTarget)
assert.False(t, hasCheckpointHash) assert.False(t, hasCheckpointHash)
} }
...@@ -6899,7 +6899,7 @@ func TestGenerateLabels_OverwritesStaleRestoreLabelsWhenCheckpointReady(t *testi ...@@ -6899,7 +6899,7 @@ func TestGenerateLabels_OverwritesStaleRestoreLabelsWhenCheckpointReady(t *testi
}, },
ExtraPodMetadata: &v1alpha1.ExtraPodMetadata{ ExtraPodMetadata: &v1alpha1.ExtraPodMetadata{
Labels: map[string]string{ Labels: map[string]string{
commonconsts.KubeLabelCheckpointHash: "stale-hash", commonconsts.KubeLabelCheckpointID: "stale-hash",
}, },
}, },
}, },
...@@ -6916,7 +6916,7 @@ func TestGenerateLabels_OverwritesStaleRestoreLabelsWhenCheckpointReady(t *testi ...@@ -6916,7 +6916,7 @@ func TestGenerateLabels_OverwritesStaleRestoreLabelsWhenCheckpointReady(t *testi
Hash: "resolved-hash", Hash: "resolved-hash",
}) })
assert.Equal(t, commonconsts.KubeLabelValueTrue, labels[commonconsts.KubeLabelIsRestoreTarget]) assert.Equal(t, commonconsts.KubeLabelValueTrue, labels[commonconsts.KubeLabelIsRestoreTarget])
assert.Equal(t, "resolved-hash", labels[commonconsts.KubeLabelCheckpointHash]) assert.Equal(t, "resolved-hash", labels[commonconsts.KubeLabelCheckpointID])
} }
func TestGenerateLabels_ReassertsRestoreIdentityLabelsAfterMetadataMerge(t *testing.T) { func TestGenerateLabels_ReassertsRestoreIdentityLabelsAfterMetadataMerge(t *testing.T) {
...@@ -7459,7 +7459,7 @@ func TestGenerateGrovePodCliqueSet_SpecMetadataPropagation(t *testing.T) { ...@@ -7459,7 +7459,7 @@ func TestGenerateGrovePodCliqueSet_SpecMetadataPropagation(t *testing.T) {
}, },
} }
pcs, err := GenerateGrovePodCliqueSet(context.Background(), dgd, &configv1alpha1.OperatorConfiguration{}, &controller_common.RuntimeConfig{}, nil, nil, nil, nil) pcs, err := GenerateGrovePodCliqueSet(context.Background(), dgd, &configv1alpha1.OperatorConfiguration{}, &controller_common.RuntimeConfig{}, nil, nil, nil, nil, nil)
require.NoError(t, err) require.NoError(t, err)
// PCS object-level metadata // PCS object-level metadata
...@@ -7627,6 +7627,7 @@ func TestGenerateGrovePodCliqueSet_TopologyConstraints(t *testing.T) { ...@@ -7627,6 +7627,7 @@ func TestGenerateGrovePodCliqueSet_TopologyConstraints(t *testing.T) {
tt.deployment, tt.deployment,
operatorConfig, operatorConfig,
&controller_common.RuntimeConfig{}, &controller_common.RuntimeConfig{},
nil,
secretsRetriever, secretsRetriever,
&RestartState{}, &RestartState{},
nil, nil,
......
...@@ -8,7 +8,7 @@ import ( ...@@ -8,7 +8,7 @@ import (
"gopkg.in/yaml.v3" "gopkg.in/yaml.v3"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// ConfigMapPath is the default path where the ConfigMap is mounted. // ConfigMapPath is the default path where the ConfigMap is mounted.
......
...@@ -12,9 +12,9 @@ import ( ...@@ -12,9 +12,9 @@ import (
"github.com/containerd/containerd" "github.com/containerd/containerd"
"github.com/go-logr/logr" "github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/controller"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/controller" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
) )
func main() { func main() {
...@@ -29,7 +29,7 @@ func main() { ...@@ -29,7 +29,7 @@ func main() {
fatal(agentLog, err, "Invalid configuration") fatal(agentLog, err, "Invalid configuration")
} }
ctrd, err := containerd.New(common.ContainerdSocket) ctrd, err := containerd.New(snapshotruntime.ContainerdSocket)
if err != nil { if err != nil {
fatal(agentLog, err, "Failed to connect to containerd") fatal(agentLog, err, "Failed to connect to containerd")
} }
......
...@@ -8,8 +8,8 @@ import ( ...@@ -8,8 +8,8 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/executor" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/executor"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
) )
func main() { func main() {
......
# `snapshotctl`
`snapshotctl` is a lower-level snapshot utility for developers and operators.
It is not the primary Dynamo user workflow. The normal user-facing path is:
```text
DynamoCheckpoint CR -> operator -> snapshot-agent
```
Use `snapshotctl` when you want to exercise checkpoint or restore behavior
directly from a worker pod manifest without going through the operator.
## Requirements
- the snapshot Helm chart must already be installed in the target namespace
- a `snapshot-agent` DaemonSet must be running in that namespace
- the namespace must already have the checkpoint PVC mounted by the agent
## Manifest requirements
`snapshotctl checkpoint --manifest ...` and `snapshotctl restore --manifest ...`
accept a Kubernetes `Pod` manifest, not a Deployment or Job manifest.
That pod manifest must:
- describe the worker pod you want to checkpoint or restore
- contain exactly one worker container
- use the placeholder image for checkpoint-aware flows
- match the runtime-relevant worker settings you care about preserving
In practice, start from the real worker pod spec you would normally run, then
keep only the pod-level fields needed to recreate that worker accurately.
## Commands
Checkpoint from a manifest:
```bash
snapshotctl checkpoint \
--manifest ./worker-pod.yaml \
--namespace ${NAMESPACE}
```
If `--checkpoint-id` is omitted, `snapshotctl` generates one.
Restore by creating a new pod from a manifest:
```bash
snapshotctl restore \
--manifest ./worker-pod.yaml \
--namespace ${NAMESPACE} \
--checkpoint-id manual-snapshot-123
```
Restore an existing snapshot-compatible pod in place:
```bash
snapshotctl restore \
--pod existing-restore-target \
--namespace ${NAMESPACE} \
--checkpoint-id manual-snapshot-123
```
## Notes
- `restore --pod` expects a pod that is already compatible with snapshot restore
- `restore --manifest` creates a new restore target pod from the manifest you provide
- `snapshotctl` is useful for debugging and lower-level validation, but it does
not replace the operator-managed checkpoint flow
package main
import (
"context"
"fmt"
"strings"
"time"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
)
const defaultGeneratedCheckpointIDPrefix = "manual-snapshot"
type checkpointOptions struct {
ManifestPath string
Namespace string
KubeContext string
CheckpointID string
DisableCudaCheckpointJobFile bool
Timeout time.Duration
}
type result struct {
Name string
Namespace string
CheckpointID string
CheckpointLocation string
CheckpointJob string
RestorePod string
Status string
}
func runCheckpointFlow(ctx context.Context, opts checkpointOptions) (*result, error) {
if strings.TrimSpace(opts.ManifestPath) == "" {
return nil, fmt.Errorf("missing required flags: --manifest")
}
if opts.Timeout <= 0 {
return nil, fmt.Errorf("--timeout must be greater than zero")
}
pod, clientset, namespace, storage, err := loadRunContext(ctx, opts.ManifestPath, opts.Namespace, opts.KubeContext)
if err != nil {
return nil, err
}
checkpointID := strings.TrimSpace(opts.CheckpointID)
if checkpointID == "" {
checkpointID = fmt.Sprintf("%s-%d", defaultGeneratedCheckpointIDPrefix, time.Now().UTC().UnixNano())
}
resolvedStorage, err := snapshotprotocol.ResolveCheckpointStorage(checkpointID, "", snapshotprotocol.Storage{
Type: snapshotprotocol.StorageTypePVC,
PVCName: storage.PVCName,
BasePath: storage.BasePath,
})
if err != nil {
return nil, err
}
checkpointJobName := pod.Name + "-checkpoint"
job, err := snapshotprotocol.NewCheckpointJob(&corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: pod.Labels,
Annotations: pod.Annotations,
},
Spec: *pod.Spec.DeepCopy(),
}, snapshotprotocol.CheckpointJobOptions{
Namespace: namespace,
CheckpointID: checkpointID,
ArtifactVersion: snapshotprotocol.DefaultCheckpointArtifactVersion,
SeccompProfile: snapshotprotocol.DefaultSeccompLocalhostProfile,
Name: checkpointJobName,
WrapLaunchJob: !opts.DisableCudaCheckpointJobFile,
})
if err != nil {
return nil, err
}
_, err = clientset.BatchV1().Jobs(namespace).Create(ctx, job, metav1.CreateOptions{})
if apierrors.IsAlreadyExists(err) {
return nil, fmt.Errorf("checkpoint job %s/%s already exists", namespace, checkpointJobName)
}
if err != nil {
return nil, err
}
waitCtx, cancel := context.WithTimeout(ctx, opts.Timeout)
defer cancel()
status, err := waitForCheckpoint(waitCtx, clientset, namespace, checkpointJobName)
if err != nil {
return nil, err
}
return &result{
Name: pod.Name,
Namespace: namespace,
CheckpointID: checkpointID,
CheckpointLocation: resolvedStorage.Location,
CheckpointJob: checkpointJobName,
Status: status,
}, nil
}
func waitForCheckpoint(ctx context.Context, clientset kubernetes.Interface, namespace string, jobName string) (string, error) {
var status string
if err := wait.PollUntilContextCancel(ctx, 2*time.Second, true, func(ctx context.Context) (bool, error) {
job, err := clientset.BatchV1().Jobs(namespace).Get(ctx, jobName, metav1.GetOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
return false, nil
}
return false, fmt.Errorf("get checkpoint job %s/%s: %w", namespace, jobName, err)
}
status = strings.TrimSpace(job.Annotations[snapshotprotocol.CheckpointStatusAnnotation])
if status == snapshotprotocol.CheckpointStatusCompleted {
return true, nil
}
if status == snapshotprotocol.CheckpointStatusFailed {
return false, fmt.Errorf("checkpoint job %s/%s failed", namespace, jobName)
}
if job.Status.Failed > 0 {
return false, fmt.Errorf("checkpoint job %s/%s failed", namespace, jobName)
}
for _, condition := range job.Status.Conditions {
if condition.Status != corev1.ConditionTrue {
continue
}
if condition.Type == batchv1.JobFailed {
return false, fmt.Errorf("checkpoint job %s/%s failed: %s", namespace, jobName, strings.TrimSpace(condition.Message))
}
}
return false, nil
}); err != nil {
if !wait.Interrupted(err) {
return "", err
}
summaryCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
pods, err := clientset.CoreV1().Pods(namespace).List(summaryCtx, metav1.ListOptions{
LabelSelector: "batch.kubernetes.io/job-name=" + jobName,
})
summary := "no checkpoint pod created yet"
if err != nil {
summary = "unable to list checkpoint pod: " + err.Error()
} else if len(pods.Items) != 0 {
pod := pods.Items[0]
parts := []string{
fmt.Sprintf("job_status=%q", status),
fmt.Sprintf("pod=%s phase=%s", pod.Name, pod.Status.Phase),
}
for _, condition := range pod.Status.Conditions {
if condition.Status == corev1.ConditionTrue || condition.Status == corev1.ConditionFalse {
parts = append(parts, fmt.Sprintf("%s=%s", condition.Type, condition.Status))
}
}
for _, status := range pod.Status.ContainerStatuses {
if status.State.Waiting != nil {
parts = append(parts, fmt.Sprintf("container=%s waiting=%s", status.Name, status.State.Waiting.Reason))
}
if status.State.Terminated != nil {
parts = append(parts, fmt.Sprintf("container=%s terminated=%s", status.Name, status.State.Terminated.Reason))
}
}
summary = strings.Join(parts, " ")
}
return "", fmt.Errorf("checkpoint job %s/%s timed out: %s", namespace, jobName, summary)
}
return status, nil
}
package main
import (
"context"
"fmt"
"os"
"strings"
"time"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
"sigs.k8s.io/yaml"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
)
func loadRunContext(ctx context.Context, manifestPath string, namespaceOverride string, kubeContext string) (*corev1.Pod, kubernetes.Interface, string, snapshotprotocol.Storage, error) {
pod, err := loadPod(manifestPath)
if err != nil {
return nil, nil, "", snapshotprotocol.Storage{}, err
}
clientset, currentNamespace, err := loadClientset(kubeContext)
if err != nil {
return nil, nil, "", snapshotprotocol.Storage{}, err
}
namespace := currentNamespace
if namespace == "" {
namespace = corev1.NamespaceDefault
}
if pod.Namespace != "" {
namespace = pod.Namespace
}
if namespaceOverride != "" {
namespace = namespaceOverride
}
storage, err := discoverSnapshotStorage(ctx, clientset, namespace)
if err != nil {
return nil, nil, "", snapshotprotocol.Storage{}, err
}
return pod, clientset, namespace, storage, nil
}
func loadClientset(kubeContext string) (kubernetes.Interface, string, error) {
loadingRules := clientcmd.NewDefaultClientConfigLoadingRules()
clientConfig := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(loadingRules, &clientcmd.ConfigOverrides{
CurrentContext: strings.TrimSpace(kubeContext),
})
restConfig, err := clientConfig.ClientConfig()
if err != nil {
return nil, "", fmt.Errorf("load kubeconfig: %w", err)
}
restConfig.Timeout = 30 * time.Second
namespace, _, err := clientConfig.Namespace()
if err != nil {
return nil, "", fmt.Errorf("resolve current namespace: %w", err)
}
if strings.TrimSpace(namespace) == "" {
namespace = corev1.NamespaceDefault
}
clientset, err := kubernetes.NewForConfig(restConfig)
if err != nil {
return nil, "", fmt.Errorf("create kubernetes client: %w", err)
}
return clientset, namespace, nil
}
func discoverSnapshotStorage(ctx context.Context, clientset kubernetes.Interface, namespace string) (snapshotprotocol.Storage, error) {
daemonSets, err := clientset.AppsV1().DaemonSets(namespace).List(ctx, metav1.ListOptions{
LabelSelector: snapshotprotocol.SnapshotAgentLabelSelector,
})
if err != nil {
return snapshotprotocol.Storage{}, fmt.Errorf("list snapshot-agent daemonsets in %s: %w", namespace, err)
}
return snapshotprotocol.DiscoverStorageFromDaemonSets(namespace, daemonSets.Items)
}
func loadPod(manifestPath string) (*corev1.Pod, error) {
content, err := os.ReadFile(manifestPath)
if err != nil {
return nil, fmt.Errorf("read manifest %s: %w", manifestPath, err)
}
var pod corev1.Pod
if err := yaml.Unmarshal(content, &pod); err != nil {
return nil, fmt.Errorf("parse manifest %s: %w", manifestPath, err)
}
if kind := strings.TrimSpace(pod.Kind); kind != "" && kind != "Pod" {
return nil, fmt.Errorf("manifest %s is kind %q, expected Pod", manifestPath, kind)
}
if len(pod.Spec.Containers) != 1 {
return nil, fmt.Errorf(
"manifest %s has %d containers; snapshotctl requires exactly one worker container",
manifestPath,
len(pod.Spec.Containers),
)
}
if strings.TrimSpace(pod.Spec.Containers[0].Image) == "" {
return nil, fmt.Errorf("manifest %s: worker container image is required", manifestPath)
}
if strings.TrimSpace(pod.Name) == "" {
return nil, fmt.Errorf("manifest %s: metadata.name is required", manifestPath)
}
pod.Namespace = strings.TrimSpace(pod.Namespace)
return &pod, nil
}
package main
import (
"context"
"flag"
"fmt"
"os"
"time"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
)
var snapshotctlLog = logging.ConfigureLogger("stderr").WithName("snapshotctl")
func main() {
if err := run(os.Args[1:]); err != nil {
snapshotctlLog.Error(err, "snapshotctl failed")
os.Exit(1)
}
}
func run(args []string) error {
if len(args) == 0 {
printUsage()
return nil
}
switch args[0] {
case "checkpoint":
return runCheckpoint(args[1:])
case "restore":
return runRestore(args[1:])
case "help", "-h", "--help":
printUsage()
return nil
default:
return fmt.Errorf("unknown subcommand %q", args[0])
}
}
func runCheckpoint(args []string) error {
flags := flag.NewFlagSet("checkpoint", flag.ContinueOnError)
flags.SetOutput(os.Stderr)
manifest := flags.String("manifest", "", "Path to a worker Pod manifest")
namespace := flags.String("namespace", "", "Namespace override; defaults to the manifest namespace or current kube context namespace")
kubeContext := flags.String("kube-context", "", "Kubernetes context override")
checkpointID := flags.String("checkpoint-id", "", "Explicit checkpoint ID; defaults to a generated value")
disableCudaCheckpointJobFile := flags.Bool("disable-cuda-checkpoint-job-file", false, "Preserve the manifest command instead of wrapping it with cuda-checkpoint --launch-job")
timeout := flags.Duration("timeout", 45*time.Minute, "Maximum time to wait for checkpoint completion")
if err := flags.Parse(args); err != nil {
return err
}
if len(flags.Args()) != 0 {
return fmt.Errorf("unexpected arguments: %v", flags.Args())
}
if *manifest == "" {
return fmt.Errorf("--manifest is required")
}
snapshotctlLog.Info("Running checkpoint", "manifest", *manifest, "namespace", *namespace)
result, err := runCheckpointFlow(context.Background(), checkpointOptions{
ManifestPath: *manifest,
Namespace: *namespace,
KubeContext: *kubeContext,
CheckpointID: *checkpointID,
DisableCudaCheckpointJobFile: *disableCudaCheckpointJobFile,
Timeout: *timeout,
})
if err != nil {
return err
}
snapshotctlLog.Info("Checkpoint completed", "job", result.CheckpointJob, "checkpoint_id", result.CheckpointID)
fmt.Printf("status=%s\n", result.Status)
fmt.Printf("namespace=%s\n", result.Namespace)
fmt.Printf("name=%s\n", result.Name)
fmt.Printf("checkpoint_job=%s\n", result.CheckpointJob)
fmt.Printf("checkpoint_id=%s\n", result.CheckpointID)
fmt.Printf("checkpoint_location=%s\n", result.CheckpointLocation)
return nil
}
func runRestore(args []string) error {
flags := flag.NewFlagSet("restore", flag.ContinueOnError)
flags.SetOutput(os.Stderr)
manifest := flags.String("manifest", "", "Path to a worker Pod manifest used to create a new restore pod")
podName := flags.String("pod", "", "Existing restore target pod name")
namespace := flags.String("namespace", "", "Namespace override; defaults to the manifest namespace or current kube context namespace")
kubeContext := flags.String("kube-context", "", "Kubernetes context override")
checkpointID := flags.String("checkpoint-id", "", "Checkpoint ID to restore")
timeout := flags.Duration("timeout", 45*time.Minute, "Maximum time to wait for restore completion")
if err := flags.Parse(args); err != nil {
return err
}
if len(flags.Args()) != 0 {
return fmt.Errorf("unexpected arguments: %v", flags.Args())
}
if (*manifest == "") == (*podName == "") {
return fmt.Errorf("must specify exactly one of --manifest or --pod")
}
snapshotctlLog.Info("Running restore", "manifest", *manifest, "pod", *podName, "namespace", *namespace, "checkpoint_id", *checkpointID)
result, err := runRestoreFlow(context.Background(), restoreOptions{
ManifestPath: *manifest,
PodName: *podName,
Namespace: *namespace,
KubeContext: *kubeContext,
CheckpointID: *checkpointID,
Timeout: *timeout,
})
if err != nil {
return err
}
snapshotctlLog.Info("Restore completed", "pod", result.RestorePod, "checkpoint_id", result.CheckpointID)
fmt.Printf("status=%s\n", result.Status)
fmt.Printf("namespace=%s\n", result.Namespace)
fmt.Printf("name=%s\n", result.Name)
fmt.Printf("restore_pod=%s\n", result.RestorePod)
fmt.Printf("checkpoint_id=%s\n", result.CheckpointID)
fmt.Printf("checkpoint_location=%s\n", result.CheckpointLocation)
return nil
}
func printUsage() {
fmt.Fprintf(os.Stderr, `snapshotctl runs snapshot checkpoint and restore flows from a worker Pod manifest.
Subcommands:
checkpoint
restore
Examples:
snapshotctl checkpoint --manifest /tmp/vllm-worker-pod.yaml
snapshotctl restore --manifest /tmp/sglang-worker-pod.yaml --checkpoint-id manual-snapshot-123
snapshotctl restore --pod existing-restore-target --checkpoint-id manual-snapshot-123
`)
}
package main
import (
"context"
"encoding/json"
"fmt"
"strings"
"time"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
)
type restoreOptions struct {
ManifestPath string
PodName string
Namespace string
KubeContext string
CheckpointID string
Timeout time.Duration
}
func runRestoreFlow(ctx context.Context, opts restoreOptions) (*result, error) {
createPodFromManifest := strings.TrimSpace(opts.ManifestPath) != ""
targetExistingPod := strings.TrimSpace(opts.PodName) != ""
if createPodFromManifest == targetExistingPod {
return nil, fmt.Errorf("restore requires exactly one of --manifest or --pod")
}
if strings.TrimSpace(opts.CheckpointID) == "" {
return nil, fmt.Errorf("missing required flags: --checkpoint-id")
}
if opts.Timeout <= 0 {
return nil, fmt.Errorf("--timeout must be greater than zero")
}
checkpointID := strings.TrimSpace(opts.CheckpointID)
clientset, currentNamespace, err := loadClientset(opts.KubeContext)
if err != nil {
return nil, err
}
namespace := currentNamespace
if namespace == "" {
namespace = corev1.NamespaceDefault
}
if strings.TrimSpace(opts.Namespace) != "" {
namespace = strings.TrimSpace(opts.Namespace)
}
podName := strings.TrimSpace(opts.PodName)
pod := &corev1.Pod{}
if createPodFromManifest {
pod, err = loadPod(opts.ManifestPath)
if err != nil {
return nil, err
}
if strings.TrimSpace(pod.Namespace) != "" && strings.TrimSpace(opts.Namespace) == "" {
namespace = strings.TrimSpace(pod.Namespace)
}
podName = pod.Name
}
storage, err := discoverSnapshotStorage(ctx, clientset, namespace)
if err != nil {
return nil, err
}
resolvedStorage, err := snapshotprotocol.ResolveRestoreStorage(checkpointID, snapshotprotocol.DefaultCheckpointArtifactVersion, "", snapshotprotocol.Storage{
Type: snapshotprotocol.StorageTypePVC,
PVCName: storage.PVCName,
BasePath: storage.BasePath,
})
if err != nil {
return nil, err
}
if createPodFromManifest {
restorePod := snapshotprotocol.NewRestorePod(&corev1.Pod{
TypeMeta: metav1.TypeMeta{APIVersion: "v1", Kind: "Pod"},
ObjectMeta: metav1.ObjectMeta{
Name: pod.Name,
Labels: pod.Labels,
Annotations: pod.Annotations,
},
Spec: *pod.Spec.DeepCopy(),
}, snapshotprotocol.PodOptions{
Namespace: namespace,
CheckpointID: checkpointID,
ArtifactVersion: snapshotprotocol.DefaultCheckpointArtifactVersion,
Storage: resolvedStorage,
SeccompProfile: snapshotprotocol.DefaultSeccompLocalhostProfile,
})
_, err = clientset.CoreV1().Pods(namespace).Create(ctx, restorePod, metav1.CreateOptions{})
if apierrors.IsAlreadyExists(err) {
return nil, fmt.Errorf("restore pod %s/%s already exists", namespace, pod.Name)
}
if err != nil {
return nil, err
}
} else {
pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{})
if err != nil {
return nil, fmt.Errorf("get restore target pod %s/%s: %w", namespace, podName, err)
}
if len(pod.Spec.Containers) == 0 {
return nil, fmt.Errorf("restore target pod %s/%s has no containers", namespace, podName)
}
if err := snapshotprotocol.ValidateRestorePodSpec(&pod.Spec, resolvedStorage, snapshotprotocol.DefaultSeccompLocalhostProfile); err != nil {
return nil, fmt.Errorf("restore target pod %s/%s is not snapshot-compatible: %w", namespace, podName, err)
}
labels := map[string]string{}
for key, value := range pod.Labels {
labels[key] = value
}
annotations := map[string]string{}
for key, value := range pod.Annotations {
annotations[key] = value
}
snapshotprotocol.ApplyRestoreTargetMetadata(labels, annotations, true, checkpointID, snapshotprotocol.DefaultCheckpointArtifactVersion)
patch, err := json.Marshal(map[string]any{
"metadata": map[string]any{
"labels": labels,
"annotations": annotations,
},
})
if err != nil {
return nil, fmt.Errorf("encode restore target metadata patch: %w", err)
}
if _, err := clientset.CoreV1().Pods(namespace).Patch(ctx, podName, types.MergePatchType, patch, metav1.PatchOptions{}); err != nil {
return nil, fmt.Errorf("patch restore target pod %s/%s: %w", namespace, podName, err)
}
}
waitCtx, cancel := context.WithTimeout(ctx, opts.Timeout)
defer cancel()
status, err := waitForRestore(waitCtx, clientset, namespace, podName)
if err != nil {
return nil, err
}
return &result{
Name: podName,
Namespace: namespace,
CheckpointID: checkpointID,
CheckpointLocation: resolvedStorage.Location,
RestorePod: podName,
Status: status,
}, nil
}
func waitForRestore(ctx context.Context, clientset kubernetes.Interface, namespace string, podName string) (string, error) {
var status string
if err := wait.PollUntilContextCancel(ctx, 2*time.Second, true, func(ctx context.Context) (bool, error) {
pod, err := clientset.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{})
if err != nil {
return false, fmt.Errorf("get restore pod %s/%s: %w", namespace, podName, err)
}
status = strings.TrimSpace(pod.Annotations[snapshotprotocol.RestoreStatusAnnotation])
if status == snapshotprotocol.RestoreStatusCompleted {
return true, nil
}
if status == snapshotprotocol.RestoreStatusFailed {
return false, fmt.Errorf("restore pod %s/%s failed", namespace, podName)
}
if pod.Status.Phase == corev1.PodFailed {
return false, fmt.Errorf("restore pod %s/%s entered phase Failed (%s)", namespace, podName, pod.Status.Reason)
}
return false, nil
}); err != nil {
if !wait.Interrupted(err) {
return "", err
}
return "", fmt.Errorf("restore pod %s/%s timed out: status=%q", namespace, podName, status)
}
return status, nil
}
...@@ -8,6 +8,7 @@ require ( ...@@ -8,6 +8,7 @@ require (
github.com/cyphar/filepath-securejoin v0.5.1 github.com/cyphar/filepath-securejoin v0.5.1
github.com/go-logr/logr v1.4.3 github.com/go-logr/logr v1.4.3
github.com/go-logr/zapr v1.3.0 github.com/go-logr/zapr v1.3.0
github.com/google/uuid v1.6.0
github.com/moby/sys/mountinfo v0.7.1 github.com/moby/sys/mountinfo v0.7.1
github.com/opencontainers/runtime-spec v1.2.0 github.com/opencontainers/runtime-spec v1.2.0
github.com/prometheus/procfs v0.16.1 github.com/prometheus/procfs v0.16.1
...@@ -16,10 +17,13 @@ require ( ...@@ -16,10 +17,13 @@ require (
google.golang.org/grpc v1.72.2 google.golang.org/grpc v1.72.2
google.golang.org/protobuf v1.36.11 google.golang.org/protobuf v1.36.11
gopkg.in/yaml.v3 v3.0.1 gopkg.in/yaml.v3 v3.0.1
k8s.io/api v0.35.0 k8s.io/api v0.34.3
k8s.io/apimachinery v0.35.0 k8s.io/apimachinery v0.34.3
k8s.io/client-go v0.35.0 k8s.io/client-go v0.34.3
k8s.io/kubelet v0.35.0 k8s.io/kubelet v0.34.3
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4
sigs.k8s.io/controller-runtime v0.22.4
sigs.k8s.io/yaml v1.6.0
) )
require ( require (
...@@ -41,6 +45,7 @@ require ( ...@@ -41,6 +45,7 @@ require (
github.com/distribution/reference v0.6.0 // indirect github.com/distribution/reference v0.6.0 // indirect
github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c // indirect github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c // indirect
github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect
github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect
...@@ -51,7 +56,6 @@ require ( ...@@ -51,7 +56,6 @@ require (
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/google/gnostic-models v0.7.0 // indirect github.com/google/gnostic-models v0.7.0 // indirect
github.com/google/go-cmp v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/josharian/intern v1.0.0 // indirect github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.18.0 // indirect github.com/klauspost/compress v1.18.0 // indirect
...@@ -70,6 +74,7 @@ require ( ...@@ -70,6 +74,7 @@ require (
github.com/pkg/errors v0.9.1 // indirect github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect github.com/sirupsen/logrus v1.9.3 // indirect
github.com/spf13/pflag v1.0.9 // indirect
github.com/x448/float16 v0.8.4 // indirect github.com/x448/float16 v0.8.4 // indirect
go.opencensus.io v0.24.0 // indirect go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect
...@@ -93,10 +98,8 @@ require ( ...@@ -93,10 +98,8 @@ require (
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/inf.v0 v0.9.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
) )
...@@ -4,12 +4,12 @@ github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h ...@@ -4,12 +4,12 @@ github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h
github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0 h1:59MxjQVfjXsBpLy+dbd2/ELV5ofnUkUZBvWSC85sheA= github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0 h1:59MxjQVfjXsBpLy+dbd2/ELV5ofnUkUZBvWSC85sheA=
github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0/go.mod h1:OahwfttHWG6eJ0clwcfBAHoDI6X/LV/15hx/wlMZSrU= github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0/go.mod h1:OahwfttHWG6eJ0clwcfBAHoDI6X/LV/15hx/wlMZSrU=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
github.com/Microsoft/hcsshim v0.11.7 h1:vl/nj3Bar/CvJSYo7gIQPyRWc9f3c6IeSNavBTSZNZQ= github.com/Microsoft/hcsshim v0.11.7 h1:vl/nj3Bar/CvJSYo7gIQPyRWc9f3c6IeSNavBTSZNZQ=
github.com/Microsoft/hcsshim v0.11.7/go.mod h1:MV8xMfmECjl5HdO7U/3/hFVnkmSBjAjmA09d4bExKcU= github.com/Microsoft/hcsshim v0.11.7/go.mod h1:MV8xMfmECjl5HdO7U/3/hFVnkmSBjAjmA09d4bExKcU=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
...@@ -53,6 +53,8 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF ...@@ -53,6 +53,8 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU=
github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM=
...@@ -105,8 +107,8 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ ...@@ -105,8 +107,8 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo=
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144=
github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
...@@ -147,10 +149,10 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd ...@@ -147,10 +149,10 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg=
github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo=
github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw=
github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
...@@ -163,7 +165,13 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= ...@@ -163,7 +165,13 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q=
github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io=
github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I=
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
...@@ -223,8 +231,6 @@ golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvx ...@@ -223,8 +231,6 @@ golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvx
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA=
golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
...@@ -318,25 +324,29 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= ...@@ -318,25 +324,29 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= k8s.io/api v0.34.3 h1:D12sTP257/jSH2vHV2EDYrb16bS7ULlHpdNdNhEw2S4=
k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= k8s.io/api v0.34.3/go.mod h1:PyVQBF886Q5RSQZOim7DybQjAbVs8g7gwJNhGtY5MBk=
k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI=
k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc=
k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= k8s.io/apimachinery v0.34.3 h1:/TB+SFEiQvN9HPldtlWOTp0hWbJ+fjU+wkxysf/aQnE=
k8s.io/client-go v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= k8s.io/apimachinery v0.34.3/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
k8s.io/client-go v0.34.3 h1:wtYtpzy/OPNYf7WyNBTj3iUA0XaBHVqhv4Iv3tbrF5A=
k8s.io/client-go v0.34.3/go.mod h1:OxxeYagaP9Kdf78UrKLa3YZixMCfP6bgPwPwNBQBzpM=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE=
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts=
k8s.io/kubelet v0.35.0 h1:8cgJHCBCKLYuuQ7/Pxb/qWbJfX1LXIw7790ce9xHq7c= k8s.io/kubelet v0.34.3 h1:8QRev2FmasZ05yCC774qn6ULche72PYM7AQv0CVt9CM=
k8s.io/kubelet v0.35.0/go.mod h1:ciRzAXn7C4z5iB7FhG1L2CGPPXLTVCABDlbXt/Zz8YA= k8s.io/kubelet v0.34.3/go.mod h1:pMgblr+nVQ02UkyaTcgqzS3AIYVQkjlMFg1Pd5rGC1Q=
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck=
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A=
sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=
sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 h1:2WOzJpHUBVrrkDjU4KBT8n5LDcj824eX0I5UKcgeRUs= sigs.k8s.io/structured-merge-diff/v6 v6.3.1 h1:JrhdFMqOd/+3ByqlP2I45kTOZmTRLBUm5pvRjeheg7E=
sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= sigs.k8s.io/structured-merge-diff/v6 v6.3.1/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=
...@@ -17,7 +17,8 @@ import ( ...@@ -17,7 +17,8 @@ import (
"k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/kubernetes/fake"
clientgotesting "k8s.io/client-go/testing" clientgotesting "k8s.io/client-go/testing"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
) )
const testNodeName = "test-node" const testNodeName = "test-node"
...@@ -31,6 +32,10 @@ func makeTestController(t *testing.T, objs ...runtime.Object) *NodeController { ...@@ -31,6 +32,10 @@ func makeTestController(t *testing.T, objs ...runtime.Object) *NodeController {
return &NodeController{ return &NodeController{
config: &types.AgentConfig{ config: &types.AgentConfig{
NodeName: testNodeName, NodeName: testNodeName,
Storage: types.StorageSpec{
Type: snapshotprotocol.StorageTypePVC,
BasePath: t.TempDir(),
},
}, },
clientset: fake.NewClientset(objs...), clientset: fake.NewClientset(objs...),
log: testr.New(t), log: testr.New(t),
...@@ -187,11 +192,11 @@ func TestReconcileCheckpointPod(t *testing.T) { ...@@ -187,11 +192,11 @@ func TestReconcileCheckpointPod(t *testing.T) {
for _, tc := range tests { for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
labels := map[string]string{ labels := map[string]string{
kubeLabelIsCheckpointSource: "true", snapshotprotocol.CheckpointSourceLabel: "true",
"batch.kubernetes.io/job-name": "checkpoint-job", "batch.kubernetes.io/job-name": "checkpoint-job",
} }
if tc.hash != "" { if tc.hash != "" {
labels[kubeLabelCheckpointHash] = tc.hash labels[snapshotprotocol.CheckpointIDLabel] = tc.hash
} }
job := &batchv1.Job{ job := &batchv1.Job{
...@@ -202,18 +207,11 @@ func TestReconcileCheckpointPod(t *testing.T) { ...@@ -202,18 +207,11 @@ func TestReconcileCheckpointPod(t *testing.T) {
} }
if tc.annotation != "" { if tc.annotation != "" {
job.Annotations = map[string]string{ job.Annotations = map[string]string{
kubeAnnotationCheckpointStatus: tc.annotation, snapshotprotocol.CheckpointStatusAnnotation: tc.annotation,
} }
} }
var annotations map[string]string pod := makePod("test-pod", "default", tc.nodeName, tc.phase, tc.ready, labels, nil)
if tc.hash != "" {
annotations = map[string]string{
kubeAnnotationCheckpointLocation: "/checkpoints/" + tc.hash,
kubeAnnotationCheckpointStorageType: "pvc",
}
}
pod := makePod("test-pod", "default", tc.nodeName, tc.phase, tc.ready, labels, annotations)
objs := []runtime.Object{job} objs := []runtime.Object{job}
if tc.lease != nil { if tc.lease != nil {
objs = append(objs, tc.lease) objs = append(objs, tc.lease)
...@@ -289,13 +287,13 @@ func TestReconcileRestorePod(t *testing.T) { ...@@ -289,13 +287,13 @@ func TestReconcileRestorePod(t *testing.T) {
want: false, want: false,
}, },
{ {
name: "already ready", name: "ready placeholder still restores",
nodeName: testNodeName, nodeName: testNodeName,
phase: corev1.PodRunning, phase: corev1.PodRunning,
ready: true, ready: true,
hash: "abc123", hash: "abc123",
createDir: true, createDir: true,
want: false, want: true,
}, },
{ {
name: "missing hash", name: "missing hash",
...@@ -382,29 +380,20 @@ func TestReconcileRestorePod(t *testing.T) { ...@@ -382,29 +380,20 @@ func TestReconcileRestorePod(t *testing.T) {
for _, tc := range tests { for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
labels := map[string]string{ labels := map[string]string{
kubeLabelIsRestoreTarget: "true", snapshotprotocol.RestoreTargetLabel: "true",
} }
if tc.hash != "" { if tc.hash != "" {
labels[kubeLabelCheckpointHash] = tc.hash labels[snapshotprotocol.CheckpointIDLabel] = tc.hash
} }
w := makeTestController(t) w := makeTestController(t)
checkpointDir := t.TempDir()
var annotations map[string]string var annotations map[string]string
if tc.annotationStatus != "" { if tc.annotationStatus != "" {
annotations = map[string]string{ annotations = map[string]string{
kubeAnnotationRestoreStatus: tc.annotationStatus, snapshotprotocol.RestoreStatusAnnotation: tc.annotationStatus,
kubeAnnotationRestoreContainerID: tc.annotationContainerID, snapshotprotocol.RestoreContainerIDAnnotation: tc.annotationContainerID,
} }
} }
if tc.hash != "" {
if annotations == nil {
annotations = make(map[string]string)
}
annotations[kubeAnnotationCheckpointLocation] = filepath.Join(checkpointDir, tc.hash)
annotations[kubeAnnotationCheckpointStorageType] = "pvc"
}
pod := makePod("test-pod", "default", tc.nodeName, tc.phase, tc.ready, labels, annotations) pod := makePod("test-pod", "default", tc.nodeName, tc.phase, tc.ready, labels, annotations)
pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ pod.Status.ContainerStatuses = []corev1.ContainerStatus{{
...@@ -414,7 +403,7 @@ func TestReconcileRestorePod(t *testing.T) { ...@@ -414,7 +403,7 @@ func TestReconcileRestorePod(t *testing.T) {
}} }}
if tc.createDir && tc.hash != "" { if tc.createDir && tc.hash != "" {
dir := filepath.Join(checkpointDir, tc.hash) dir := filepath.Join(w.config.Storage.BasePath, tc.hash, "versions", snapshotprotocol.DefaultCheckpointArtifactVersion)
if err := os.MkdirAll(dir, 0o755); err != nil { if err := os.MkdirAll(dir, 0o755); err != nil {
t.Fatalf("failed to create checkpoint dir: %v", err) t.Fatalf("failed to create checkpoint dir: %v", err)
} }
...@@ -473,6 +462,10 @@ func TestRunCheckpointKeepsLeaseAndInFlightOnTerminalStatusPatchFailure(t *testi ...@@ -473,6 +462,10 @@ func TestRunCheckpointKeepsLeaseAndInFlightOnTerminalStatusPatchFailure(t *testi
w := &NodeController{ w := &NodeController{
config: &types.AgentConfig{ config: &types.AgentConfig{
NodeName: testNodeName, NodeName: testNodeName,
Storage: types.StorageSpec{
Type: snapshotprotocol.StorageTypePVC,
BasePath: t.TempDir(),
},
}, },
clientset: clientset, clientset: clientset,
log: testr.New(t), log: testr.New(t),
...@@ -483,7 +476,7 @@ func TestRunCheckpointKeepsLeaseAndInFlightOnTerminalStatusPatchFailure(t *testi ...@@ -483,7 +476,7 @@ func TestRunCheckpointKeepsLeaseAndInFlightOnTerminalStatusPatchFailure(t *testi
stopCh: make(chan struct{}), stopCh: make(chan struct{}),
} }
err := w.runCheckpoint(context.Background(), pod, job, "abc123", filepath.Join(t.TempDir(), "abc123"), "pvc", "default/test-pod") err := w.runCheckpoint(context.Background(), pod, job, "abc123", filepath.Join(t.TempDir(), "abc123"), "default/test-pod")
if err == nil { if err == nil {
t.Fatal("expected terminal checkpoint status update to fail") t.Fatal("expected terminal checkpoint status update to fail")
} }
......
...@@ -22,6 +22,20 @@ const ( ...@@ -22,6 +22,20 @@ const (
checkpointLeaseRenewInterval = 10 * time.Second checkpointLeaseRenewInterval = 10 * time.Second
) )
func checkpointLeaseExpired(lease *coordinationv1.Lease, now time.Time) bool {
if lease == nil || lease.Spec.LeaseDurationSeconds == nil {
return true
}
last := lease.Spec.RenewTime
if last == nil {
last = lease.Spec.AcquireTime
}
if last == nil {
return true
}
return now.After(last.Time.Add(time.Duration(*lease.Spec.LeaseDurationSeconds) * time.Second))
}
func podFromInformerObj(obj interface{}) (*corev1.Pod, bool) { func podFromInformerObj(obj interface{}) (*corev1.Pod, bool) {
if pod, ok := obj.(*corev1.Pod); ok { if pod, ok := obj.(*corev1.Pod); ok {
return pod, true return pod, true
...@@ -94,20 +108,6 @@ func getCheckpointJob(ctx context.Context, clientset kubernetes.Interface, pod * ...@@ -94,20 +108,6 @@ func getCheckpointJob(ctx context.Context, clientset kubernetes.Interface, pod *
return job, nil return job, nil
} }
func isLeaseExpired(lease *coordinationv1.Lease, now time.Time) bool {
if lease == nil || lease.Spec.LeaseDurationSeconds == nil {
return true
}
last := lease.Spec.RenewTime
if last == nil {
last = lease.Spec.AcquireTime
}
if last == nil {
return true
}
return now.After(last.Time.Add(time.Duration(*lease.Spec.LeaseDurationSeconds) * time.Second))
}
func acquireCheckpointLease(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, job *batchv1.Job, holderIdentity string) (bool, error) { func acquireCheckpointLease(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, job *batchv1.Job, holderIdentity string) (bool, error) {
leaseName := job.Name leaseName := job.Name
now := metav1.NewMicroTime(time.Now()) now := metav1.NewMicroTime(time.Now())
...@@ -142,7 +142,7 @@ func acquireCheckpointLease(ctx context.Context, clientset kubernetes.Interface, ...@@ -142,7 +142,7 @@ func acquireCheckpointLease(ctx context.Context, clientset kubernetes.Interface,
return true, nil return true, nil
} }
if !isLeaseExpired(existingLease, now.Time) && if !checkpointLeaseExpired(existingLease, now.Time) &&
existingLease.Spec.HolderIdentity != nil && existingLease.Spec.HolderIdentity != nil &&
*existingLease.Spec.HolderIdentity != holderIdentity { *existingLease.Spec.HolderIdentity != holderIdentity {
return false, nil return false, nil
...@@ -150,7 +150,7 @@ func acquireCheckpointLease(ctx context.Context, clientset kubernetes.Interface, ...@@ -150,7 +150,7 @@ func acquireCheckpointLease(ctx context.Context, clientset kubernetes.Interface,
existingLease.Spec.HolderIdentity = &holderIdentity existingLease.Spec.HolderIdentity = &holderIdentity
existingLease.Spec.LeaseDurationSeconds = &leaseDurationSeconds existingLease.Spec.LeaseDurationSeconds = &leaseDurationSeconds
if existingLease.Spec.AcquireTime == nil || isLeaseExpired(existingLease, now.Time) { if existingLease.Spec.AcquireTime == nil || checkpointLeaseExpired(existingLease, now.Time) {
existingLease.Spec.AcquireTime = &now existingLease.Spec.AcquireTime = &now
} }
existingLease.Spec.RenewTime = &now existingLease.Spec.RenewTime = &now
......
...@@ -12,8 +12,8 @@ import ( ...@@ -12,8 +12,8 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"google.golang.org/protobuf/proto" "google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
const ( const (
...@@ -35,7 +35,7 @@ func BuildDumpOptions( ...@@ -35,7 +35,7 @@ func BuildDumpOptions(
maskedPaths = state.OCISpec.Linux.MaskedPaths maskedPaths = state.OCISpec.Linux.MaskedPaths
} }
externalized, skipped := common.BuildMountPolicy(state.Mounts, state.RootFS, maskedPaths) externalized, skipped := snapshotruntime.BuildMountPolicy(state.Mounts, state.RootFS, maskedPaths)
log.V(1).Info("Resolved mount policy for CRIU dump", log.V(1).Info("Resolved mount policy for CRIU dump",
"externalized_count", len(externalized), "externalized_count", len(externalized),
"skipped_count", len(skipped), "skipped_count", len(skipped),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment