Unverified Commit 38bb9d37 authored by Schwinn Saereesitthipitak's avatar Schwinn Saereesitthipitak Committed by GitHub
Browse files

refactor: clean up checkpoint orchestration (#7309)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 9ea3acad
......@@ -272,6 +272,11 @@ func (in *DynamoCheckpointIdentity) DeepCopy() *DynamoCheckpointIdentity {
func (in *DynamoCheckpointJobConfig) DeepCopyInto(out *DynamoCheckpointJobConfig) {
*out = *in
in.PodTemplateSpec.DeepCopyInto(&out.PodTemplateSpec)
if in.SharedMemory != nil {
in, out := &in.SharedMemory, &out.SharedMemory
*out = new(SharedMemorySpec)
(*in).DeepCopyInto(*out)
}
if in.ActiveDeadlineSeconds != nil {
in, out := &in.ActiveDeadlineSeconds, &out.ActiveDeadlineSeconds
*out = new(int64)
......
......@@ -124,11 +124,12 @@ spec:
default: 3600
description: ActiveDeadlineSeconds specifies the maximum time the Job can run
format: int64
minimum: 1
type: integer
backoffLimit:
default: 3
description: BackoffLimit specifies the number of retries before marking the Job failed
description: 'Deprecated: BackoffLimit is ignored. Checkpoint Jobs never retry.'
format: int32
minimum: 0
type: integer
podTemplateSpec:
description: |-
......@@ -8154,10 +8155,28 @@ spec:
- containers
type: object
type: object
sharedMemory:
description: |-
SharedMemory controls the tmpfs mounted at /dev/shm for the checkpoint Job pod.
When omitted, checkpoint Jobs use the same default 8Gi tmpfs as Dynamo components.
properties:
disabled:
type: boolean
size:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
x-kubernetes-validations:
- message: sharedMemory.size must not be set when sharedMemory.disabled is true
rule: '!(self.disabled && has(self.size))'
ttlSecondsAfterFinished:
default: 300
description: TTLSecondsAfterFinished specifies how long to keep the Job after completion
format: int32
minimum: 0
type: integer
required:
- podTemplateSpec
......@@ -8170,7 +8189,7 @@ spec:
description: DynamoCheckpointStatus defines the observed state of DynamoCheckpoint
properties:
conditions:
description: Conditions represent the latest available observations of the checkpoint's state
description: 'DEPRECATED: Conditions are deprecated. Use status.phase instead.'
items:
description: Condition contains details for one aspect of the current state of this API Resource.
properties:
......
......@@ -678,8 +678,8 @@ spec:
properties:
checkpointRef:
description: |-
CheckpointRef references an existing Checkpoint CR to use
If specified, Identity is ignored and this checkpoint is used directly
CheckpointRef references an existing DynamoCheckpoint CR by metadata.name.
If specified, this service's Identity is ignored and the referenced checkpoint is used directly.
type: string
enabled:
default: false
......@@ -11211,6 +11211,9 @@ spec:
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
x-kubernetes-validations:
- message: sharedMemory.size must not be set when sharedMemory.disabled is true
rule: '!(self.disabled && has(self.size))'
subComponentType:
description: SubComponentType indicates the sub-role of this component (for example, "prefill").
type: string
......
......@@ -887,8 +887,8 @@ spec:
properties:
checkpointRef:
description: |-
CheckpointRef references an existing Checkpoint CR to use
If specified, Identity is ignored and this checkpoint is used directly
CheckpointRef references an existing DynamoCheckpoint CR by metadata.name.
If specified, this service's Identity is ignored and the referenced checkpoint is used directly.
type: string
enabled:
default: false
......@@ -11420,6 +11420,9 @@ spec:
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
x-kubernetes-validations:
- message: sharedMemory.size must not be set when sharedMemory.disabled is true
rule: '!(self.disabled && has(self.size))'
subComponentType:
description: SubComponentType indicates the sub-role of this component (for example, "prefill").
type: string
......@@ -11466,7 +11469,7 @@ spec:
description: IdentityHash is the computed hash of the checkpoint identity
type: string
ready:
description: Ready indicates if the checkpoint is ready for use
description: Ready indicates if the checkpoint was visible to the worker at startup
type: boolean
type: object
description: |-
......
......@@ -16,7 +16,7 @@
apiVersion: nvidia.com/v1alpha1
kind: DynamoCheckpoint
metadata:
name: vllm-llama3-8b-tp1
name: llama3-8b-bf16
spec:
# Identity - determines the checkpoint hash
identity:
......@@ -33,7 +33,6 @@ spec:
# Job configuration for checkpoint creation
job:
activeDeadlineSeconds: 3600
backoffLimit: 3
ttlSecondsAfterFinished: 300
podTemplateSpec:
spec:
......@@ -60,4 +59,3 @@ spec:
limits:
nvidia.com/gpu: 1
restartPolicy: Never
......@@ -242,17 +242,6 @@ These are injected into all components when the corresponding infrastructure ser
| --- | --- | --- | --- | --- |
| `OMPI_MCA_orte_keep_fqdn_hostnames` | Instructs OpenMPI to preserve FQDN hostnames for inter-node communication | `1` | `string` | Multinode deployments only |
### Checkpoint / Restore
These environment variables are injected when checkpoint/restore is enabled for a component.
| Variable | Purpose | Default | Type | Condition |
| --- | --- | --- | --- | --- |
| `DYN_CHECKPOINT_PATH` | Base directory where checkpoint data is stored | From operator checkpoint config `storage.pvc.basePath` | `string` | PVC storage type |
| `DYN_CHECKPOINT_LOCATION` | Full checkpoint URI (for non-PVC backends) | — | `string` | S3 or OCI storage type |
| `DYN_CHECKPOINT_HASH` | Identity hash that uniquely identifies the checkpoint | — | `string` | Always set when checkpoint is enabled |
| `SKIP_WAIT_FOR_CHECKPOINT` | Skips the checkpoint readiness polling loop; checks once and proceeds | — | `string` | Set on restored and DGD pods |
## Service Accounts
The following component types automatically receive dedicated service accounts:
......
......@@ -30,6 +30,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)
......@@ -80,34 +81,103 @@ func testInfo() *CheckpointInfo {
return &CheckpointInfo{Enabled: true, Hash: testHash}
}
// --- Helper function tests ---
// createHookClient is a test double that wraps a client.Client and lets a test
// run custom logic immediately before a Create call is forwarded to the
// wrapped client. All other client operations go straight to the embedded
// client.
type createHookClient struct {
// Client is the embedded client that receives every call, including the
// forwarded Create.
client.Client
// onCreate, when non-nil, is invoked by Create with the object being created
// before the call is delegated; a non-nil error from it is returned to the
// caller of Create instead of delegating.
onCreate func(ctx context.Context, obj client.Object) error
}
func TestHelpers(t *testing.T) {
// GetPVCBasePath
assert.Equal(t, "", GetPVCBasePath(nil))
assert.Equal(t, "/checkpoints", GetPVCBasePath(testPVCConfig()))
// Create runs the onCreate hook, when one is installed, and then forwards the
// call to the embedded client.
//
// The hook is one-shot: once it has returned without error it is cleared, so
// later Create calls go straight to the embedded client. If the hook returns
// an error, that error is returned as-is, the hook stays installed, and the
// embedded client is never called.
func (c *createHookClient) Create(ctx context.Context, obj client.Object, opts ...client.CreateOption) error {
	if c.onCreate == nil {
		return c.Client.Create(ctx, obj, opts...)
	}

	hookErr := c.onCreate(ctx, obj)
	if hookErr != nil {
		return hookErr
	}

	// Disarm the hook only after it has succeeded.
	c.onCreate = nil
	return c.Client.Create(ctx, obj, opts...)
}
// getCheckpointInfoFromCheckpoint — ready
// --- Resource helper tests ---
func TestHelpers(t *testing.T) {
// checkpointInfoFromObject — ready
hash, err := ComputeIdentityHash(testIdentity())
require.NoError(t, err)
ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{Name: "ckpt-abc"},
ObjectMeta: metav1.ObjectMeta{Name: hash},
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{Identity: testIdentity()},
Status: nvidiacomv1alpha1.DynamoCheckpointStatus{
Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady, IdentityHash: testHash,
Location: "/checkpoints/" + testHash, StorageType: "pvc",
Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
IdentityHash: hash,
Location: "/checkpoints/" + hash,
StorageType: "pvc",
},
}
info := getCheckpointInfoFromCheckpoint(ckpt)
info, err := checkpointInfoFromObject(ckpt)
require.NoError(t, err)
assert.True(t, info.Enabled)
assert.True(t, info.Ready)
assert.Equal(t, testHash, info.Hash)
assert.Equal(t, "/checkpoints/"+testHash, info.Location)
assert.Equal(t, hash, info.Hash)
assert.Equal(t, "/checkpoints/"+hash, info.Location)
assert.Equal(t, ckpt.Name, info.CheckpointName)
// getCheckpointInfoFromCheckpoint — not ready
// checkpointInfoFromObject — not ready
ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseCreating
info = getCheckpointInfoFromCheckpoint(ckpt)
info, err = checkpointInfoFromObject(ckpt)
require.NoError(t, err)
assert.False(t, info.Ready)
}
// TestCreateOrGetAutoCheckpointDeduplicatesConcurrentSameHashCheckpoint
// simulates a race: just before the first DynamoCheckpoint Create issued
// through the hooked client reaches the store, a differently named checkpoint
// with the same identity hash is inserted directly into the underlying fake
// client.
//
// The test then expects CreateOrGetAutoCheckpoint to return that pre-existing
// checkpoint and expects the store to contain exactly one DynamoCheckpoint.
func TestCreateOrGetAutoCheckpointDeduplicatesConcurrentSameHashCheckpoint(t *testing.T) {
	ctx := context.Background()
	scheme := testScheme()
	identity := testIdentity()

	hash, err := ComputeIdentityHash(identity)
	require.NoError(t, err)

	// The competing checkpoint: human-readable name, same identity hash.
	racingMeta := metav1.ObjectMeta{
		Name:      "friendly-checkpoint",
		Namespace: testNamespace,
		Labels: map[string]string{
			consts.KubeLabelCheckpointHash: hash,
		},
	}
	racingSpec := nvidiacomv1alpha1.DynamoCheckpointSpec{
		Identity: identity,
		Job: nvidiacomv1alpha1.DynamoCheckpointJobConfig{
			PodTemplateSpec: corev1.PodTemplateSpec{},
		},
	}
	racingStatus := nvidiacomv1alpha1.DynamoCheckpointStatus{
		IdentityHash: hash,
		Phase:        nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
	}
	racing := &nvidiacomv1alpha1.DynamoCheckpoint{
		ObjectMeta: racingMeta,
		Spec:       racingSpec,
		Status:     racingStatus,
	}

	store := fake.NewClientBuilder().WithScheme(scheme).Build()

	// On the first Create seen by the hooked client, sneak the competing
	// checkpoint into the store. Objects of any other type are let through
	// without inserting anything.
	injectRacing := func(hookCtx context.Context, obj client.Object) error {
		if _, isCheckpoint := obj.(*nvidiacomv1alpha1.DynamoCheckpoint); !isCheckpoint {
			return nil
		}
		return store.Create(hookCtx, racing.DeepCopy())
	}
	hooked := &createHookClient{Client: store, onCreate: injectRacing}

	got, err := CreateOrGetAutoCheckpoint(ctx, hooked, testNamespace, identity, corev1.PodTemplateSpec{})
	require.NoError(t, err)
	assert.Equal(t, racing.Name, got.Name)

	// Only the competing checkpoint may exist afterwards.
	stored := &nvidiacomv1alpha1.DynamoCheckpointList{}
	require.NoError(t, store.List(ctx, stored))
	require.Len(t, stored.Items, 1)
	assert.Equal(t, racing.Name, stored.Items[0].Name)
}
// --- Injection idempotency tests ---
func TestInjectionIdempotency(t *testing.T) {
......@@ -126,87 +196,40 @@ func TestInjectionIdempotency(t *testing.T) {
assert.Len(t, container.VolumeMounts, 2)
}
// --- InjectCheckpointEnvVars tests ---
func TestApplyCheckpointPodMetadata(t *testing.T) {
t.Run("checkpoint source metadata uses annotations for location and storage", func(t *testing.T) {
labels := map[string]string{}
annotations := map[string]string{}
func TestInjectCheckpointEnvVars(t *testing.T) {
t.Run("PVC storage injects PATH and HASH", func(t *testing.T) {
container := &corev1.Container{}
InjectCheckpointEnvVars(container, testInfo(), testPVCConfig())
ApplyCheckpointSourcePodMetadata(labels, annotations, testHash, "/checkpoints/"+testHash, "pvc")
envMap := make(map[string]string, len(container.Env))
for _, e := range container.Env {
envMap[e.Name] = e.Value
}
assert.Equal(t, "/checkpoints", envMap[consts.EnvCheckpointPath])
assert.Equal(t, testHash, envMap[consts.EnvCheckpointHash])
_, hasLocation := envMap[consts.EnvCheckpointLocation]
assert.False(t, hasLocation)
assert.Equal(t, consts.KubeLabelValueTrue, labels[consts.KubeLabelIsCheckpointSource])
assert.Equal(t, testHash, labels[consts.KubeLabelCheckpointHash])
assert.Equal(t, "/checkpoints/"+testHash, annotations[consts.KubeAnnotationCheckpointLocation])
assert.Equal(t, "pvc", annotations[consts.KubeAnnotationCheckpointStorageType])
})
t.Run("S3 storage injects LOCATION and HASH", func(t *testing.T) {
container := &corev1.Container{}
info := &CheckpointInfo{Enabled: true, Hash: testHash, Location: "s3://bucket/" + testHash + ".tar"}
config := &configv1alpha1.CheckpointConfiguration{
Storage: configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypeS3,
S3: configv1alpha1.CheckpointS3Config{URI: "s3://bucket"},
},
t.Run("restore metadata clears stale values when checkpoint is not ready", func(t *testing.T) {
labels := map[string]string{
consts.KubeLabelIsRestoreTarget: consts.KubeLabelValueTrue,
consts.KubeLabelCheckpointHash: "stale-hash",
}
InjectCheckpointEnvVars(container, info, config)
envMap := make(map[string]string, len(container.Env))
for _, e := range container.Env {
envMap[e.Name] = e.Value
annotations := map[string]string{
consts.KubeAnnotationCheckpointLocation: "/checkpoints/stale-hash",
consts.KubeAnnotationCheckpointStorageType: "pvc",
}
assert.Equal(t, "s3://bucket/"+testHash+".tar", envMap[consts.EnvCheckpointLocation])
assert.Equal(t, testHash, envMap[consts.EnvCheckpointHash])
})
t.Run("disabled is a no-op", func(t *testing.T) {
container := &corev1.Container{}
InjectCheckpointEnvVars(container, &CheckpointInfo{Enabled: false}, testPVCConfig())
assert.Empty(t, container.Env)
})
t.Run("preserves existing env vars", func(t *testing.T) {
container := &corev1.Container{Env: []corev1.EnvVar{{Name: "EXISTING", Value: "keep"}}}
InjectCheckpointEnvVars(container, testInfo(), testPVCConfig())
envMap := make(map[string]string, len(container.Env))
for _, e := range container.Env {
envMap[e.Name] = e.Value
}
assert.Equal(t, "keep", envMap["EXISTING"])
assert.Equal(t, testHash, envMap[consts.EnvCheckpointHash])
})
}
// --- InjectCheckpointLabelsFromConfig tests ---
func TestInjectCheckpointLabelsFromConfig(t *testing.T) {
// Disabled/nil configs are no-ops
for _, cfg := range []*nvidiacomv1alpha1.ServiceCheckpointConfig{nil, {Enabled: false}} {
labels := map[string]string{"existing": "value"}
result, err := InjectCheckpointLabelsFromConfig(labels, cfg)
require.NoError(t, err)
assert.Equal(t, map[string]string{"existing": "value"}, result)
}
ApplyRestorePodMetadata(labels, annotations, &CheckpointInfo{Enabled: true, Ready: false})
// Enabled with identity adds hash label
identity := testIdentity()
result, err := InjectCheckpointLabelsFromConfig(nil, &nvidiacomv1alpha1.ServiceCheckpointConfig{
Enabled: true, Identity: &identity,
_, hasRestoreTarget := labels[consts.KubeLabelIsRestoreTarget]
_, hasCheckpointHash := labels[consts.KubeLabelCheckpointHash]
_, hasLocation := annotations[consts.KubeAnnotationCheckpointLocation]
_, hasStorageType := annotations[consts.KubeAnnotationCheckpointStorageType]
assert.False(t, hasRestoreTarget)
assert.False(t, hasCheckpointHash)
assert.False(t, hasLocation)
assert.False(t, hasStorageType)
})
require.NoError(t, err)
hash, ok := result[consts.KubeLabelCheckpointHash]
assert.True(t, ok)
assert.Len(t, hash, 16)
// Enabled without identity does not add hash
result, err = InjectCheckpointLabelsFromConfig(map[string]string{}, &nvidiacomv1alpha1.ServiceCheckpointConfig{Enabled: true})
require.NoError(t, err)
_, ok = result[consts.KubeLabelCheckpointHash]
assert.False(t, ok)
}
// --- InjectCheckpointIntoPodSpec tests ---
......@@ -251,7 +274,7 @@ func TestInjectCheckpointIntoPodSpec(t *testing.T) {
require.NotNil(t, podSpec.SecurityContext.SeccompProfile)
})
t.Run("PVC storage injects volumes, mounts, and env vars", func(t *testing.T) {
t.Run("PVC storage injects volumes and mounts", func(t *testing.T) {
podSpec := testPodSpec()
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, testInfo(), testPVCConfig()))
......@@ -262,6 +285,20 @@ func TestInjectCheckpointIntoPodSpec(t *testing.T) {
if v.Name == consts.CheckpointVolumeName {
assert.Equal(t, "snapshot-pvc", v.PersistentVolumeClaim.ClaimName)
}
if v.Name == consts.PodInfoVolumeName {
require.NotNil(t, v.DownwardAPI)
fieldPaths := map[string]string{}
for _, item := range v.DownwardAPI.Items {
if item.FieldRef != nil {
fieldPaths[item.Path] = item.FieldRef.FieldPath
}
}
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoNamespace+"']", fieldPaths[consts.PodInfoFileDynNamespace])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoWorkerHash+"']", fieldPaths[consts.PodInfoFileDynNamespaceWorkerSuffix])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoComponentType+"']", fieldPaths[consts.PodInfoFileDynComponent])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoGraphDeploymentName+"']", fieldPaths[consts.PodInfoFileDynParentDGDName])
assert.Equal(t, consts.PodInfoFieldPodNamespace, fieldPaths[consts.PodInfoFileDynParentDGDNamespace])
}
}
assert.True(t, volNames[consts.CheckpointVolumeName])
assert.True(t, volNames[consts.PodInfoVolumeName])
......@@ -273,14 +310,6 @@ func TestInjectCheckpointIntoPodSpec(t *testing.T) {
}
assert.Equal(t, "/checkpoints", mountPaths[consts.CheckpointVolumeName])
assert.Equal(t, consts.PodInfoMountPath, mountPaths[consts.PodInfoVolumeName])
// Env
envMap := make(map[string]string, len(podSpec.Containers[0].Env))
for _, e := range podSpec.Containers[0].Env {
envMap[e.Name] = e.Value
}
assert.Equal(t, "/checkpoints", envMap[consts.EnvCheckpointPath])
assert.Equal(t, testHash, envMap[consts.EnvCheckpointHash])
})
t.Run("computes hash from identity when hash is empty", func(t *testing.T) {
......@@ -328,9 +357,6 @@ func TestInjectCheckpointIntoPodSpec(t *testing.T) {
{"PVC name missing", testPodSpec(), testInfo(), &configv1alpha1.CheckpointConfiguration{
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "pvc", PVC: configv1alpha1.CheckpointPVCConfig{BasePath: "/checkpoints"}},
}, "no PVC name"},
{"PVC base path missing", testPodSpec(), testInfo(), &configv1alpha1.CheckpointConfiguration{
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "pvc", PVC: configv1alpha1.CheckpointPVCConfig{PVCName: "snapshot-pvc"}},
}, "no PVC base path"},
{"S3 URI missing", testPodSpec(), testInfo(), &configv1alpha1.CheckpointConfiguration{
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "s3"},
}, "S3"},
......@@ -370,39 +396,48 @@ func TestResolveCheckpointForService(t *testing.T) {
})
t.Run("checkpointRef resolves ready CR", func(t *testing.T) {
hash, err := ComputeIdentityHash(testIdentity())
require.NoError(t, err)
ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{Name: "my-ckpt", Namespace: testNamespace},
ObjectMeta: metav1.ObjectMeta{Name: hash, Namespace: testNamespace},
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{Identity: testIdentity()},
Status: nvidiacomv1alpha1.DynamoCheckpointStatus{
Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady, IdentityHash: testHash,
Location: "/checkpoints/" + testHash, StorageType: "pvc",
Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
IdentityHash: hash,
Location: "/checkpoints/" + hash,
StorageType: "pvc",
},
}
c := fake.NewClientBuilder().WithScheme(s).WithObjects(ckpt).WithStatusSubresource(ckpt).Build()
ref := "my-ckpt"
ref := hash
info, err := ResolveCheckpointForService(ctx, c, testNamespace, &nvidiacomv1alpha1.ServiceCheckpointConfig{
Enabled: true, CheckpointRef: &ref,
})
require.NoError(t, err)
assert.True(t, info.Exists)
assert.True(t, info.Ready)
assert.Equal(t, testHash, info.Hash)
assert.Equal(t, "/checkpoints/"+testHash, info.Location)
assert.Equal(t, hash, info.Hash)
assert.Equal(t, "/checkpoints/"+hash, info.Location)
assert.Equal(t, hash, info.CheckpointName)
})
t.Run("checkpointRef resolves not-ready CR", func(t *testing.T) {
hash, err := ComputeIdentityHash(testIdentity())
require.NoError(t, err)
ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{Name: "pending-ckpt", Namespace: testNamespace},
ObjectMeta: metav1.ObjectMeta{Name: hash, Namespace: testNamespace},
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{Identity: testIdentity()},
Status: nvidiacomv1alpha1.DynamoCheckpointStatus{Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseCreating},
}
c := fake.NewClientBuilder().WithScheme(s).WithObjects(ckpt).WithStatusSubresource(ckpt).Build()
ref := "pending-ckpt"
ref := hash
info, err := ResolveCheckpointForService(ctx, c, testNamespace, &nvidiacomv1alpha1.ServiceCheckpointConfig{
Enabled: true, CheckpointRef: &ref,
})
require.NoError(t, err)
assert.True(t, info.Exists)
assert.False(t, info.Ready)
})
......@@ -415,20 +450,40 @@ func TestResolveCheckpointForService(t *testing.T) {
assert.ErrorContains(t, err, "nonexistent")
})
t.Run("identity lookup finds existing checkpoint by label", func(t *testing.T) {
t.Run("checkpointRef resolves human-readable checkpoint names", func(t *testing.T) {
hash, err := ComputeIdentityHash(testIdentity())
require.NoError(t, err)
ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{Name: "not-the-hash", Namespace: testNamespace},
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{Identity: testIdentity()},
Status: nvidiacomv1alpha1.DynamoCheckpointStatus{
IdentityHash: hash,
},
}
c := fake.NewClientBuilder().WithScheme(s).WithObjects(ckpt).WithStatusSubresource(ckpt).Build()
ref := "not-the-hash"
info, err := ResolveCheckpointForService(ctx, c, testNamespace, &nvidiacomv1alpha1.ServiceCheckpointConfig{
Enabled: true, CheckpointRef: &ref,
})
require.NoError(t, err)
assert.Equal(t, "not-the-hash", info.CheckpointName)
assert.Equal(t, hash, info.Hash)
})
t.Run("identity lookup finds existing checkpoint by identity hash", func(t *testing.T) {
identity := testIdentity()
hash, err := ComputeIdentityHash(identity)
require.NoError(t, err)
ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{
Name: hash, Namespace: testNamespace,
Labels: map[string]string{consts.KubeLabelCheckpointHash: hash},
},
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{Identity: identity},
ObjectMeta: metav1.ObjectMeta{Name: "friendly-name", Namespace: testNamespace},
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{Identity: identity},
Status: nvidiacomv1alpha1.DynamoCheckpointStatus{
Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady, IdentityHash: hash,
Location: "/checkpoints/" + hash, StorageType: "pvc",
Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
IdentityHash: hash,
Location: "/checkpoints/" + hash,
StorageType: "pvc",
},
}
c := fake.NewClientBuilder().WithScheme(s).WithObjects(ckpt).WithStatusSubresource(ckpt).Build()
......@@ -437,8 +492,34 @@ func TestResolveCheckpointForService(t *testing.T) {
Enabled: true, Identity: &identity,
})
require.NoError(t, err)
assert.True(t, info.Exists)
assert.True(t, info.Ready)
assert.Equal(t, hash, info.Hash)
assert.Equal(t, "friendly-name", info.CheckpointName)
})
t.Run("identity lookup returns existing not-ready checkpoint", func(t *testing.T) {
identity := testIdentity()
hash, err := ComputeIdentityHash(identity)
require.NoError(t, err)
ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{Name: "friendly-name", Namespace: testNamespace},
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{Identity: identity},
Status: nvidiacomv1alpha1.DynamoCheckpointStatus{
Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseCreating,
IdentityHash: hash,
},
}
c := fake.NewClientBuilder().WithScheme(s).WithObjects(ckpt).WithStatusSubresource(ckpt).Build()
info, err := ResolveCheckpointForService(ctx, c, testNamespace, &nvidiacomv1alpha1.ServiceCheckpointConfig{
Enabled: true, Identity: &identity,
})
require.NoError(t, err)
assert.True(t, info.Exists)
assert.False(t, info.Ready)
assert.Equal(t, hash, info.Hash)
})
t.Run("identity lookup returns not-ready when no CR found", func(t *testing.T) {
......@@ -448,6 +529,7 @@ func TestResolveCheckpointForService(t *testing.T) {
Enabled: true, Identity: &identity,
})
require.NoError(t, err)
assert.False(t, info.Exists)
assert.False(t, info.Ready)
assert.Len(t, info.Hash, 16)
})
......
......@@ -18,288 +18,151 @@
package checkpoint
import (
"context"
"fmt"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
)
// getCheckpointInfoFromCheckpoint extracts CheckpointInfo from a DynamoCheckpoint CR
func getCheckpointInfoFromCheckpoint(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) *CheckpointInfo {
info := &CheckpointInfo{
Enabled: true,
CheckpointName: ckpt.Name,
Hash: ckpt.Status.IdentityHash,
Location: ckpt.Status.Location,
StorageType: ckpt.Status.StorageType,
Ready: ckpt.Status.Phase == nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
Identity: &ckpt.Spec.Identity,
func ApplyCheckpointSourcePodMetadata(
labels map[string]string,
annotations map[string]string,
hash string,
location string,
storageType nvidiacomv1alpha1.DynamoCheckpointStorageType,
) {
delete(labels, commonconsts.KubeLabelIsRestoreTarget)
delete(labels, commonconsts.KubeLabelCheckpointHash)
delete(annotations, commonconsts.KubeAnnotationCheckpointLocation)
delete(annotations, commonconsts.KubeAnnotationCheckpointStorageType)
labels[commonconsts.KubeLabelIsCheckpointSource] = commonconsts.KubeLabelValueTrue
if hash != "" {
labels[commonconsts.KubeLabelCheckpointHash] = hash
}
return info
}
// getPVCBasePath returns the PVC base path from storage config.
// Only applicable for PVC storage type
func getPVCBasePath(storageConfig *configv1alpha1.CheckpointStorageConfiguration) string {
if storageConfig != nil && storageConfig.PVC.BasePath != "" {
return storageConfig.PVC.BasePath
if location != "" {
annotations[commonconsts.KubeAnnotationCheckpointLocation] = location
}
return ""
}
// GetPVCBasePath returns the configured PVC base path from controller config.
// This is used by both CheckpointReconciler and DynamoGraphDeploymentReconciler.
// Only applicable for PVC storage type.
func GetPVCBasePath(config *configv1alpha1.CheckpointConfiguration) string {
if config != nil {
return getPVCBasePath(&config.Storage)
if storageType != "" {
annotations[commonconsts.KubeAnnotationCheckpointStorageType] = string(storageType)
}
return ""
}
// CheckpointInfo contains resolved checkpoint information for a DGD service
type CheckpointInfo struct {
// Enabled indicates if checkpointing is enabled
Enabled bool
// Identity is the resolved checkpoint identity (model, framework, etc.)
Identity *nvidiacomv1alpha1.DynamoCheckpointIdentity
// Hash is the computed identity hash
Hash string
// Location is the full URI/path in the storage backend
Location string
// StorageType is the storage backend type (pvc, s3, oci)
StorageType nvidiacomv1alpha1.DynamoCheckpointStorageType
// CheckpointName is the name of the Checkpoint CR
CheckpointName string
// Ready indicates if the checkpoint is ready for use
Ready bool
}
// ResolveCheckpointForService resolves checkpoint information for a DGD service.
// It handles both checkpointRef (direct reference) and identity-based lookup.
// Returns CheckpointInfo with the resolved identity populated.
func ResolveCheckpointForService(
ctx context.Context,
c client.Client,
namespace string,
config *nvidiacomv1alpha1.ServiceCheckpointConfig,
) (*CheckpointInfo, error) {
if config == nil || !config.Enabled {
return &CheckpointInfo{Enabled: false}, nil
}
// If a direct checkpoint reference is provided, use it
if config.CheckpointRef != nil && *config.CheckpointRef != "" {
ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{}
err := c.Get(ctx, types.NamespacedName{
Namespace: namespace,
Name: *config.CheckpointRef,
}, ckpt)
if err != nil {
return nil, fmt.Errorf("failed to get referenced checkpoint %s: %w", *config.CheckpointRef, err)
}
// Extract all checkpoint info including identity from the CR
return getCheckpointInfoFromCheckpoint(ckpt), nil
}
// Otherwise, compute hash from identity and look up checkpoint
if config.Identity == nil {
return nil, fmt.Errorf("checkpoint enabled but no checkpointRef or identity provided")
}
hash, err := ComputeIdentityHash(*config.Identity)
if err != nil {
return nil, fmt.Errorf("failed to compute identity hash: %w", err)
}
info := &CheckpointInfo{
Enabled: true,
Identity: config.Identity,
Hash: hash,
}
// Look for existing checkpoint with matching hash using label selector
checkpointList := &nvidiacomv1alpha1.DynamoCheckpointList{}
if err = c.List(ctx, checkpointList,
client.InNamespace(namespace),
client.MatchingLabels{consts.KubeLabelCheckpointHash: info.Hash},
); err != nil {
return nil, fmt.Errorf("failed to list checkpoints: %w", err)
}
func ApplyRestorePodMetadata(labels map[string]string, annotations map[string]string, checkpointInfo *CheckpointInfo) {
delete(labels, commonconsts.KubeLabelIsRestoreTarget)
delete(labels, commonconsts.KubeLabelCheckpointHash)
delete(annotations, commonconsts.KubeAnnotationCheckpointLocation)
delete(annotations, commonconsts.KubeAnnotationCheckpointStorageType)
// Return the first matching checkpoint (there should be at most one per hash)
if len(checkpointList.Items) > 0 {
ckpt := &checkpointList.Items[0]
// Merge checkpoint info from the CR (overrides the computed values)
foundInfo := getCheckpointInfoFromCheckpoint(ckpt)
// Keep the hash and identity we computed from the config
foundInfo.Hash = info.Hash
foundInfo.Identity = info.Identity
return foundInfo, nil
}
// No existing checkpoint found
// In Auto mode, the controller should create one
return info, nil
}
// InjectCheckpointEnvVars adds checkpoint-related environment variables to a restored/DGD container.
// Sets PATH and HASH so the restored process knows its checkpoint identity.
// DYN_CHECKPOINT_LOCATION is reserved for future S3/OCI support.
func InjectCheckpointEnvVars(container *corev1.Container, info *CheckpointInfo, checkpointConfig *configv1alpha1.CheckpointConfiguration) {
if !info.Enabled {
if checkpointInfo == nil || !checkpointInfo.Enabled || !checkpointInfo.Ready {
return
}
var envVars []corev1.EnvVar
// For PVC storage: inject base path so the restored process knows its checkpoint location.
// For S3/OCI (future): inject DYN_CHECKPOINT_LOCATION directly.
storageType := configv1alpha1.CheckpointStorageTypePVC
if checkpointConfig != nil && checkpointConfig.Storage.Type != "" {
storageType = checkpointConfig.Storage.Type
labels[commonconsts.KubeLabelIsRestoreTarget] = commonconsts.KubeLabelValueTrue
if checkpointInfo.Hash != "" {
labels[commonconsts.KubeLabelCheckpointHash] = checkpointInfo.Hash
}
switch storageType {
case configv1alpha1.CheckpointStorageTypePVC:
basePath := ""
if checkpointConfig != nil {
basePath = getPVCBasePath(&checkpointConfig.Storage)
}
envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvCheckpointPath,
Value: basePath,
})
default:
// S3/OCI: inject full location URI directly
if info.Location != "" {
envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvCheckpointLocation,
Value: info.Location,
})
}
if checkpointInfo.Location != "" {
annotations[commonconsts.KubeAnnotationCheckpointLocation] = checkpointInfo.Location
}
if info.Hash != "" {
envVars = append(envVars, corev1.EnvVar{
Name: consts.EnvCheckpointHash,
Value: info.Hash,
})
if checkpointInfo.StorageType != "" {
annotations[commonconsts.KubeAnnotationCheckpointStorageType] = string(checkpointInfo.StorageType)
}
// Prepend checkpoint env vars to ensure they're available
container.Env = append(envVars, container.Env...)
}
// InjectCheckpointVolume adds the checkpoint PVC volume to a pod spec
func InjectCheckpointVolume(podSpec *corev1.PodSpec, pvcName string) {
// Check if volume already exists
for _, v := range podSpec.Volumes {
if v.Name == consts.CheckpointVolumeName {
for _, volume := range podSpec.Volumes {
if volume.Name == commonconsts.CheckpointVolumeName {
return
}
}
podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{
Name: consts.CheckpointVolumeName,
Name: commonconsts.CheckpointVolumeName,
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: pvcName,
ReadOnly: false, // CRIU needs write access during restore
ReadOnly: false,
},
},
})
}
// InjectCheckpointVolumeMount adds the checkpoint volume mount to a container
func InjectCheckpointVolumeMount(container *corev1.Container, basePath string) {
// Check if mount already exists
for _, m := range container.VolumeMounts {
if m.Name == consts.CheckpointVolumeName {
for _, mount := range container.VolumeMounts {
if mount.Name == commonconsts.CheckpointVolumeName {
return
}
}
container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{
Name: consts.CheckpointVolumeName,
Name: commonconsts.CheckpointVolumeName,
MountPath: basePath,
ReadOnly: false, // CRIU needs write access for restore.log and restore-criu.conf
ReadOnly: false,
})
}
// InjectPodInfoVolume adds a Downward API volume for pod identity and DGD info.
// This is critical for CRIU checkpoint/restore scenarios where environment variables
// contain stale values from the checkpoint source pod. The Downward API files
// always reflect the current pod's identity and DGD configuration.
func InjectPodInfoVolume(podSpec *corev1.PodSpec) {
// Check if volume already exists
for _, v := range podSpec.Volumes {
if v.Name == consts.PodInfoVolumeName {
for _, volume := range podSpec.Volumes {
if volume.Name == commonconsts.PodInfoVolumeName {
return
}
}
podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{
Name: consts.PodInfoVolumeName,
Name: commonconsts.PodInfoVolumeName,
VolumeSource: corev1.VolumeSource{
DownwardAPI: &corev1.DownwardAPIVolumeSource{
Items: []corev1.DownwardAPIVolumeFile{
// Pod identity fields
{
Path: "pod_name",
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: consts.PodInfoFieldPodName,
FieldPath: commonconsts.PodInfoFieldPodName,
},
},
{
Path: "pod_uid",
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: consts.PodInfoFieldPodUID,
FieldPath: commonconsts.PodInfoFieldPodUID,
},
},
{
Path: "pod_namespace",
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: consts.PodInfoFieldPodNamespace,
FieldPath: commonconsts.PodInfoFieldPodNamespace,
},
},
// DGD info from annotations (for CRIU restore)
{
Path: consts.PodInfoFileDynNamespace,
Path: commonconsts.PodInfoFileDynNamespace,
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.annotations['" + consts.AnnotationDynNamespace + "']",
FieldPath: "metadata.labels['" + commonconsts.KubeLabelDynamoNamespace + "']",
},
},
{
Path: consts.PodInfoFileDynComponent,
Path: commonconsts.PodInfoFileDynNamespaceWorkerSuffix,
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.annotations['" + consts.AnnotationDynComponent + "']",
FieldPath: "metadata.labels['" + commonconsts.KubeLabelDynamoWorkerHash + "']",
},
},
{
Path: consts.PodInfoFileDynParentDGDName,
Path: commonconsts.PodInfoFileDynComponent,
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.annotations['" + consts.AnnotationDynParentDGDName + "']",
FieldPath: "metadata.labels['" + commonconsts.KubeLabelDynamoComponentType + "']",
},
},
{
Path: consts.PodInfoFileDynParentDGDNS,
Path: commonconsts.PodInfoFileDynParentDGDName,
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.annotations['" + consts.AnnotationDynParentDGDNS + "']",
FieldPath: "metadata.labels['" + commonconsts.KubeLabelDynamoGraphDeploymentName + "']",
},
},
{
Path: consts.PodInfoFileDynDiscoveryBackend,
Path: commonconsts.PodInfoFileDynParentDGDNamespace,
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.annotations['" + consts.AnnotationDynDiscoveryBackend + "']",
FieldPath: commonconsts.PodInfoFieldPodNamespace,
},
},
},
......@@ -308,35 +171,20 @@ func InjectPodInfoVolume(podSpec *corev1.PodSpec) {
})
}
// InjectPodInfoVolumeMount adds the Downward API volume mount to a container.
func InjectPodInfoVolumeMount(container *corev1.Container) {
// Check if mount already exists
for _, m := range container.VolumeMounts {
if m.Name == consts.PodInfoVolumeName {
for _, mount := range container.VolumeMounts {
if mount.Name == commonconsts.PodInfoVolumeName {
return
}
}
container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{
Name: consts.PodInfoVolumeName,
MountPath: consts.PodInfoMountPath,
Name: commonconsts.PodInfoVolumeName,
MountPath: commonconsts.PodInfoMountPath,
ReadOnly: true,
})
}
// InjectCheckpointIntoPodSpec injects checkpoint configuration into a pod spec for
// external restore via the snapshot DaemonSet. The pod image is expected to be a
// runtime-compatible restore image (runtime + CRIU tooling). For ready checkpoints,
// the operator overrides command to `sleep infinity` so the watcher can trigger
// external restore via nsenter + nsrestore.
//
// Modifications applied:
// 1. Security context - seccomp profile (io_uring blocking, matches checkpoint environment)
// 2. Environment variables - checkpoint path and hash
// 3. Storage configuration - checkpoint PVC and Downward API (pod identity)
//
// No hostIPC, no privileged mode — those are only needed when CRIU runs inside the
// container. With external restore, all privilege lives in the DaemonSet.
func InjectCheckpointIntoPodSpec(
podSpec *corev1.PodSpec,
checkpointInfo *CheckpointInfo,
......@@ -351,6 +199,7 @@ func InjectCheckpointIntoPodSpec(
if info.Identity == nil {
return fmt.Errorf("checkpoint enabled but identity is nil and hash is not set")
}
hash, err := ComputeIdentityHash(*info.Identity)
if err != nil {
return fmt.Errorf("failed to compute identity hash: %w", err)
......@@ -358,10 +207,9 @@ func InjectCheckpointIntoPodSpec(
info.Hash = hash
}
// Find the main container (needed for volume mounts and env vars)
var mainContainer *corev1.Container
for i := range podSpec.Containers {
if podSpec.Containers[i].Name == consts.MainContainerName {
if podSpec.Containers[i].Name == commonconsts.MainContainerName {
mainContainer = &podSpec.Containers[i]
break
}
......@@ -373,26 +221,20 @@ func InjectCheckpointIntoPodSpec(
return fmt.Errorf("no container found to inject checkpoint config")
}
// When a ready checkpoint exists, override the container command to sleep infinity.
// The DaemonSet watcher detects this pod via the checkpoint-restore label and
// performs external restore (nsenter + nsrestore). When no checkpoint is ready,
// the original command runs (cold start).
if info.Ready {
mainContainer.Command = []string{"sleep", "infinity"}
mainContainer.Args = nil
}
// Seccomp profile to match checkpoint environment (blocks io_uring syscalls)
if podSpec.SecurityContext == nil {
podSpec.SecurityContext = &corev1.PodSecurityContext{}
}
podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeLocalhost,
LocalhostProfile: ptr.To(consts.SeccompProfilePath),
LocalhostProfile: ptr.To(commonconsts.SeccompProfilePath),
}
// Determine storage type and compute location/path
storageType := configv1alpha1.CheckpointStorageTypePVC // default
storageType := configv1alpha1.CheckpointStorageTypePVC
var storageConfig *configv1alpha1.CheckpointStorageConfiguration
if checkpointConfig != nil {
storageConfig = &checkpointConfig.Storage
......@@ -408,59 +250,30 @@ func InjectCheckpointIntoPodSpec(
return fmt.Errorf("S3 storage type selected but no S3 URI configured (set checkpoint.storage.s3.uri)")
}
info.Location = fmt.Sprintf("%s/%s.tar", storageConfig.S3.URI, info.Hash)
case configv1alpha1.CheckpointStorageTypeOCI:
info.StorageType = nvidiacomv1alpha1.DynamoCheckpointStorageType(storageType)
if storageConfig == nil || storageConfig.OCI.URI == "" {
return fmt.Errorf("OCI storage type selected but no OCI URI configured (set checkpoint.storage.oci.uri)")
}
info.Location = fmt.Sprintf("%s:%s", storageConfig.OCI.URI, info.Hash)
default: // PVC
default:
info.StorageType = nvidiacomv1alpha1.DynamoCheckpointStorageType(storageType)
basePath := getPVCBasePath(storageConfig)
basePath := ""
if storageConfig != nil && storageConfig.PVC.BasePath != "" {
basePath = storageConfig.PVC.BasePath
}
if storageConfig == nil || storageConfig.PVC.PVCName == "" {
return fmt.Errorf("PVC storage type selected but no PVC name configured (set checkpoint.storage.pvc.pvcName)")
}
pvcName := storageConfig.PVC.PVCName
if basePath == "" {
return fmt.Errorf("PVC storage type selected but no PVC base path configured (set checkpoint.storage.pvc.basePath)")
}
info.Location = fmt.Sprintf("%s/%s", basePath, info.Hash)
InjectCheckpointVolume(podSpec, pvcName)
InjectCheckpointVolume(podSpec, storageConfig.PVC.PVCName)
InjectCheckpointVolumeMount(mainContainer, basePath)
}
// Downward API volume for pod identity after CRIU restore
InjectPodInfoVolume(podSpec)
InjectPodInfoVolumeMount(mainContainer)
// Checkpoint environment variables (path, hash)
InjectCheckpointEnvVars(mainContainer, info, checkpointConfig)
return nil
}
// InjectCheckpointLabelsFromConfig adds checkpoint identity labels to a label map based on config.
// Restore trigger labels are injected only when a concrete restore request is prepared.
func InjectCheckpointLabelsFromConfig(labels map[string]string, config *nvidiacomv1alpha1.ServiceCheckpointConfig) (map[string]string, error) {
if config == nil || !config.Enabled {
return labels, nil
}
if labels == nil {
labels = make(map[string]string)
}
// Compute hash from identity if provided
if config.Identity != nil {
hash, err := ComputeIdentityHash(*config.Identity)
if err != nil {
return nil, fmt.Errorf("failed to compute identity hash for labels: %w", err)
}
labels[consts.KubeLabelCheckpointHash] = hash
}
return labels, nil
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package checkpoint
import (
"context"
"fmt"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)
// CheckpointInfo is the resolved checkpoint state for a service, as produced by
// checkpointInfoFromObject and ResolveCheckpointForService.
type CheckpointInfo struct {
	// Enabled is false when the service has no checkpoint config or the config is disabled.
	Enabled bool
	// Exists is set when the info was built from an existing DynamoCheckpoint object.
	Exists bool
	// Identity is the checkpoint identity, taken from the service config or the
	// DynamoCheckpoint spec.
	Identity *nvidiacomv1alpha1.DynamoCheckpointIdentity
	// Hash is the identity hash: the value recorded in the checkpoint status when
	// present, otherwise computed from the identity.
	Hash string
	// Location is copied from the DynamoCheckpoint status.
	Location string
	// StorageType is copied from the DynamoCheckpoint status.
	StorageType nvidiacomv1alpha1.DynamoCheckpointStorageType
	// CheckpointName is the name of the backing DynamoCheckpoint object.
	CheckpointName string
	// Ready is true when the DynamoCheckpoint status phase is Ready.
	Ready bool
}
// checkpointInfoFromObject builds a CheckpointInfo from an existing
// DynamoCheckpoint. The result is always marked Enabled and Exists; Ready
// reflects whether the checkpoint's status phase is Ready. It returns an error
// only when the checkpoint's identity hash cannot be determined.
func checkpointInfoFromObject(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (*CheckpointInfo, error) {
	identityHash, hashErr := checkpointIdentityHash(ckpt)
	if hashErr != nil {
		return nil, hashErr
	}

	info := &CheckpointInfo{Enabled: true, Exists: true}
	info.Identity = &ckpt.Spec.Identity
	info.Hash = identityHash
	info.Location = ckpt.Status.Location
	info.StorageType = ckpt.Status.StorageType
	info.CheckpointName = ckpt.Name
	info.Ready = ckpt.Status.Phase == nvidiacomv1alpha1.DynamoCheckpointPhaseReady
	return info, nil
}
// ResolveCheckpointForService determines the checkpoint state for a service.
//
// Resolution order:
//  1. A nil or disabled config yields a CheckpointInfo with Enabled=false.
//  2. A non-empty CheckpointRef is fetched by name from namespace and converted.
//  3. Otherwise an Identity is required; its hash is used to look up an existing
//     checkpoint. If none exists, the returned info carries only the identity
//     and hash. If one exists, its info is returned with Identity replaced by
//     the identity from config.
func ResolveCheckpointForService(
	ctx context.Context,
	c client.Client,
	namespace string,
	config *nvidiacomv1alpha1.ServiceCheckpointConfig,
) (*CheckpointInfo, error) {
	if config == nil || !config.Enabled {
		return &CheckpointInfo{Enabled: false}, nil
	}

	// An explicit reference takes precedence over identity-based lookup.
	if ref := config.CheckpointRef; ref != nil && *ref != "" {
		referenced := &nvidiacomv1alpha1.DynamoCheckpoint{}
		key := types.NamespacedName{Namespace: namespace, Name: *ref}
		if err := c.Get(ctx, key, referenced); err != nil {
			return nil, fmt.Errorf("failed to get referenced checkpoint %s: %w", *ref, err)
		}
		return checkpointInfoFromObject(referenced)
	}

	if config.Identity == nil {
		return nil, fmt.Errorf("checkpoint enabled but no checkpointRef or identity provided")
	}

	identityHash, err := ComputeIdentityHash(*config.Identity)
	if err != nil {
		return nil, fmt.Errorf("failed to compute identity hash: %w", err)
	}

	match, err := FindCheckpointByIdentityHash(ctx, c, namespace, identityHash, "")
	if err != nil {
		return nil, err
	}

	if match != nil {
		resolved, err := checkpointInfoFromObject(match)
		if err != nil {
			return nil, err
		}
		// Report the identity the caller asked for rather than the stored one.
		resolved.Identity = config.Identity
		return resolved, nil
	}

	// No checkpoint exists yet for this identity.
	return &CheckpointInfo{
		Enabled:  true,
		Identity: config.Identity,
		Hash:     identityHash,
	}, nil
}
// ResolveCheckpointStorage computes the checkpoint location for hash under the
// configured storage backend and returns it together with the storage type.
//
// The storage type defaults to PVC when config is nil or leaves the type empty.
// Locations are formatted as "<s3 uri>/<hash>.tar" for S3, "<oci uri>:<hash>"
// for OCI, and "<pvc base path>/<hash>" for everything else. An error is
// returned when the URI or base path required by the selected backend is not set.
func ResolveCheckpointStorage(
	hash string,
	config *configv1alpha1.CheckpointConfiguration,
) (string, nvidiacomv1alpha1.DynamoCheckpointStorageType, error) {
	selected := configv1alpha1.CheckpointStorageTypePVC
	if config != nil && config.Storage.Type != "" {
		selected = config.Storage.Type
	}
	resolvedType := nvidiacomv1alpha1.DynamoCheckpointStorageType(selected)

	if selected == configv1alpha1.CheckpointStorageTypeS3 {
		if config == nil || config.Storage.S3.URI == "" {
			return "", "", fmt.Errorf("S3 storage type selected but no S3 URI configured (set checkpoint.storage.s3.uri)")
		}
		location := fmt.Sprintf("%s/%s.tar", config.Storage.S3.URI, hash)
		return location, resolvedType, nil
	}

	if selected == configv1alpha1.CheckpointStorageTypeOCI {
		if config == nil || config.Storage.OCI.URI == "" {
			return "", "", fmt.Errorf("OCI storage type selected but no OCI URI configured (set checkpoint.storage.oci.uri)")
		}
		location := fmt.Sprintf("%s:%s", config.Storage.OCI.URI, hash)
		return location, resolvedType, nil
	}

	// Any other storage type is handled as PVC.
	if config == nil || config.Storage.PVC.BasePath == "" {
		return "", "", fmt.Errorf("PVC storage type selected but no PVC base path configured (set checkpoint.storage.pvc.basePath)")
	}
	location := fmt.Sprintf("%s/%s", config.Storage.PVC.BasePath, hash)
	return location, resolvedType, nil
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package checkpoint
import (
"context"
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)
// checkpointIdentityHash returns the identity hash for ckpt. The hash recorded
// in the checkpoint's status is used when present; otherwise it is computed
// from the spec identity.
func checkpointIdentityHash(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (string, error) {
	if recorded := ckpt.Status.IdentityHash; recorded != "" {
		return recorded, nil
	}

	computed, err := ComputeIdentityHash(ckpt.Spec.Identity)
	if err == nil {
		return computed, nil
	}
	return "", fmt.Errorf("failed to compute checkpoint hash for %s: %w", ckpt.Name, err)
}
// FindCheckpointByIdentityHash looks up the single DynamoCheckpoint in
// namespace whose identity hash equals hash, ignoring any checkpoint named
// excludeName.
//
// It first lists checkpoints carrying the hash label. If that finds nothing, it
// lists every checkpoint in the namespace and compares identity hashes, so
// checkpoints without the label still resolve. It returns (nil, nil) when no
// checkpoint matches and an error when more than one does. The returned object
// is a deep copy.
func FindCheckpointByIdentityHash(
	ctx context.Context,
	c client.Client,
	namespace string,
	hash string,
	excludeName string,
) (*nvidiacomv1alpha1.DynamoCheckpoint, error) {
	var match *nvidiacomv1alpha1.DynamoCheckpoint

	// First pass: rely on the hash label.
	var labeled nvidiacomv1alpha1.DynamoCheckpointList
	err := c.List(
		ctx,
		&labeled,
		client.InNamespace(namespace),
		client.MatchingLabels{consts.KubeLabelCheckpointHash: hash},
	)
	if err != nil {
		return nil, fmt.Errorf("failed to list checkpoints by hash label: %w", err)
	}
	for i := range labeled.Items {
		candidate := &labeled.Items[i]
		if candidate.Name == excludeName {
			continue
		}
		if match != nil {
			return nil, fmt.Errorf("multiple checkpoints found for identity hash %s", hash)
		}
		match = candidate.DeepCopy()
	}
	if match != nil {
		return match, nil
	}

	// Second pass: scan the whole namespace so legacy checkpoints that lack the
	// hash label are still found.
	var all nvidiacomv1alpha1.DynamoCheckpointList
	if err := c.List(ctx, &all, client.InNamespace(namespace)); err != nil {
		return nil, fmt.Errorf("failed to list checkpoints: %w", err)
	}
	for i := range all.Items {
		candidate := &all.Items[i]
		if candidate.Name == excludeName {
			continue
		}

		candidateHash, err := checkpointIdentityHash(candidate)
		if err != nil {
			return nil, err
		}
		if candidateHash != hash {
			continue
		}

		if match != nil {
			return nil, fmt.Errorf("multiple checkpoints found for identity hash %s", hash)
		}
		match = candidate.DeepCopy()
	}

	return match, nil
}
// CreateOrGetAutoCheckpoint ensures a DynamoCheckpoint exists for identity in
// namespace and returns it.
//
// The checkpoint is named "checkpoint-<hash>" and labeled with the identity
// hash. Outcomes:
//   - Create succeeds and no other checkpoint shares the hash: the new
//     checkpoint is returned.
//   - Create succeeds but another checkpoint already has the same hash: the new
//     checkpoint is deleted and the other one is returned.
//   - Create reports AlreadyExists: the existing object is fetched and returned,
//     provided its identity hash matches; a mismatch is an error.
//
// If the duplicate lookup itself fails, the freshly created checkpoint is
// deleted before the lookup error is returned.
func CreateOrGetAutoCheckpoint(
	ctx context.Context,
	c client.Client,
	namespace string,
	identity nvidiacomv1alpha1.DynamoCheckpointIdentity,
	podTemplate corev1.PodTemplateSpec,
) (*nvidiacomv1alpha1.DynamoCheckpoint, error) {
	hash, err := ComputeIdentityHash(identity)
	if err != nil {
		return nil, fmt.Errorf("failed to compute identity hash: %w", err)
	}

	created := &nvidiacomv1alpha1.DynamoCheckpoint{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("checkpoint-%s", hash),
			Namespace: namespace,
			Labels:    map[string]string{consts.KubeLabelCheckpointHash: hash},
		},
		Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{
			Identity: identity,
			Job:      nvidiacomv1alpha1.DynamoCheckpointJobConfig{PodTemplateSpec: podTemplate},
		},
	}

	createErr := c.Create(ctx, created)
	if createErr != nil && !apierrors.IsAlreadyExists(createErr) {
		return nil, fmt.Errorf("failed to create checkpoint %s: %w", created.Name, createErr)
	}

	if createErr != nil {
		// Name collision: reuse the stored object only if it represents the same identity.
		stored := &nvidiacomv1alpha1.DynamoCheckpoint{}
		lookupKey := types.NamespacedName{Name: created.Name, Namespace: namespace}
		if err := c.Get(ctx, lookupKey, stored); err != nil {
			return nil, fmt.Errorf("failed to get checkpoint %s after already exists: %w", created.Name, err)
		}
		storedHash, err := checkpointIdentityHash(stored)
		if err != nil {
			return nil, err
		}
		if storedHash != hash {
			return nil, fmt.Errorf("checkpoint %s already exists with identity hash %s", created.Name, storedHash)
		}
		return stored, nil
	}

	// Creation succeeded; check whether a differently named checkpoint already
	// covers this identity.
	duplicate, lookupErr := FindCheckpointByIdentityHash(ctx, c, namespace, hash, created.Name)
	if lookupErr != nil {
		deleteErr := c.Delete(ctx, created)
		if deleteErr != nil && !apierrors.IsNotFound(deleteErr) {
			return nil, fmt.Errorf("failed to clean up checkpoint %s after dedupe error: %v (lookup error: %w)", created.Name, deleteErr, lookupErr)
		}
		return nil, lookupErr
	}
	if duplicate == nil {
		return created, nil
	}

	// Prefer the pre-existing checkpoint and drop the one just created.
	if err := c.Delete(ctx, created); err != nil && !apierrors.IsNotFound(err) {
		return nil, fmt.Errorf("failed to delete duplicate checkpoint %s: %w", created.Name, err)
	}
	return duplicate, nil
}
......@@ -144,17 +144,14 @@ const (
// deploy/snapshot/pkg/config/constants.go. If you change a value here, update there too.
// Kubernetes labels
KubeLabelIsCheckpointSource = "nvidia.com/snapshot-is-checkpoint-source" // Pod label that triggers DaemonSet auto-checkpoint
KubeLabelCheckpointHash = "nvidia.com/snapshot-checkpoint-hash" // Checkpoint identity hash (= DynamoCheckpoint CR name)
KubeLabelIsRestoreTarget = "nvidia.com/snapshot-is-restore-target" // Pod label that triggers DaemonSet auto-restore
KubeLabelIsCheckpointSource = "nvidia.com/snapshot-is-checkpoint-source" // Pod label that triggers DaemonSet auto-checkpoint
KubeLabelCheckpointHash = "nvidia.com/snapshot-checkpoint-hash" // Checkpoint identity hash used for lookup/reuse (may differ from DynamoCheckpoint metadata.name)
KubeLabelIsRestoreTarget = "nvidia.com/snapshot-is-restore-target" // Pod label that triggers DaemonSet auto-restore
KubeAnnotationCheckpointLocation = "nvidia.com/snapshot-checkpoint-location" // Pod annotation that tells snapshot-agent where the checkpoint lives
KubeAnnotationCheckpointStorageType = "nvidia.com/snapshot-checkpoint-storage-type" // Pod annotation that tells snapshot-agent which storage backend owns the checkpoint
// Environment variables injected into pods
EnvCheckpointStorageType = "DYN_CHECKPOINT_STORAGE_TYPE" // Storage backend (pvc, s3, oci) — checkpoint job pods only
EnvCheckpointLocation = "DYN_CHECKPOINT_LOCATION" // Full checkpoint URI — future S3/OCI; for PVC, use PATH+HASH instead
EnvCheckpointPath = "DYN_CHECKPOINT_PATH" // Base checkpoint directory (e.g., /checkpoints) — PVC restored pods
EnvCheckpointHash = "DYN_CHECKPOINT_HASH" // Identity hash — all checkpoint-related pods
EnvReadyForCheckpointFile = "DYN_READY_FOR_CHECKPOINT_FILE" // Ready-for-checkpoint file path — checkpoint job pods
EnvSkipWaitForCheckpoint = "SKIP_WAIT_FOR_CHECKPOINT" // Skip polling, check once — restored/DGD pods
// Checkpoint pod-internal constants
CheckpointVolumeName = "checkpoint-storage" // Pod-internal volume name for checkpoint PVC
......@@ -173,19 +170,12 @@ const (
PodInfoFieldPodUID = "metadata.uid"
PodInfoFieldPodNamespace = "metadata.namespace"
// Downward API file names for DGD annotations
PodInfoFileDynNamespace = "dyn_namespace"
PodInfoFileDynComponent = "dyn_component"
PodInfoFileDynParentDGDName = "dyn_parent_dgd_name"
PodInfoFileDynParentDGDNS = "dyn_parent_dgd_namespace"
PodInfoFileDynDiscoveryBackend = "dyn_discovery_backend"
// Annotation keys for DGD info (exposed via Downward API)
AnnotationDynNamespace = "nvidia.com/dyn-namespace"
AnnotationDynComponent = "nvidia.com/dyn-component"
AnnotationDynParentDGDName = "nvidia.com/dyn-parent-dgd-name"
AnnotationDynParentDGDNS = "nvidia.com/dyn-parent-dgd-namespace"
AnnotationDynDiscoveryBackend = "nvidia.com/dyn-discovery-backend"
// Downward API file names for restore identity
PodInfoFileDynNamespace = "dyn_namespace"
PodInfoFileDynNamespaceWorkerSuffix = "dyn_namespace_worker_suffix"
PodInfoFileDynComponent = "dyn_component"
PodInfoFileDynParentDGDName = "dyn_parent_dgd_k8s_name"
PodInfoFileDynParentDGDNamespace = "dyn_parent_dgd_k8s_namespace"
// Rolling update annotations
AnnotationCurrentWorkerHash = "nvidia.com/current-worker-hash"
......
......@@ -1068,19 +1068,19 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
maps.Copy(podAnnotations, extraPodMetadata.Annotations)
maps.Copy(podLabels, extraPodMetadata.Labels)
}
// Restore labels are operator-controlled. Clear any stale/user-provided
// value after metadata merge; the controller re-adds it only when the
// checkpoint contract below is satisfied.
delete(podLabels, commonconsts.KubeLabelIsRestoreTarget)
// Explicit restore orchestration contract:
// only mark pods as restore targets when checkpoint material is ready.
if checkpointInfo != nil && checkpointInfo.Enabled && checkpointInfo.Ready {
podLabels[commonconsts.KubeLabelIsRestoreTarget] = commonconsts.KubeLabelValueTrue
if checkpointInfo.Hash != "" {
podLabels[commonconsts.KubeLabelCheckpointHash] = checkpointInfo.Hash
}
podLabels[commonconsts.KubeLabelDynamoGraphDeploymentName] = opt.dynamoComponentDeployment.Spec.Labels[commonconsts.KubeLabelDynamoGraphDeploymentName]
if opt.dynamoComponentDeployment.Spec.ComponentType != "" {
podLabels[commonconsts.KubeLabelDynamoComponentType] = opt.dynamoComponentDeployment.Spec.ComponentType
}
if opt.dynamoComponentDeployment.Spec.DynamoNamespace != nil && *opt.dynamoComponentDeployment.Spec.DynamoNamespace != "" {
podLabels[commonconsts.KubeLabelDynamoNamespace] = *opt.dynamoComponentDeployment.Spec.DynamoNamespace
}
if workerHash := opt.dynamoComponentDeployment.Spec.Labels[commonconsts.KubeLabelDynamoWorkerHash]; workerHash != "" {
podLabels[commonconsts.KubeLabelDynamoWorkerHash] = workerHash
}
// Restore labels are operator-controlled state. Clear stale values after
// metadata merge and only reapply them when checkpoint material is ready.
checkpoint.ApplyRestorePodMetadata(podLabels, podAnnotations, checkpointInfo)
// Propagate restart annotation to pod template to trigger rolling restart
// This is the same mechanism used by kubectl rollout restart
......
......@@ -25,6 +25,7 @@ import (
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo"
......@@ -724,10 +725,11 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
LeaderTemplate: &corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"instance-id": "0",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
"role": "leader",
"nvidia.com/label1": "label1",
"instance-id": "0",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
"role": "leader",
"nvidia.com/label1": "label1",
commonconsts.KubeLabelDynamoNamespace: "default-test-lws-deploy",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker,
commonconsts.KubeLabelDynamoSubComponentType: "test-sub-component",
commonconsts.KubeLabelDynamoGraphDeploymentName: "",
......@@ -865,10 +867,11 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
WorkerTemplate: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"instance-id": "0",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
"role": "worker",
"nvidia.com/label1": "label1",
"instance-id": "0",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
"role": "worker",
"nvidia.com/label1": "label1",
commonconsts.KubeLabelDynamoNamespace: "default-test-lws-deploy",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker,
commonconsts.KubeLabelDynamoSubComponentType: "test-sub-component",
commonconsts.KubeLabelDynamoGraphDeploymentName: "",
......@@ -1267,6 +1270,7 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe
DynamoNamespace: ptr.To("default"),
Labels: map[string]string{
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dgd",
commonconsts.KubeLabelDynamoWorkerHash: "workerhash",
commonconsts.KubeLabelIsRestoreTarget: commonconsts.KubeLabelValueTrue,
},
Checkpoint: &v1alpha1.ServiceCheckpointConfig{
......@@ -1308,16 +1312,20 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe
}
t.Run("ready checkpoint adds explicit restore labels", func(t *testing.T) {
checkpointName := "ckpt-ready"
identity := v1alpha1.DynamoCheckpointIdentity{Model: "test-model", BackendFramework: "vllm"}
checkpointName, err := checkpoint.ComputeIdentityHash(identity)
if err != nil {
t.Fatalf("ComputeIdentityHash failed: %v", err)
}
dcd := makeDCD(checkpointName)
ckpt := &v1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{
Name: checkpointName,
Namespace: "default",
},
Spec: v1alpha1.DynamoCheckpointSpec{Identity: identity},
Status: v1alpha1.DynamoCheckpointStatus{
Phase: v1alpha1.DynamoCheckpointPhaseReady,
IdentityHash: "hash-ready-1",
Phase: v1alpha1.DynamoCheckpointPhaseReady,
},
}
......@@ -1334,22 +1342,76 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe
if got := podTemplateSpec.Labels[commonconsts.KubeLabelIsRestoreTarget]; got != commonconsts.KubeLabelValueTrue {
t.Fatalf("expected %s label to be true, got %q", commonconsts.KubeLabelIsRestoreTarget, got)
}
if got := podTemplateSpec.Labels[commonconsts.KubeLabelCheckpointHash]; got != "hash-ready-1" {
if got := podTemplateSpec.Labels[commonconsts.KubeLabelCheckpointHash]; got != checkpointName {
t.Fatalf("expected %s to be checkpoint hash, got %q", commonconsts.KubeLabelCheckpointHash, got)
}
})
t.Run("operator reasserts restore identity labels after metadata merge", func(t *testing.T) {
identity := v1alpha1.DynamoCheckpointIdentity{Model: "test-model", BackendFramework: "vllm"}
checkpointName, err := checkpoint.ComputeIdentityHash(identity)
if err != nil {
t.Fatalf("ComputeIdentityHash failed: %v", err)
}
dcd := makeDCD(checkpointName)
dcd.Spec.ExtraPodMetadata = &v1alpha1.ExtraPodMetadata{
Labels: map[string]string{
commonconsts.KubeLabelDynamoNamespace: "wrong-namespace",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeFrontend,
commonconsts.KubeLabelDynamoGraphDeploymentName: "wrong-dgd",
commonconsts.KubeLabelDynamoWorkerHash: "wrong-hash",
},
}
ckpt := &v1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{
Name: checkpointName,
Namespace: "default",
},
Spec: v1alpha1.DynamoCheckpointSpec{Identity: identity},
Status: v1alpha1.DynamoCheckpointStatus{
Phase: v1alpha1.DynamoCheckpointPhaseReady,
},
}
r := makeReconciler(dcd, ckpt)
podTemplateSpec, err := r.generatePodTemplateSpec(
context.Background(),
generateResourceOption{dynamoComponentDeployment: dcd},
dynamo.RoleMain,
)
if err != nil {
t.Fatalf("generatePodTemplateSpec failed: %v", err)
}
if got := podTemplateSpec.Labels[commonconsts.KubeLabelDynamoNamespace]; got != defaultNamespace {
t.Fatalf("expected %s label to be %q, got %q", commonconsts.KubeLabelDynamoNamespace, "default", got)
}
if got := podTemplateSpec.Labels[commonconsts.KubeLabelDynamoComponentType]; got != commonconsts.ComponentTypeWorker {
t.Fatalf("expected %s label to be %q, got %q", commonconsts.KubeLabelDynamoComponentType, commonconsts.ComponentTypeWorker, got)
}
if got := podTemplateSpec.Labels[commonconsts.KubeLabelDynamoGraphDeploymentName]; got != "test-dgd" {
t.Fatalf("expected %s label to be %q, got %q", commonconsts.KubeLabelDynamoGraphDeploymentName, "test-dgd", got)
}
if got := podTemplateSpec.Labels[commonconsts.KubeLabelDynamoWorkerHash]; got != "workerhash" {
t.Fatalf("expected %s label to be %q, got %q", commonconsts.KubeLabelDynamoWorkerHash, "workerhash", got)
}
})
t.Run("non-ready checkpoint clears stale restore labels", func(t *testing.T) {
checkpointName := "ckpt-pending"
identity := v1alpha1.DynamoCheckpointIdentity{Model: "test-model", BackendFramework: "vllm"}
checkpointName, err := checkpoint.ComputeIdentityHash(identity)
if err != nil {
t.Fatalf("ComputeIdentityHash failed: %v", err)
}
dcd := makeDCD(checkpointName)
ckpt := &v1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{
Name: checkpointName,
Namespace: "default",
},
Spec: v1alpha1.DynamoCheckpointSpec{Identity: identity},
Status: v1alpha1.DynamoCheckpointStatus{
Phase: v1alpha1.DynamoCheckpointPhaseCreating,
IdentityHash: "hash-pending-1",
Phase: v1alpha1.DynamoCheckpointPhaseCreating,
},
}
......@@ -1440,16 +1502,20 @@ func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy(
}
t.Run("ready checkpoint forces Recreate strategy", func(t *testing.T) {
checkpointName := "ckpt-ready"
identity := v1alpha1.DynamoCheckpointIdentity{Model: "test-model", BackendFramework: "vllm"}
checkpointName, err := checkpoint.ComputeIdentityHash(identity)
if err != nil {
t.Fatalf("ComputeIdentityHash failed: %v", err)
}
dcd := makeDCD(checkpointName)
ckpt := &v1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{
Name: checkpointName,
Namespace: "default",
},
Spec: v1alpha1.DynamoCheckpointSpec{Identity: identity},
Status: v1alpha1.DynamoCheckpointStatus{
Phase: v1alpha1.DynamoCheckpointPhaseReady,
IdentityHash: "hash-ready-1",
Phase: v1alpha1.DynamoCheckpointPhaseReady,
},
}
......@@ -1469,16 +1535,20 @@ func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy(
})
t.Run("non-ready checkpoint keeps RollingUpdate strategy", func(t *testing.T) {
checkpointName := "ckpt-creating"
identity := v1alpha1.DynamoCheckpointIdentity{Model: "test-model", BackendFramework: "vllm"}
checkpointName, err := checkpoint.ComputeIdentityHash(identity)
if err != nil {
t.Fatalf("ComputeIdentityHash failed: %v", err)
}
dcd := makeDCD(checkpointName)
ckpt := &v1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{
Name: checkpointName,
Namespace: "default",
},
Spec: v1alpha1.DynamoCheckpointSpec{Identity: identity},
Status: v1alpha1.DynamoCheckpointStatus{
Phase: v1alpha1.DynamoCheckpointPhaseCreating,
IdentityHash: "hash-creating-1",
Phase: v1alpha1.DynamoCheckpointPhaseCreating,
},
}
......@@ -2068,6 +2138,100 @@ func Test_reconcileDeploymentResources(t *testing.T) {
}
}
// Test_reconcileDeploymentResources_DoesNotRecycleFailedRestorePods seeds a
// DynamoComponentDeployment together with a same-named Deployment whose single
// replica is updated but neither ready nor available, then asserts that
// reconcileDeploymentResources reports "DeploymentNotReady" and mirrors the
// Deployment's replica counters into the returned service replica status.
func Test_reconcileDeploymentResources_DoesNotRecycleFailedRestorePods(t *testing.T) {
	const (
		componentName = "test-component"
		testNamespace = "default"
	)

	g := gomega.NewGomegaWithT(t)

	// The fake client can only serve kinds that are registered in its scheme.
	testScheme := scheme.Scheme
	g.Expect(v1alpha1.AddToScheme(testScheme)).To(gomega.Succeed())
	g.Expect(appsv1.AddToScheme(testScheme)).To(gomega.Succeed())
	g.Expect(corev1.AddToScheme(testScheme)).To(gomega.Succeed())

	desiredReplicas := int32(1)
	mainContainer := &corev1.Container{
		Image: "test-image:latest",
		Args:  []string{"--test-arg"},
	}
	component := &v1alpha1.DynamoComponentDeployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      componentName,
			Namespace: testNamespace,
		},
		Spec: v1alpha1.DynamoComponentDeploymentSpec{
			BackendFramework: string(dynamo.BackendFrameworkVLLM),
			DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
				ServiceName:     "test-service",
				DynamoNamespace: ptr.To("default"),
				ComponentType:   string(commonconsts.ComponentTypeDecode),
				Replicas:        &desiredReplicas,
				ExtraPodSpec: &v1alpha1.ExtraPodSpec{
					MainContainer: mainContainer,
				},
			},
		},
	}

	// Pre-existing Deployment: observed, rolled out to one replica, but that
	// replica never became ready or available.
	unavailable := appsv1.DeploymentCondition{
		Type:   appsv1.DeploymentAvailable,
		Status: corev1.ConditionFalse,
	}
	existingDeployment := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:       componentName,
			Namespace:  testNamespace,
			Generation: 1,
		},
		Spec: appsv1.DeploymentSpec{
			Replicas: ptr.To(int32(1)),
		},
		Status: appsv1.DeploymentStatus{
			ObservedGeneration: 1,
			Replicas:           1,
			UpdatedReplicas:    1,
			ReadyReplicas:      0,
			AvailableReplicas:  0,
			Conditions:         []appsv1.DeploymentCondition{unavailable},
		},
	}

	clientBuilder := fake.NewClientBuilder().WithScheme(testScheme)
	clientBuilder = clientBuilder.WithObjects(component, existingDeployment)
	clientBuilder = clientBuilder.WithStatusSubresource(component, existingDeployment)

	noSecrets := &mockDockerSecretRetriever{
		GetSecretsFunc: func(_, _ string) ([]string, error) {
			return []string{}, nil
		},
	}
	reconciler := &DynamoComponentDeploymentReconciler{
		Client:                clientBuilder.Build(),
		Recorder:              record.NewFakeRecorder(100),
		Config:                &configv1alpha1.OperatorConfiguration{},
		RuntimeConfig:         &controller_common.RuntimeConfig{},
		DockerSecretRetriever: noSecrets,
	}

	want := ComponentReconcileResult{
		modified: true,
		status:   metav1.ConditionFalse,
		reason:   "DeploymentNotReady",
		message:  "Deployment is not ready",
		serviceReplicaStatus: &v1alpha1.ServiceReplicaStatus{
			ComponentKind:     v1alpha1.ComponentKindDeployment,
			ComponentName:     componentName,
			ComponentNames:    []string{componentName},
			Replicas:          1,
			UpdatedReplicas:   1,
			ReadyReplicas:     ptr.To(int32(0)),
			AvailableReplicas: ptr.To(int32(0)),
		},
	}

	got, err := reconciler.reconcileDeploymentResources(context.Background(), component)
	g.Expect(err).NotTo(gomega.HaveOccurred())
	g.Expect(got).To(gomega.Equal(want))
}
func Test_setStatusConditionAndServiceReplicaStatus(t *testing.T) {
ctx := context.Background()
......
......@@ -87,6 +87,7 @@ type DynamoGraphDeploymentReconciler struct {
// +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch
// +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list
// +kubebuilder:rbac:groups=inference.networking.k8s.io,resources=inferencepools,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
......@@ -1202,12 +1203,15 @@ func (r *DynamoGraphDeploymentReconciler) reconcilePVCs(ctx context.Context, dyn
return nil
}
// reconcileCheckpoints reconciles Checkpoint CRs for services with checkpointing enabled
// For Auto mode, it creates Checkpoint CRs if they don't exist
// Returns a map of service names to checkpoint status and a map of service names to checkpoint info
func (r *DynamoGraphDeploymentReconciler) reconcileCheckpoints(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (map[string]nvidiacomv1alpha1.ServiceCheckpointStatus, map[string]*checkpoint.CheckpointInfo, error) {
// reconcileCheckpoints reconciles Checkpoint CRs for services with checkpointing enabled.
// For Auto mode, it creates Checkpoint CRs if they do not exist.
// Returns per-service checkpoint status and resolved checkpoint info.
func (r *DynamoGraphDeploymentReconciler) reconcileCheckpoints(
ctx context.Context,
dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment,
) (map[string]nvidiacomv1alpha1.ServiceCheckpointStatus, map[string]*checkpoint.CheckpointInfo, error) {
logger := log.FromContext(ctx)
statuses := make(map[string]nvidiacomv1alpha1.ServiceCheckpointStatus)
checkpointStatuses := make(map[string]nvidiacomv1alpha1.ServiceCheckpointStatus)
checkpointInfos := make(map[string]*checkpoint.CheckpointInfo)
for serviceName, component := range dynamoDeployment.Spec.Services {
......@@ -1227,8 +1231,13 @@ func (r *DynamoGraphDeploymentReconciler) reconcileCheckpoints(ctx context.Conte
// Store checkpoint info for later use in pod spec generation
checkpointInfos[serviceName] = info
// If no checkpoint found and mode is Auto, create one
if info.CheckpointName == "" && component.Checkpoint.Mode == nvidiacomv1alpha1.CheckpointModeAuto {
// checkpointRef is authoritative. Auto mode should only create the canonical checkpoint
// when the service is using identity-based lookup.
if component.Checkpoint.Mode == nvidiacomv1alpha1.CheckpointModeAuto &&
(component.Checkpoint.CheckpointRef == nil || *component.Checkpoint.CheckpointRef == "") &&
!info.Exists &&
info.Identity != nil &&
!info.Ready {
logger.Info("Creating DynamoCheckpoint CR in Auto mode", "service", serviceName)
ckpt, err := r.createCheckpointCR(ctx, dynamoDeployment, serviceName, component)
......@@ -1236,28 +1245,22 @@ func (r *DynamoGraphDeploymentReconciler) reconcileCheckpoints(ctx context.Conte
logger.Error(err, "Failed to create DynamoCheckpoint CR", "service", serviceName)
return nil, nil, fmt.Errorf("failed to create checkpoint for service %s: %w", serviceName, err)
}
info.Exists = true
info.CheckpointName = ckpt.Name
// Compute hash locally since status may not be populated yet
// (checkpoint controller reconciles asynchronously)
hash, err := checkpoint.ComputeIdentityHash(*component.Checkpoint.Identity)
if err != nil {
logger.Error(err, "Failed to compute checkpoint identity hash", "service", serviceName)
return nil, nil, fmt.Errorf("failed to compute checkpoint hash for service %s: %w", serviceName, err)
if info.Hash == "" {
info.Hash = ckpt.Status.IdentityHash
}
info.Hash = hash
info.Ready = false // Newly created checkpoint is not ready yet
info.Ready = false
}
// Update status
statuses[serviceName] = nvidiacomv1alpha1.ServiceCheckpointStatus{
checkpointStatuses[serviceName] = nvidiacomv1alpha1.ServiceCheckpointStatus{
CheckpointName: info.CheckpointName,
IdentityHash: info.Hash,
Ready: info.Ready,
}
}
return statuses, checkpointInfos, nil
return checkpointStatuses, checkpointInfos, nil
}
// createCheckpointCR creates a DynamoCheckpoint CR for a service in Auto mode
......@@ -1273,70 +1276,36 @@ func (r *DynamoGraphDeploymentReconciler) createCheckpointCR(
identity := component.Checkpoint.Identity
// Compute hash for naming
hash, err := checkpoint.ComputeIdentityHash(*identity)
if err != nil {
return nil, fmt.Errorf("failed to compute identity hash: %w", err)
}
// Generate checkpoint name: use hash directly (16 chars, 64 bits)
// This allows natural deduplication - same identity = same checkpoint name
// 16 characters provides excellent collision resistance (1% at 500M configs)
ckptName := hash
// Use SyncResource to create/update the DynamoCheckpoint CR
// Pass nil as parentResource to create an independent checkpoint (no owner reference)
// This ensures the checkpoint persists even if the DGD is deleted
_, ckpt, err := commoncontroller.SyncResource(ctx, r, nil, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoCheckpoint, bool, error) {
// Build the checkpoint identity from service identity
checkpointIdentity := nvidiacomv1alpha1.DynamoCheckpointIdentity{
Model: identity.Model,
BackendFramework: identity.BackendFramework,
DynamoVersion: identity.DynamoVersion,
TensorParallelSize: identity.TensorParallelSize,
PipelineParallelSize: identity.PipelineParallelSize,
Dtype: identity.Dtype,
MaxModelLen: identity.MaxModelLen,
ExtraParameters: identity.ExtraParameters,
}
// Build pod template from service spec for checkpoint job
// This uses GenerateBasePodSpec to ensure same config as worker pods (image pull secrets, etc.)
// Pass framework from checkpoint identity for accurate backend detection
podTemplate, err := r.buildCheckpointJobPodTemplate(
dynamoDeployment,
component,
serviceName,
identity.BackendFramework, // Use framework from checkpoint identity
)
if err != nil {
return nil, false, fmt.Errorf("failed to build checkpoint job pod template: %w", err)
}
checkpointIdentity := nvidiacomv1alpha1.DynamoCheckpointIdentity{
Model: identity.Model,
BackendFramework: identity.BackendFramework,
DynamoVersion: identity.DynamoVersion,
TensorParallelSize: identity.TensorParallelSize,
PipelineParallelSize: identity.PipelineParallelSize,
Dtype: identity.Dtype,
MaxModelLen: identity.MaxModelLen,
ExtraParameters: identity.ExtraParameters,
}
ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{
Name: ckptName,
Namespace: dynamoDeployment.Namespace,
Labels: map[string]string{
consts.KubeLabelDynamoGraphDeploymentName: dynamoDeployment.Name,
consts.KubeLabelDynamoComponent: serviceName,
consts.KubeLabelCheckpointHash: hash,
},
},
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{
Identity: checkpointIdentity,
Job: nvidiacomv1alpha1.DynamoCheckpointJobConfig{
PodTemplateSpec: podTemplate,
},
},
}
return ckpt, false, nil
})
// Capture config is not part of the checkpoint identity. Once a checkpoint object exists for a
// hash, later reconcilers must reuse it instead of racing to overwrite the capture pod template.
podTemplate, err := r.buildCheckpointJobPodTemplate(
dynamoDeployment,
component,
serviceName,
identity.BackendFramework,
)
if err != nil {
return nil, fmt.Errorf("failed to sync checkpoint CR: %w", err)
return nil, fmt.Errorf("failed to build checkpoint job pod template: %w", err)
}
return ckpt, nil
return checkpoint.CreateOrGetAutoCheckpoint(
ctx,
r.Client,
dynamoDeployment.Namespace,
checkpointIdentity,
podTemplate,
)
}
// buildCheckpointJobPodTemplate builds a pod template for the checkpoint job from service spec
......@@ -1603,6 +1572,7 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
GenericFunc: func(ge event.GenericEvent) bool { return false },
}),
)
}
// Wrap with metrics collection
observedReconciler := observability.NewObservedReconciler(r, consts.ResourceTypeDynamoGraphDeployment)
......
......@@ -23,11 +23,13 @@ import (
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
grovev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
"github.com/onsi/gomega"
autoscalingv1 "k8s.io/api/autoscaling/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
......@@ -342,6 +344,314 @@ func TestDynamoGraphDeploymentReconciler_reconcileScalingAdapters(t *testing.T)
}
}
// TestDynamoGraphDeploymentReconciler_createCheckpointCR_reusesExistingCapture
// pre-creates a DynamoCheckpoint whose status carries the identity hash and
// then calls createCheckpointCR for a service that would capture with a
// different image. The call must return the pre-existing checkpoint and must
// leave that checkpoint's job pod template (one container, original image)
// unchanged.
func TestDynamoGraphDeploymentReconciler_createCheckpointCR_reusesExistingCapture(t *testing.T) {
	if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil {
		t.Fatalf("Failed to add v1alpha1 to scheme: %v", err)
	}

	const existingName = "existing-worker-checkpoint"

	baseIdentity := v1alpha1.DynamoCheckpointIdentity{
		Model:            "meta-llama/Llama-2-7b-hf",
		BackendFramework: "vllm",
	}
	identityHash, err := checkpoint.ComputeIdentityHash(baseIdentity)
	if err != nil {
		t.Fatalf("Failed to compute checkpoint hash: %v", err)
	}

	// Checkpoint that is already present in the cluster before the call.
	originalTemplate := corev1.PodTemplateSpec{
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name:  "main",
				Image: "keep-existing:latest",
			}},
		},
	}
	preexisting := &v1alpha1.DynamoCheckpoint{
		ObjectMeta: metav1.ObjectMeta{
			Name:      existingName,
			Namespace: "default",
		},
		Spec: v1alpha1.DynamoCheckpointSpec{
			Identity: baseIdentity,
			Job: v1alpha1.DynamoCheckpointJobConfig{
				PodTemplateSpec: originalTemplate,
			},
		},
		Status: v1alpha1.DynamoCheckpointStatus{
			IdentityHash: identityHash,
		},
	}

	fakeClient := fake.NewClientBuilder().
		WithScheme(scheme.Scheme).
		WithObjects(preexisting).
		Build()
	reconciler := &DynamoGraphDeploymentReconciler{
		Client:   fakeClient,
		Config:   &configv1alpha1.OperatorConfiguration{},
		Recorder: record.NewFakeRecorder(10),
	}

	graph := &v1alpha1.DynamoGraphDeployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-dgd",
			Namespace: "default",
		},
	}

	// The service spells out parallelism of 1 and empty extra parameters on
	// top of the base identity; the test expects this to resolve to the
	// pre-existing checkpoint.
	serviceIdentity := &v1alpha1.DynamoCheckpointIdentity{
		Model:                baseIdentity.Model,
		BackendFramework:     baseIdentity.BackendFramework,
		TensorParallelSize:   1,
		PipelineParallelSize: 1,
		ExtraParameters:      map[string]string{},
	}
	service := &v1alpha1.DynamoComponentDeploymentSharedSpec{
		ComponentType: string(commonconsts.ComponentTypeWorker),
		Checkpoint: &v1alpha1.ServiceCheckpointConfig{
			Enabled:  true,
			Mode:     v1alpha1.CheckpointModeAuto,
			Identity: serviceIdentity,
		},
		ExtraPodSpec: &v1alpha1.ExtraPodSpec{
			MainContainer: &corev1.Container{
				Name:  "main",
				Image: "new-writer:latest",
			},
		},
	}

	ctx := context.Background()
	got, err := reconciler.createCheckpointCR(ctx, graph, "worker", service)
	if err != nil {
		t.Fatalf("createCheckpointCR() error = %v", err)
	}
	if got.Name != existingName {
		t.Fatalf("createCheckpointCR() returned checkpoint %s, want existing-worker-checkpoint", got.Name)
	}

	// Re-read the stored object to prove the capture template was not overwritten.
	var stored v1alpha1.DynamoCheckpoint
	key := types.NamespacedName{Name: existingName, Namespace: "default"}
	err = reconciler.Get(ctx, key, &stored)
	if err != nil {
		t.Fatalf("Failed to get checkpoint: %v", err)
	}

	jobContainers := stored.Spec.Job.PodTemplateSpec.Spec.Containers
	if len(jobContainers) != 1 {
		t.Fatalf("expected one job container, got %d", len(jobContainers))
	}
	if jobContainers[0].Image != "keep-existing:latest" {
		t.Fatalf("existing job image was mutated to %s", jobContainers[0].Image)
	}
}
// TestDynamoGraphDeploymentReconciler_reconcileCheckpoints_checkpointRefSkipsAutoCreateWhileReferencedCRIsNotReady
// configures a worker service in Auto mode with an explicit checkpointRef that
// points at a checkpoint still in the Creating phase. reconcileCheckpoints must
// report that checkpoint as existing but not ready, surface its hash and name,
// and must not create any additional DynamoCheckpoint in the namespace.
func TestDynamoGraphDeploymentReconciler_reconcileCheckpoints_checkpointRefSkipsAutoCreateWhileReferencedCRIsNotReady(t *testing.T) {
	if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil {
		t.Fatalf("Failed to add v1alpha1 to scheme: %v", err)
	}

	ckptIdentity := v1alpha1.DynamoCheckpointIdentity{
		Model:            "meta-llama/Llama-2-7b-hf",
		BackendFramework: "vllm",
	}
	wantHash, err := checkpoint.ComputeIdentityHash(ckptIdentity)
	if err != nil {
		t.Fatalf("Failed to compute checkpoint hash: %v", err)
	}

	// The checkpoint the service refers to by name; its capture is still running.
	jobTemplate := corev1.PodTemplateSpec{
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name:  "main",
				Image: "keep-existing:latest",
			}},
		},
	}
	target := &v1alpha1.DynamoCheckpoint{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "friendly-checkpoint",
			Namespace: "default",
		},
		Spec: v1alpha1.DynamoCheckpointSpec{
			Identity: ckptIdentity,
			Job: v1alpha1.DynamoCheckpointJobConfig{
				PodTemplateSpec: jobTemplate,
			},
		},
		Status: v1alpha1.DynamoCheckpointStatus{
			Phase:        v1alpha1.DynamoCheckpointPhaseCreating,
			IdentityHash: wantHash,
		},
	}

	fakeClient := fake.NewClientBuilder().
		WithScheme(scheme.Scheme).
		WithObjects(target).
		WithStatusSubresource(target).
		Build()
	reconciler := &DynamoGraphDeploymentReconciler{
		Client:   fakeClient,
		Config:   &configv1alpha1.OperatorConfiguration{},
		Recorder: record.NewFakeRecorder(10),
	}

	// Needs to be a variable so its address can be stored in CheckpointRef.
	checkpointRef := friendlyCheckpointName
	workerSpec := &v1alpha1.DynamoComponentDeploymentSharedSpec{
		ComponentType: string(commonconsts.ComponentTypeWorker),
		Checkpoint: &v1alpha1.ServiceCheckpointConfig{
			Enabled:       true,
			Mode:          v1alpha1.CheckpointModeAuto,
			CheckpointRef: &checkpointRef,
		},
	}
	graph := &v1alpha1.DynamoGraphDeployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-dgd",
			Namespace: "default",
		},
		Spec: v1alpha1.DynamoGraphDeploymentSpec{
			Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
				"worker": workerSpec,
			},
		},
	}

	ctx := context.Background()
	statusByService, infoByService, err := reconciler.reconcileCheckpoints(ctx, graph)
	if err != nil {
		t.Fatalf("reconcileCheckpoints() error = %v", err)
	}

	// Cases are evaluated top to bottom, so workerInfo is only dereferenced
	// after its presence has been confirmed.
	workerInfo, found := infoByService["worker"]
	switch {
	case !found:
		t.Fatal("expected checkpoint info for worker service")
	case workerInfo.Ready:
		t.Fatal("expected referenced checkpoint to remain not ready")
	case !workerInfo.Exists:
		t.Fatal("expected referenced checkpoint to exist")
	case workerInfo.Hash != wantHash:
		t.Fatalf("checkpoint hash = %s, want %s", workerInfo.Hash, wantHash)
	}

	if gotName := statusByService["worker"].CheckpointName; gotName != "friendly-checkpoint" {
		t.Fatalf("checkpoint status name = %s, want friendly-checkpoint", gotName)
	}

	// No second checkpoint may have been auto-created next to the referenced one.
	var listed v1alpha1.DynamoCheckpointList
	err = reconciler.List(ctx, &listed, client.InNamespace("default"))
	if err != nil {
		t.Fatalf("failed to list checkpoints: %v", err)
	}
	if count := len(listed.Items); count != 1 {
		t.Fatalf("expected only the referenced checkpoint to exist, found %d", count)
	}
	if only := listed.Items[0]; only.Name != "friendly-checkpoint" {
		t.Fatalf("unexpected checkpoint %s", only.Name)
	}
}
// TestDynamoGraphDeploymentReconciler_reconcileCheckpoints_autoModeWaitsForExistingCreatingCheckpoint
// configures a worker service in Auto mode with an identity (no checkpointRef)
// while a checkpoint carrying that identity's hash already exists in the
// Creating phase. reconcileCheckpoints must detect it, report it as not ready
// under its existing name, and leave its job pod template unmodified even
// though the service would capture with a different image.
func TestDynamoGraphDeploymentReconciler_reconcileCheckpoints_autoModeWaitsForExistingCreatingCheckpoint(t *testing.T) {
	if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil {
		t.Fatalf("Failed to add v1alpha1 to scheme: %v", err)
	}

	const inFlightName = "existing-worker-checkpoint"

	ckptIdentity := v1alpha1.DynamoCheckpointIdentity{
		Model:            "meta-llama/Llama-2-7b-hf",
		BackendFramework: "vllm",
	}
	wantHash, err := checkpoint.ComputeIdentityHash(ckptIdentity)
	if err != nil {
		t.Fatalf("Failed to compute checkpoint hash: %v", err)
	}

	// Checkpoint whose capture is already in progress.
	inFlight := &v1alpha1.DynamoCheckpoint{
		ObjectMeta: metav1.ObjectMeta{
			Name:      inFlightName,
			Namespace: "default",
		},
		Spec: v1alpha1.DynamoCheckpointSpec{
			Identity: ckptIdentity,
			Job: v1alpha1.DynamoCheckpointJobConfig{
				PodTemplateSpec: corev1.PodTemplateSpec{
					Spec: corev1.PodSpec{
						Containers: []corev1.Container{{
							Name:  "main",
							Image: "keep-existing:latest",
						}},
					},
				},
			},
		},
		Status: v1alpha1.DynamoCheckpointStatus{
			Phase:        v1alpha1.DynamoCheckpointPhaseCreating,
			IdentityHash: wantHash,
		},
	}

	fakeClient := fake.NewClientBuilder().
		WithScheme(scheme.Scheme).
		WithObjects(inFlight).
		WithStatusSubresource(inFlight).
		Build()
	reconciler := &DynamoGraphDeploymentReconciler{
		Client:   fakeClient,
		Config:   &configv1alpha1.OperatorConfiguration{},
		Recorder: record.NewFakeRecorder(10),
	}

	workerSpec := &v1alpha1.DynamoComponentDeploymentSharedSpec{
		ComponentType: string(commonconsts.ComponentTypeWorker),
		Checkpoint: &v1alpha1.ServiceCheckpointConfig{
			Enabled: true,
			Mode:    v1alpha1.CheckpointModeAuto,
			Identity: &v1alpha1.DynamoCheckpointIdentity{
				Model:            ckptIdentity.Model,
				BackendFramework: ckptIdentity.BackendFramework,
			},
		},
		ExtraPodSpec: &v1alpha1.ExtraPodSpec{
			MainContainer: &corev1.Container{
				Name:  "main",
				Image: "new-writer:latest",
			},
		},
	}
	graph := &v1alpha1.DynamoGraphDeployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-dgd",
			Namespace: "default",
		},
		Spec: v1alpha1.DynamoGraphDeploymentSpec{
			Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
				"worker": workerSpec,
			},
		},
	}

	ctx := context.Background()
	statusByService, infoByService, err := reconciler.reconcileCheckpoints(ctx, graph)
	if err != nil {
		t.Fatalf("reconcileCheckpoints() error = %v", err)
	}

	workerInfo, found := infoByService["worker"]
	if !found {
		t.Fatal("expected checkpoint info for worker service")
	}
	if workerInfo.Ready {
		t.Fatal("expected existing checkpoint to remain not ready")
	}
	if !workerInfo.Exists {
		t.Fatal("expected existing checkpoint to be detected")
	}
	if workerInfo.Hash != wantHash {
		t.Fatalf("checkpoint hash = %s, want %s", workerInfo.Hash, wantHash)
	}
	if gotName := statusByService["worker"].CheckpointName; gotName != inFlightName {
		t.Fatalf("checkpoint status name = %s, want existing-worker-checkpoint", gotName)
	}

	// Re-read the stored object to confirm the in-flight capture was not overwritten.
	var stored v1alpha1.DynamoCheckpoint
	key := types.NamespacedName{Name: inFlightName, Namespace: "default"}
	err = reconciler.Get(ctx, key, &stored)
	if err != nil {
		t.Fatalf("Failed to get checkpoint: %v", err)
	}

	jobContainers := stored.Spec.Job.PodTemplateSpec.Spec.Containers
	if len(jobContainers) != 1 {
		t.Fatalf("expected one job container, got %d", len(jobContainers))
	}
	if jobContainers[0].Image != "keep-existing:latest" {
		t.Fatalf("existing job image was mutated to %s", jobContainers[0].Image)
	}
}
// mockScaleClient implements scale.ScalesGetter for testing
type mockScaleClient struct{}
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment