Unverified Commit 43e810a4 authored by Schwinn Saereesitthipitak's avatar Schwinn Saereesitthipitak Committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: default avatarSchwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
......@@ -21,7 +21,8 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// Checkpoint storage type constants retained for compatibility with older
// operator configuration files.
const (
CheckpointStorageTypePVC = "pvc"
CheckpointStorageTypeS3 = "s3"
......@@ -245,46 +246,49 @@ type CheckpointConfiguration struct {
// ReadyForCheckpointFilePath signals model readiness for checkpoint jobs
// +kubebuilder:default="/tmp/ready-for-checkpoint"
ReadyForCheckpointFilePath string `json:"readyForCheckpointFilePath"`
// Storage holds storage backend configuration
// Deprecated: Storage is retained for compatibility and ignored by the
// current snapshot flow. Snapshot storage is discovered from the
// snapshot-agent DaemonSet instead.
Storage CheckpointStorageConfiguration `json:"storage"`
}
// CheckpointStorageConfiguration holds storage backend configuration for checkpoints.
//
// Deprecated: CheckpointStorageConfiguration is retained for compatibility and
// ignored by the current snapshot flow.
type CheckpointStorageConfiguration struct {
// Type is the legacy storage backend type: pvc, s3, or oci.
// +kubebuilder:default="pvc"
Type string `json:"type"`
// PVC configuration for legacy pvc-based settings (formerly used when Type=pvc).
PVC CheckpointPVCConfig `json:"pvc"`
// S3 configuration for legacy s3-based settings (formerly used when Type=s3).
S3 CheckpointS3Config `json:"s3"`
// OCI configuration for legacy oci-based settings (formerly used when Type=oci).
OCI CheckpointOCIConfig `json:"oci"`
}
// CheckpointPVCConfig holds PVC storage configuration.
//
// Deprecated: CheckpointPVCConfig is retained for compatibility and ignored by
// the current snapshot flow.
type CheckpointPVCConfig struct {
// PVCName is the legacy PVC name.
// +kubebuilder:default="snapshot-pvc"
PVCName string `json:"pvcName"`
// BasePath is the legacy base directory within the PVC.
// +kubebuilder:default="/checkpoints"
BasePath string `json:"basePath"`
}
// CheckpointS3Config holds S3 storage configuration.
//
// Deprecated: CheckpointS3Config is retained for compatibility and ignored by
// the current snapshot flow.
type CheckpointS3Config struct {
// URI is the legacy S3 URI (s3://[endpoint/]bucket/prefix).
URI string `json:"uri"`
// CredentialsSecretRef is the legacy credentials secret name.
CredentialsSecretRef string `json:"credentialsSecretRef"`
}
// CheckpointOCIConfig holds OCI registry storage configuration.
//
// Deprecated: CheckpointOCIConfig is retained for compatibility and ignored by
// the current snapshot flow.
type CheckpointOCIConfig struct {
// URI is the legacy OCI URI (oci://registry/repository).
URI string `json:"uri"`
// CredentialsSecretRef is the legacy docker config secret name.
CredentialsSecretRef string `json:"credentialsSecretRef"`
}
......
......@@ -37,7 +37,6 @@ func ValidateOperatorConfiguration(config *configv1alpha1.OperatorConfiguration)
allErrs = append(allErrs, validateMPI(&config.MPI, field.NewPath("mpi"))...)
allErrs = append(allErrs, validateInfrastructure(&config.Infrastructure, field.NewPath("infrastructure"))...)
allErrs = append(allErrs, validateDiscovery(&config.Discovery, field.NewPath("discovery"))...)
allErrs = append(allErrs, validateCheckpoint(&config.Checkpoint, field.NewPath("checkpoint"))...)
allErrs = append(allErrs, validateRBAC(config)...)
allErrs = append(allErrs, validateOrchestrators(&config.Orchestrators, field.NewPath("orchestrators"))...)
allErrs = append(allErrs, validateIngress(&config.Ingress, field.NewPath("ingress"))...)
......@@ -127,33 +126,6 @@ func validateDiscovery(discovery *configv1alpha1.DiscoveryConfiguration, fldPath
return allErrs
}
// validateCheckpoint checks the storage section of a checkpoint configuration.
//
// Nothing is validated while checkpointing is disabled. When it is enabled,
// the storage type must be one of pvc, s3, or oci; s3 and oci additionally
// require their URI to be set, while pvc has no required fields. Errors are
// reported against paths rooted at fldPath.
func validateCheckpoint(checkpoint *configv1alpha1.CheckpointConfiguration, fldPath *field.Path) field.ErrorList {
	errs := field.ErrorList{}
	if !checkpoint.Enabled {
		return errs
	}

	storageFld := fldPath.Child("storage")
	kind := checkpoint.Storage.Type

	// PVC is the default backend and needs no further settings.
	if kind == configv1alpha1.CheckpointStorageTypePVC {
		return errs
	}

	if kind == configv1alpha1.CheckpointStorageTypeS3 {
		if checkpoint.Storage.S3.URI == "" {
			errs = append(errs, field.Required(storageFld.Child("s3", "uri"), "S3 URI is required when storage type is s3"))
		}
		return errs
	}

	if kind == configv1alpha1.CheckpointStorageTypeOCI {
		if checkpoint.Storage.OCI.URI == "" {
			errs = append(errs, field.Required(storageFld.Child("oci", "uri"), "OCI URI is required when storage type is oci"))
		}
		return errs
	}

	// Anything else is an unknown backend.
	supported := []string{
		configv1alpha1.CheckpointStorageTypePVC,
		configv1alpha1.CheckpointStorageTypeS3,
		configv1alpha1.CheckpointStorageTypeOCI,
	}
	errs = append(errs, field.NotSupported(storageFld.Child("type"), kind, supported))
	return errs
}
// validateRBAC is mode-aware: validates RBAC fields based on namespace mode.
func validateRBAC(config *configv1alpha1.OperatorConfiguration) field.ErrorList {
allErrs := field.ErrorList{}
......
......@@ -18,6 +18,7 @@
package validation
import (
"encoding/json"
"testing"
"time"
......@@ -31,7 +32,10 @@ func validConfig() *configv1alpha1.OperatorConfiguration {
configv1alpha1.SetDefaultsOperatorConfiguration(cfg)
cfg.MPI.SSHSecretName = "mpi-ssh"
cfg.MPI.SSHSecretNamespace = "default"
// Cluster-wide validation requires chart-provided RBAC names.
cfg.RBAC.PlannerClusterRoleName = "planner-role"
cfg.RBAC.DGDRProfilingClusterRoleName = "dgdr-profiling-role"
cfg.RBAC.EPPClusterRoleName = "epp-role"
return cfg
}
......@@ -41,6 +45,8 @@ func validNamespaceScopedConfig() *configv1alpha1.OperatorConfiguration {
cfg.Namespace.Restricted = "my-namespace"
// RBAC not required in namespace mode
cfg.RBAC.PlannerClusterRoleName = ""
cfg.RBAC.DGDRProfilingClusterRoleName = ""
cfg.RBAC.EPPClusterRoleName = ""
return cfg
}
......@@ -120,45 +126,42 @@ func TestValidateOperatorConfiguration_NamespaceScopedLeaseRenewExceedsDuration(
}
}
func TestValidateOperatorConfiguration_CheckpointS3MissingURI(t *testing.T) {
func TestValidateOperatorConfiguration_CheckpointEnabledRequiresNoStorageConfig(t *testing.T) {
cfg := validConfig()
cfg.Checkpoint.Enabled = true
cfg.Checkpoint.Storage.Type = configv1alpha1.CheckpointStorageTypeS3
cfg.Checkpoint.Storage.S3.URI = ""
errs := ValidateOperatorConfiguration(cfg)
if len(errs) != 1 {
t.Errorf("expected 1 error for missing S3 URI, got %d: %v", len(errs), errs)
if len(errs) != 0 {
t.Errorf("expected no errors for checkpoint config without storage settings, got: %v", errs)
}
}
func TestValidateOperatorConfiguration_CheckpointOCIMissingURI(t *testing.T) {
func TestValidateOperatorConfiguration_CheckpointDeprecatedStorageConfigIsAccepted(t *testing.T) {
cfg := validConfig()
cfg.Checkpoint.Enabled = true
cfg.Checkpoint.Storage.Type = configv1alpha1.CheckpointStorageTypeOCI
cfg.Checkpoint.Storage.OCI.URI = ""
errs := ValidateOperatorConfiguration(cfg)
if len(errs) != 1 {
t.Errorf("expected 1 error for missing OCI URI, got %d: %v", len(errs), errs)
rawConfig := []byte(`{
"checkpoint": {
"enabled": true,
"storage": {
"type": "s3",
"s3": {
"uri": "s3://legacy-bucket/checkpoints"
}
}
}
}`)
if err := json.Unmarshal(rawConfig, cfg); err != nil {
t.Fatalf("failed to unmarshal compatibility config: %v", err)
}
}
func TestValidateOperatorConfiguration_CheckpointInvalidStorageType(t *testing.T) {
cfg := validConfig()
cfg.Checkpoint.Enabled = true
cfg.Checkpoint.Storage.Type = "nfs"
errs := ValidateOperatorConfiguration(cfg)
if len(errs) != 1 {
t.Errorf("expected 1 error for invalid storage type, got %d: %v", len(errs), errs)
if len(errs) != 0 {
t.Errorf("expected no errors for deprecated checkpoint storage config, got: %v", errs)
}
}
func TestValidateOperatorConfiguration_CheckpointDisabledSkipsValidation(t *testing.T) {
cfg := validConfig()
cfg.Checkpoint.Enabled = false
cfg.Checkpoint.Storage.Type = "invalid"
errs := ValidateOperatorConfiguration(cfg)
if len(errs) != 0 {
......
......@@ -31,13 +31,15 @@ const (
DynamoCheckpointPhasePending DynamoCheckpointPhase = "Pending"
// DynamoCheckpointPhaseCreating indicates the checkpoint Job is running
DynamoCheckpointPhaseCreating DynamoCheckpointPhase = "Creating"
// DynamoCheckpointPhaseReady indicates the checkpoint tar file is available on the PVC
// DynamoCheckpointPhaseReady indicates the checkpoint artifact is available
DynamoCheckpointPhaseReady DynamoCheckpointPhase = "Ready"
// DynamoCheckpointPhaseFailed indicates the checkpoint creation failed
DynamoCheckpointPhaseFailed DynamoCheckpointPhase = "Failed"
)
// DynamoCheckpointStorageType defines the supported storage backends for checkpoints.
//
// Deprecated: DynamoCheckpointStorageType is retained for compatibility with older
// DynamoCheckpoint status consumers. The current checkpoint flow publishes
// PVC-backed artifacts discovered from the snapshot-agent DaemonSet.
// +kubebuilder:validation:Enum=pvc;s3;oci
type DynamoCheckpointStorageType string
......@@ -109,10 +111,10 @@ type DynamoCheckpointJobConfig struct {
// +kubebuilder:validation:Minimum=0
BackoffLimit *int32 `json:"backoffLimit,omitempty"`
// TTLSecondsAfterFinished specifies how long to keep the Job after completion
// Deprecated: TTLSecondsAfterFinished is ignored. Checkpoint Jobs use a fixed
// 300 second TTL.
// +optional
// +kubebuilder:validation:Minimum=0
// +kubebuilder:default=300
TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"`
}
......@@ -148,14 +150,13 @@ type DynamoCheckpointStatus struct {
// +optional
IdentityHash string `json:"identityHash,omitempty"`
// Location is the full URI/path to the checkpoint in the storage backend
// For PVC: same as TarPath (e.g., /checkpoints/{hash}.tar)
// For S3: s3://bucket/prefix/{hash}.tar
// For OCI: oci://registry/repo:{hash}
// Deprecated: Location is ignored and no longer populated. It is retained
// only so older objects continue to validate.
// +optional
Location string `json:"location,omitempty"`
// StorageType indicates the storage backend type used for this checkpoint
// Deprecated: StorageType is ignored and no longer populated. It is retained
// only so older objects continue to validate.
// +optional
StorageType DynamoCheckpointStorageType `json:"storageType,omitempty"`
......@@ -163,7 +164,7 @@ type DynamoCheckpointStatus struct {
// +optional
JobName string `json:"jobName,omitempty"`
// CreatedAt is the timestamp when the checkpoint tar was created
// CreatedAt is the timestamp when the checkpoint became ready
// +optional
CreatedAt *metav1.Time `json:"createdAt,omitempty"`
......
......@@ -8173,8 +8173,9 @@ spec:
- message: sharedMemory.size must not be set when sharedMemory.disabled is true
rule: '!(has(self.disabled) && self.disabled && has(self.size))'
ttlSecondsAfterFinished:
default: 300
description: TTLSecondsAfterFinished specifies how long to keep the Job after completion
description: |-
Deprecated: TTLSecondsAfterFinished is ignored. Checkpoint Jobs use a fixed
300 second TTL.
format: int32
minimum: 0
type: integer
......@@ -8245,7 +8246,7 @@ spec:
type: object
type: array
createdAt:
description: CreatedAt is the timestamp when the checkpoint tar was created
description: CreatedAt is the timestamp when the checkpoint became ready
format: date-time
type: string
identityHash:
......@@ -8258,10 +8259,8 @@ spec:
type: string
location:
description: |-
Location is the full URI/path to the checkpoint in the storage backend
For PVC: same as TarPath (e.g., /checkpoints/{hash}.tar)
For S3: s3://bucket/prefix/{hash}.tar
For OCI: oci://registry/repo:{hash}
Deprecated: Location is ignored and no longer populated. It is retained
only so older objects continue to validate.
type: string
message:
description: Message provides additional information about the current state
......@@ -8275,7 +8274,9 @@ spec:
- Failed
type: string
storageType:
description: StorageType indicates the storage backend type used for this checkpoint
description: |-
Deprecated: StorageType is ignored and no longer populated. It is retained
only so older objects continue to validate.
enum:
- pvc
- s3
......
......@@ -38,6 +38,14 @@ rules:
- get
- list
- watch
- apiGroups:
- apps
resources:
- daemonsets
verbs:
- get
- list
- watch
- apiGroups:
- apps
resources:
......
......@@ -16,46 +16,60 @@
apiVersion: nvidia.com/v1alpha1
kind: DynamoCheckpoint
metadata:
name: llama3-8b-bf16
name: qwen3-06b-bf16
spec:
# Identity - determines the checkpoint hash
identity:
model: "meta-llama/Meta-Llama-3-8B-Instruct"
model: "Qwen/Qwen3-0.6B"
backendFramework: "vllm"
dynamoVersion: "0.6.0"
tensorParallelSize: 1
pipelineParallelSize: 1
dtype: "bfloat16"
maxModelLen: 8192
extraParameters:
enableChunkedPrefill: "true"
maxModelLen: 2048
# Job configuration for checkpoint creation
job:
activeDeadlineSeconds: 3600
ttlSecondsAfterFinished: 300
podTemplateSpec:
spec:
restartPolicy: Never
imagePullSecrets:
- name: ngc-secret
volumes:
- name: hf-cache
persistentVolumeClaim:
claimName: hf-cache-pvc
containers:
- name: checkpoint-worker
image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest
command: ["python", "-m", "vllm.entrypoints.openai.api_server"]
- name: worker
image: registry.example.com/dynamo/vllm-placeholder:1.0.0
command:
- python3
args:
- "-m"
- "dynamo.vllm"
- "--model"
- "meta-llama/Meta-Llama-3-8B-Instruct"
- "--tensor-parallel-size"
- "1"
- "Qwen/Qwen3-0.6B"
- "--dtype"
- "bfloat16"
- "--tensor-parallel-size"
- "1"
- "--max-model-len"
- "8192"
- "2048"
envFrom:
- secretRef:
name: hf-token-secret
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-secret
key: token
- name: HF_HOME
value: /home/dynamo/.cache/huggingface
- name: NCCL_DEBUG
value: ERROR
- name: TORCH_CPP_LOG_LEVEL
value: ERROR
- name: TORCH_DISTRIBUTED_DEBUG
value: "OFF"
volumeMounts:
- name: hf-cache
mountPath: /home/dynamo/.cache/huggingface
resources:
limits:
nvidia.com/gpu: 1
restartPolicy: Never
nvidia.com/gpu: "1"
......@@ -5,6 +5,7 @@ go 1.25.0
require (
emperror.dev/errors v0.8.1
github.com/Masterminds/semver/v3 v3.4.0
github.com/ai-dynamo/dynamo/deploy/snapshot v0.0.0
github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.6
github.com/bsm/gomega v1.27.10
github.com/go-logr/logr v1.4.3
......@@ -15,6 +16,8 @@ require (
github.com/open-policy-agent/cert-controller v0.14.0
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2
github.com/prometheus/client_golang v1.23.2
github.com/prometheus/client_model v0.6.2
github.com/prometheus/common v0.67.5
github.com/stretchr/testify v1.11.1
golang.org/x/crypto v0.48.0
istio.io/api v1.23.1
......@@ -31,6 +34,8 @@ require (
volcano.sh/apis v1.12.2
)
replace github.com/ai-dynamo/dynamo/deploy/snapshot => ../snapshot
require (
cel.dev/expr v0.25.1 // indirect
github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
......@@ -68,8 +73,6 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.5 // indirect
github.com/prometheus/procfs v0.17.0 // indirect
github.com/spf13/cobra v1.9.1 // indirect
github.com/spf13/pflag v1.0.10 // indirect
......
cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY=
cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw=
cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4=
cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4=
emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
......@@ -258,12 +256,8 @@ gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 h1:fCvbg86sFXwdrl5LgVcTEvNC+2txB5mgROGmRL5mrls=
google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:+rXWjjaukWZun3mLfjmVnQi18E1AsFbDN9QdJ5YXLto=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 h1:Wgl1rcDNThT+Zn47YyCXOXyX/COgMTIdhJ717F0l4xk=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A=
google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c=
google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE=
google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
......
......@@ -21,11 +21,12 @@ import (
"context"
"testing"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
......@@ -39,19 +40,6 @@ const (
testNamespace = "default"
)
// testPVCConfig returns an enabled checkpoint configuration that uses the pvc
// storage type with claim "snapshot-pvc" and base path "/checkpoints".
func testPVCConfig() *configv1alpha1.CheckpointConfiguration {
	cfg := &configv1alpha1.CheckpointConfiguration{Enabled: true}
	cfg.Storage.Type = configv1alpha1.CheckpointStorageTypePVC
	cfg.Storage.PVC.PVCName = "snapshot-pvc"
	cfg.Storage.PVC.BasePath = "/checkpoints"
	return cfg
}
func testIdentity() nvidiacomv1alpha1.DynamoCheckpointIdentity {
return nvidiacomv1alpha1.DynamoCheckpointIdentity{
Model: "meta-llama/Llama-2-7b-hf",
......@@ -74,6 +62,7 @@ func testScheme() *runtime.Scheme {
s := runtime.NewScheme()
_ = nvidiacomv1alpha1.AddToScheme(s)
_ = corev1.AddToScheme(s)
_ = appsv1.AddToScheme(s)
return s
}
......@@ -81,6 +70,39 @@ func testInfo() *CheckpointInfo {
return &CheckpointInfo{Enabled: true, Hash: testHash}
}
// testSnapshotAgentDaemonSet builds a DaemonSet fixture named "snapshot-agent"
// in testNamespace. It carries the snapshot-agent label, and its agent
// container mounts the "checkpoints" volume at /checkpoints, with that volume
// backed by the "snapshot-pvc" claim.
func testSnapshotAgentDaemonSet() *appsv1.DaemonSet {
	const volumeName = "checkpoints"

	agent := corev1.Container{
		Name: snapshotprotocol.SnapshotAgentContainerName,
		VolumeMounts: []corev1.VolumeMount{
			{Name: volumeName, MountPath: "/checkpoints"},
		},
	}

	checkpoints := corev1.Volume{Name: volumeName}
	checkpoints.PersistentVolumeClaim = &corev1.PersistentVolumeClaimVolumeSource{
		ClaimName: "snapshot-pvc",
	}

	ds := &appsv1.DaemonSet{}
	ds.Name = "snapshot-agent"
	ds.Namespace = testNamespace
	ds.Labels = map[string]string{
		snapshotprotocol.SnapshotAgentLabelKey: snapshotprotocol.SnapshotAgentLabelValue,
	}
	ds.Spec.Template.Spec.Containers = []corev1.Container{agent}
	ds.Spec.Template.Spec.Volumes = []corev1.Volume{checkpoints}
	return ds
}
type createHookClient struct {
client.Client
onCreate func(ctx context.Context, obj client.Object) error
......@@ -97,71 +119,6 @@ func (c *createHookClient) Create(ctx context.Context, obj client.Object, opts .
return c.Client.Create(ctx, obj, opts...)
}
// --- Resource helper tests ---
// TestHelpers exercises checkpointInfoFromObject against a DynamoCheckpoint
// whose name and identity hash are the computed hash of testIdentity: first in
// the Ready phase, then in the Creating phase.
func TestHelpers(t *testing.T) {
	hash, err := ComputeIdentityHash(testIdentity())
	require.NoError(t, err)

	wantLocation := "/checkpoints/" + hash

	checkpoint := &nvidiacomv1alpha1.DynamoCheckpoint{}
	checkpoint.Name = hash
	checkpoint.Spec.Identity = testIdentity()
	checkpoint.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseReady
	checkpoint.Status.IdentityHash = hash
	checkpoint.Status.Location = wantLocation
	checkpoint.Status.StorageType = "pvc"

	// Ready phase: the info is enabled, ready, and mirrors the status fields.
	got, err := checkpointInfoFromObject(checkpoint)
	require.NoError(t, err)
	assert.True(t, got.Enabled)
	assert.True(t, got.Ready)
	assert.Equal(t, hash, got.Hash)
	assert.Equal(t, wantLocation, got.Location)
	assert.Equal(t, checkpoint.Name, got.CheckpointName)

	// Creating phase: the same object must no longer be reported as ready.
	checkpoint.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseCreating
	got, err = checkpointInfoFromObject(checkpoint)
	require.NoError(t, err)
	assert.False(t, got.Ready)
}
// TestArtifactVersionHelpers is meant to cover the default and
// annotation-overridden checkpoint artifact versions.
//
// NOTE(review): no function from the package under test is called here. Each
// assertion either compares an expression with itself or reads back a value
// the test just wrote, so the test cannot fail. Confirm which helper it was
// meant to exercise, or remove it.
func TestArtifactVersionHelpers(t *testing.T) {
t.Run("new checkpoints default to version 1", func(t *testing.T) {
ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{}
// A zero-value object has no annotations map.
assert.Nil(t, ckpt.Annotations)
// NOTE(review): expected and actual are the same expression; this always passes.
assert.Equal(t, "checkpoint-job-"+testHash+"-"+consts.DefaultCheckpointArtifactVersion, "checkpoint-job-"+testHash+"-"+consts.DefaultCheckpointArtifactVersion)
})
t.Run("annotation overrides desired version", func(t *testing.T) {
ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
consts.KubeAnnotationCheckpointArtifactVersion: "3",
},
},
}
// NOTE(review): reads back the annotation set in the literal above.
assert.Equal(t, "3", ckpt.Annotations[consts.KubeAnnotationCheckpointArtifactVersion])
// NOTE(review): the right-hand side concatenates that same "3", so both sides are equal by construction.
assert.Equal(t, "checkpoint-job-"+testHash+"-3", "checkpoint-job-"+testHash+"-"+ckpt.Annotations[consts.KubeAnnotationCheckpointArtifactVersion])
})
}
// TestResolveCheckpointStorage checks the location and storage type returned
// by ResolveCheckpointStorage for the PVC test configuration: an empty version
// resolves to the default artifact version, and an explicit version is used
// as given.
func TestResolveCheckpointStorage(t *testing.T) {
	cfg := testPVCConfig()

	cases := []struct {
		version     string
		wantVersion string
	}{
		{version: "", wantVersion: consts.DefaultCheckpointArtifactVersion},
		{version: "7", wantVersion: "7"},
	}

	for _, tc := range cases {
		gotLocation, gotType, err := ResolveCheckpointStorage(testHash, tc.version, cfg)
		require.NoError(t, err)
		assert.Equal(t, "/checkpoints/"+testHash+"/versions/"+tc.wantVersion, gotLocation)
		assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointStorageType("pvc"), gotType)
	}
}
func TestCreateOrGetAutoCheckpointDeduplicatesConcurrentSameHashCheckpoint(t *testing.T) {
ctx := context.Background()
s := testScheme()
......@@ -175,7 +132,7 @@ func TestCreateOrGetAutoCheckpointDeduplicatesConcurrentSameHashCheckpoint(t *te
Name: "friendly-checkpoint",
Namespace: testNamespace,
Labels: map[string]string{
consts.KubeLabelCheckpointHash: hash,
consts.KubeLabelCheckpointID: hash,
},
},
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{
......@@ -223,184 +180,59 @@ func TestCreateOrGetAutoCheckpointSetsDefaultArtifactVersion(t *testing.T) {
assert.Equal(t, consts.DefaultCheckpointArtifactVersion, ckpt.Annotations[consts.KubeAnnotationCheckpointArtifactVersion])
}
// --- Injection idempotency tests ---
// TestInjectionIdempotency verifies that the volume and volume-mount injectors
// add nothing when the checkpoint and pod-info entries already exist.
func TestInjectionIdempotency(t *testing.T) {
	// Pod spec that already has both volumes.
	existingVolumes := []corev1.Volume{
		{Name: consts.CheckpointVolumeName},
		{Name: consts.PodInfoVolumeName},
	}
	spec := &corev1.PodSpec{Volumes: existingVolumes}
	InjectCheckpointVolume(spec, "snapshot-pvc")
	InjectPodInfoVolume(spec)
	assert.Len(t, spec.Volumes, 2)

	// Container that already has both mounts.
	existingMounts := []corev1.VolumeMount{
		{Name: consts.CheckpointVolumeName},
		{Name: consts.PodInfoVolumeName},
	}
	ctr := &corev1.Container{VolumeMounts: existingMounts}
	InjectCheckpointVolumeMount(ctr, "/checkpoints")
	InjectPodInfoVolumeMount(ctr)
	assert.Len(t, ctr.VolumeMounts, 2)
}
func TestApplyCheckpointPodMetadata(t *testing.T) {
t.Run("checkpoint source metadata uses annotations for location and storage", func(t *testing.T) {
labels := map[string]string{}
annotations := map[string]string{}
ApplyCheckpointSourcePodMetadata(labels, annotations, testHash, "/checkpoints/"+testHash, "pvc")
assert.Equal(t, consts.KubeLabelValueTrue, labels[consts.KubeLabelIsCheckpointSource])
assert.Equal(t, testHash, labels[consts.KubeLabelCheckpointHash])
assert.Equal(t, "/checkpoints/"+testHash, annotations[consts.KubeAnnotationCheckpointLocation])
assert.Equal(t, "pvc", annotations[consts.KubeAnnotationCheckpointStorageType])
})
t.Run("restore metadata clears stale values when checkpoint is not ready", func(t *testing.T) {
labels := map[string]string{
consts.KubeLabelIsRestoreTarget: consts.KubeLabelValueTrue,
consts.KubeLabelCheckpointHash: "stale-hash",
}
annotations := map[string]string{
consts.KubeAnnotationCheckpointLocation: "/checkpoints/stale-hash",
consts.KubeAnnotationCheckpointStorageType: "pvc",
}
ApplyRestorePodMetadata(labels, annotations, &CheckpointInfo{Enabled: true, Ready: false})
_, hasRestoreTarget := labels[consts.KubeLabelIsRestoreTarget]
_, hasCheckpointHash := labels[consts.KubeLabelCheckpointHash]
_, hasLocation := annotations[consts.KubeAnnotationCheckpointLocation]
_, hasStorageType := annotations[consts.KubeAnnotationCheckpointStorageType]
assert.False(t, hasRestoreTarget)
assert.False(t, hasCheckpointHash)
assert.False(t, hasLocation)
assert.False(t, hasStorageType)
})
}
// --- InjectCheckpointIntoPodSpec tests ---
func TestInjectCheckpointIntoPodSpec(t *testing.T) {
t.Run("nil or disabled info is a no-op", func(t *testing.T) {
for _, info := range []*CheckpointInfo{nil, {Enabled: false}} {
podSpec := testPodSpec()
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, testPVCConfig()))
assert.Equal(t, []string{"python3"}, podSpec.Containers[0].Command)
}
})
t.Run("ready checkpoint overrides command to sleep infinity", func(t *testing.T) {
t.Run("ready checkpoint injects podinfo and overrides command", func(t *testing.T) {
podSpec := testPodSpec()
info := &CheckpointInfo{Enabled: true, Ready: true, Hash: testHash}
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, testPVCConfig()))
info := &CheckpointInfo{Enabled: true, Ready: true, Identity: ptr.To(testIdentity())}
reader := fake.NewClientBuilder().WithScheme(testScheme()).WithObjects(testSnapshotAgentDaemonSet()).Build()
require.NoError(t, InjectCheckpointIntoPodSpec(context.Background(), reader, testNamespace, podSpec, info))
assert.Equal(t, []string{"sleep", "infinity"}, podSpec.Containers[0].Command)
assert.Nil(t, podSpec.Containers[0].Args)
})
assert.Len(t, info.Hash, 16)
t.Run("ready checkpoint preserves published versioned location", func(t *testing.T) {
podSpec := testPodSpec()
info := &CheckpointInfo{
Enabled: true,
Ready: true,
Hash: testHash,
Location: "/checkpoints/" + testHash + "/versions/2",
StorageType: "pvc",
volumes := map[string]corev1.Volume{}
for _, volume := range podSpec.Volumes {
volumes[volume.Name] = volume
}
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, testPVCConfig()))
assert.Equal(t, "/checkpoints/"+testHash+"/versions/2", info.Location)
assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointStorageType("pvc"), info.StorageType)
})
t.Run("not-ready checkpoint preserves original command", func(t *testing.T) {
podSpec := testPodSpec()
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, testInfo(), testPVCConfig()))
assert.Equal(t, []string{"python3"}, podSpec.Containers[0].Command)
})
t.Run("sets seccomp profile", func(t *testing.T) {
podSpec := testPodSpec()
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, testInfo(), testPVCConfig()))
require.NotNil(t, podSpec.SecurityContext)
require.NotNil(t, podSpec.SecurityContext.SeccompProfile)
assert.Equal(t, corev1.SeccompProfileTypeLocalhost, podSpec.SecurityContext.SeccompProfile.Type)
assert.Equal(t, consts.SeccompProfilePath, *podSpec.SecurityContext.SeccompProfile.LocalhostProfile)
})
require.Contains(t, volumes, consts.PodInfoVolumeName)
require.NotNil(t, volumes[consts.PodInfoVolumeName].DownwardAPI)
t.Run("preserves existing security context", func(t *testing.T) {
podSpec := testPodSpec()
podSpec.SecurityContext = &corev1.PodSecurityContext{RunAsUser: ptr.To(int64(1000))}
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, testInfo(), testPVCConfig()))
assert.Equal(t, int64(1000), *podSpec.SecurityContext.RunAsUser)
require.NotNil(t, podSpec.SecurityContext.SeccompProfile)
})
t.Run("PVC storage injects volumes and mounts", func(t *testing.T) {
podSpec := testPodSpec()
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, testInfo(), testPVCConfig()))
// Volumes
volNames := make(map[string]bool)
for _, v := range podSpec.Volumes {
volNames[v.Name] = true
if v.Name == consts.CheckpointVolumeName {
assert.Equal(t, "snapshot-pvc", v.PersistentVolumeClaim.ClaimName)
}
if v.Name == consts.PodInfoVolumeName {
require.NotNil(t, v.DownwardAPI)
fieldPaths := map[string]string{}
for _, item := range v.DownwardAPI.Items {
if item.FieldRef != nil {
fieldPaths[item.Path] = item.FieldRef.FieldPath
}
}
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoNamespace+"']", fieldPaths[consts.PodInfoFileDynNamespace])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoWorkerHash+"']", fieldPaths[consts.PodInfoFileDynNamespaceWorkerSuffix])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoComponentType+"']", fieldPaths[consts.PodInfoFileDynComponent])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoGraphDeploymentName+"']", fieldPaths[consts.PodInfoFileDynParentDGDName])
assert.Equal(t, consts.PodInfoFieldPodNamespace, fieldPaths[consts.PodInfoFileDynParentDGDNamespace])
fields := map[string]string{}
for _, item := range volumes[consts.PodInfoVolumeName].DownwardAPI.Items {
if item.FieldRef != nil {
fields[item.Path] = item.FieldRef.FieldPath
}
}
assert.True(t, volNames[consts.CheckpointVolumeName])
assert.True(t, volNames[consts.PodInfoVolumeName])
// Mounts
mountPaths := make(map[string]string)
for _, m := range podSpec.Containers[0].VolumeMounts {
mountPaths[m.Name] = m.MountPath
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoNamespace+"']", fields[consts.PodInfoFileDynNamespace])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoWorkerHash+"']", fields[consts.PodInfoFileDynNamespaceWorkerSuffix])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoComponentType+"']", fields[consts.PodInfoFileDynComponent])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoGraphDeploymentName+"']", fields[consts.PodInfoFileDynParentDGDName])
assert.Equal(t, consts.PodInfoFieldPodNamespace, fields[consts.PodInfoFileDynParentDGDNamespace])
mountPaths := map[string]string{}
for _, mount := range podSpec.Containers[0].VolumeMounts {
mountPaths[mount.Name] = mount.MountPath
}
assert.Equal(t, "/checkpoints", mountPaths[consts.CheckpointVolumeName])
assert.Equal(t, consts.PodInfoMountPath, mountPaths[consts.PodInfoVolumeName])
})
t.Run("computes hash from identity when hash is empty", func(t *testing.T) {
podSpec := testPodSpec()
identity := testIdentity()
info := &CheckpointInfo{Enabled: true, Identity: &identity}
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, testPVCConfig()))
assert.Len(t, info.Hash, 16)
})
t.Run("S3 and OCI storage set location", func(t *testing.T) {
for _, tc := range []struct {
storageType string
config configv1alpha1.CheckpointStorageConfiguration
wantLoc string
}{
{"s3", configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypeS3,
S3: configv1alpha1.CheckpointS3Config{URI: "s3://bucket/prefix"},
}, "s3://bucket/prefix/" + testHash + ".tar"},
{"oci", configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypeOCI,
OCI: configv1alpha1.CheckpointOCIConfig{URI: "oci://registry/repo"},
}, "oci://registry/repo:" + testHash},
} {
t.Run(tc.storageType, func(t *testing.T) {
podSpec := testPodSpec()
info := &CheckpointInfo{Enabled: true, Hash: testHash}
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, &configv1alpha1.CheckpointConfiguration{Storage: tc.config}))
assert.Equal(t, tc.wantLoc, info.Location)
})
t.Run("ready checkpoint targets the container named main", func(t *testing.T) {
podSpec := &corev1.PodSpec{
Containers: []corev1.Container{
{Name: "sidecar", Image: "sidecar:latest", Command: []string{"sidecar"}, Args: []string{"run"}},
{Name: consts.MainContainerName, Image: "main:latest", Command: []string{"python3"}, Args: []string{"-m", "dynamo.vllm"}},
},
}
info := &CheckpointInfo{Enabled: true, Ready: true, Hash: testHash}
reader := fake.NewClientBuilder().WithScheme(testScheme()).WithObjects(testSnapshotAgentDaemonSet()).Build()
require.NoError(t, InjectCheckpointIntoPodSpec(context.Background(), reader, testNamespace, podSpec, info))
assert.Equal(t, []string{"sidecar"}, podSpec.Containers[0].Command)
assert.Equal(t, []string{"run"}, podSpec.Containers[0].Args)
assert.Equal(t, []string{"sleep", "infinity"}, podSpec.Containers[1].Command)
assert.Nil(t, podSpec.Containers[1].Args)
})
t.Run("error cases", func(t *testing.T) {
......@@ -408,35 +240,21 @@ func TestInjectCheckpointIntoPodSpec(t *testing.T) {
name string
podSpec *corev1.PodSpec
info *CheckpointInfo
config *configv1alpha1.CheckpointConfiguration
reader client.Reader
errMsg string
}{
{"hash empty and identity nil", testPodSpec(), &CheckpointInfo{Enabled: true}, testPVCConfig(), "identity is nil"},
{"no containers", &corev1.PodSpec{}, testInfo(), testPVCConfig(), "no container found"},
{"PVC name missing", testPodSpec(), testInfo(), &configv1alpha1.CheckpointConfiguration{
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "pvc", PVC: configv1alpha1.CheckpointPVCConfig{BasePath: "/checkpoints"}},
}, "no PVC name"},
{"S3 URI missing", testPodSpec(), testInfo(), &configv1alpha1.CheckpointConfiguration{
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "s3"},
}, "S3"},
{"OCI URI missing", testPodSpec(), testInfo(), &configv1alpha1.CheckpointConfiguration{
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "oci"},
}, "OCI"},
{"hash empty and identity nil", testPodSpec(), &CheckpointInfo{Enabled: true}, fake.NewClientBuilder().WithScheme(testScheme()).WithObjects(testSnapshotAgentDaemonSet()).Build(), "identity is nil"},
{"no containers", &corev1.PodSpec{}, testInfo(), fake.NewClientBuilder().WithScheme(testScheme()).WithObjects(testSnapshotAgentDaemonSet()).Build(), "no container found"},
{"main container missing", &corev1.PodSpec{Containers: []corev1.Container{{Name: "sidecar", Image: "img", Command: []string{"python3"}}}}, testInfo(), fake.NewClientBuilder().WithScheme(testScheme()).WithObjects(testSnapshotAgentDaemonSet()).Build(), "main container not found"},
{"snapshot daemonset missing", testPodSpec(), testInfo(), fake.NewClientBuilder().WithScheme(testScheme()).Build(), "no snapshot-agent daemonset found"},
} {
t.Run(tc.name, func(t *testing.T) {
err := InjectCheckpointIntoPodSpec(tc.podSpec, tc.info, tc.config)
err := InjectCheckpointIntoPodSpec(context.Background(), tc.reader, testNamespace, tc.podSpec, tc.info)
require.Error(t, err)
assert.Contains(t, err.Error(), tc.errMsg)
})
}
})
t.Run("falls back to first container when main not found", func(t *testing.T) {
podSpec := &corev1.PodSpec{Containers: []corev1.Container{{Name: "sidecar", Image: "img", Command: []string{"python3"}}}}
info := &CheckpointInfo{Enabled: true, Ready: true, Hash: testHash}
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, testPVCConfig()))
assert.Equal(t, []string{"sleep", "infinity"}, podSpec.Containers[0].Command)
})
}
// --- ResolveCheckpointForService tests ---
......@@ -463,8 +281,6 @@ func TestResolveCheckpointForService(t *testing.T) {
Status: nvidiacomv1alpha1.DynamoCheckpointStatus{
Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
IdentityHash: hash,
Location: "/checkpoints/" + hash,
StorageType: "pvc",
},
}
c := fake.NewClientBuilder().WithScheme(s).WithObjects(ckpt).WithStatusSubresource(ckpt).Build()
......@@ -477,7 +293,6 @@ func TestResolveCheckpointForService(t *testing.T) {
assert.True(t, info.Exists)
assert.True(t, info.Ready)
assert.Equal(t, hash, info.Hash)
assert.Equal(t, "/checkpoints/"+hash, info.Location)
assert.Equal(t, hash, info.CheckpointName)
})
......@@ -541,8 +356,6 @@ func TestResolveCheckpointForService(t *testing.T) {
Status: nvidiacomv1alpha1.DynamoCheckpointStatus{
Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
IdentityHash: hash,
Location: "/checkpoints/" + hash,
StorageType: "pvc",
},
}
c := fake.NewClientBuilder().WithScheme(s).WithObjects(ckpt).WithStatusSubresource(ckpt).Build()
......
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package checkpoint
import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
)
// EnsurePodInfoVolume appends the pod-info downward API volume to podSpec.
// It is a no-op when a volume named commonconsts.PodInfoVolumeName already
// exists, so calling it repeatedly never produces duplicates.
//
// The volume projects the pod name, UID and namespace, together with the
// Dynamo namespace, worker hash, component type and parent graph deployment
// labels, as individual files.
func EnsurePodInfoVolume(podSpec *corev1.PodSpec) {
	for i := range podSpec.Volumes {
		if podSpec.Volumes[i].Name == commonconsts.PodInfoVolumeName {
			return
		}
	}

	// fieldFile projects a single pod field into the file at path.
	fieldFile := func(path, fieldPath string) corev1.DownwardAPIVolumeFile {
		return corev1.DownwardAPIVolumeFile{
			Path:     path,
			FieldRef: &corev1.ObjectFieldSelector{FieldPath: fieldPath},
		}
	}
	// labelRef builds the downward API field path that selects one pod label.
	labelRef := func(label string) string {
		return "metadata.labels['" + label + "']"
	}

	projected := []corev1.DownwardAPIVolumeFile{
		fieldFile("pod_name", commonconsts.PodInfoFieldPodName),
		fieldFile("pod_uid", commonconsts.PodInfoFieldPodUID),
		fieldFile("pod_namespace", commonconsts.PodInfoFieldPodNamespace),
		fieldFile(commonconsts.PodInfoFileDynNamespace, labelRef(commonconsts.KubeLabelDynamoNamespace)),
		fieldFile(commonconsts.PodInfoFileDynNamespaceWorkerSuffix, labelRef(commonconsts.KubeLabelDynamoWorkerHash)),
		fieldFile(commonconsts.PodInfoFileDynComponent, labelRef(commonconsts.KubeLabelDynamoComponentType)),
		fieldFile(commonconsts.PodInfoFileDynParentDGDName, labelRef(commonconsts.KubeLabelDynamoGraphDeploymentName)),
		// The parent graph deployment namespace is read from the pod's own namespace field.
		fieldFile(commonconsts.PodInfoFileDynParentDGDNamespace, commonconsts.PodInfoFieldPodNamespace),
	}

	podInfo := corev1.Volume{
		Name: commonconsts.PodInfoVolumeName,
		VolumeSource: corev1.VolumeSource{
			DownwardAPI: &corev1.DownwardAPIVolumeSource{Items: projected},
		},
	}
	podSpec.Volumes = append(podSpec.Volumes, podInfo)
}
// EnsurePodInfoMount mounts the pod-info volume read-only into container at
// commonconsts.PodInfoMountPath. It does nothing when the container already
// has a mount referencing commonconsts.PodInfoVolumeName.
func EnsurePodInfoMount(container *corev1.Container) {
	for i := range container.VolumeMounts {
		if container.VolumeMounts[i].Name == commonconsts.PodInfoVolumeName {
			return
		}
	}

	podInfoMount := corev1.VolumeMount{
		Name:      commonconsts.PodInfoVolumeName,
		MountPath: commonconsts.PodInfoMountPath,
		ReadOnly:  true,
	}
	container.VolumeMounts = append(container.VolumeMounts, podInfoMount)
}
......@@ -18,177 +18,32 @@
package checkpoint
import (
"context"
"fmt"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
corev1 "k8s.io/api/core/v1"
"k8s.io/utils/ptr"
ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
)
// ApplyCheckpointSourcePodMetadata marks a pod's metadata as a checkpoint
// source. It first removes the restore-target label, the checkpoint hash
// label and the checkpoint location/storage-type annotations, then sets the
// checkpoint-source label. The hash label and the location and storage-type
// annotations are written back only when the corresponding argument is
// non-empty.
//
// Both maps are written to, so they must be non-nil.
func ApplyCheckpointSourcePodMetadata(
	labels map[string]string,
	annotations map[string]string,
	hash string,
	location string,
	storageType nvidiacomv1alpha1.DynamoCheckpointStorageType,
) {
	for _, staleLabel := range []string{
		commonconsts.KubeLabelIsRestoreTarget,
		commonconsts.KubeLabelCheckpointHash,
	} {
		delete(labels, staleLabel)
	}
	for _, staleAnnotation := range []string{
		commonconsts.KubeAnnotationCheckpointLocation,
		commonconsts.KubeAnnotationCheckpointStorageType,
	} {
		delete(annotations, staleAnnotation)
	}

	labels[commonconsts.KubeLabelIsCheckpointSource] = commonconsts.KubeLabelValueTrue

	// setIfPresent writes value under key only when value is non-empty.
	setIfPresent := func(target map[string]string, key, value string) {
		if value == "" {
			return
		}
		target[key] = value
	}
	setIfPresent(labels, commonconsts.KubeLabelCheckpointHash, hash)
	setIfPresent(annotations, commonconsts.KubeAnnotationCheckpointLocation, location)
	setIfPresent(annotations, commonconsts.KubeAnnotationCheckpointStorageType, string(storageType))
}
func ApplyRestorePodMetadata(labels map[string]string, annotations map[string]string, checkpointInfo *CheckpointInfo) {
delete(labels, commonconsts.KubeLabelIsRestoreTarget)
delete(labels, commonconsts.KubeLabelCheckpointHash)
delete(annotations, commonconsts.KubeAnnotationCheckpointLocation)
delete(annotations, commonconsts.KubeAnnotationCheckpointStorageType)
if checkpointInfo == nil || !checkpointInfo.Enabled || !checkpointInfo.Ready {
return
}
labels[commonconsts.KubeLabelIsRestoreTarget] = commonconsts.KubeLabelValueTrue
if checkpointInfo.Hash != "" {
labels[commonconsts.KubeLabelCheckpointHash] = checkpointInfo.Hash
}
if checkpointInfo.Location != "" {
annotations[commonconsts.KubeAnnotationCheckpointLocation] = checkpointInfo.Location
}
if checkpointInfo.StorageType != "" {
annotations[commonconsts.KubeAnnotationCheckpointStorageType] = string(checkpointInfo.StorageType)
}
}
// InjectCheckpointVolume adds a writable PVC-backed checkpoint volume that
// references pvcName to podSpec. When a volume named
// commonconsts.CheckpointVolumeName is already present the pod spec is left
// untouched.
func InjectCheckpointVolume(podSpec *corev1.PodSpec, pvcName string) {
	for i := range podSpec.Volumes {
		if podSpec.Volumes[i].Name == commonconsts.CheckpointVolumeName {
			return
		}
	}

	claim := &corev1.PersistentVolumeClaimVolumeSource{ClaimName: pvcName}
	claim.ReadOnly = false // checkpoint data is written through this volume

	podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{
		Name:         commonconsts.CheckpointVolumeName,
		VolumeSource: corev1.VolumeSource{PersistentVolumeClaim: claim},
	})
}
// InjectCheckpointVolumeMount mounts the checkpoint volume read-write at
// basePath inside container, unless the container already mounts a volume
// named commonconsts.CheckpointVolumeName.
func InjectCheckpointVolumeMount(container *corev1.Container, basePath string) {
	for i := range container.VolumeMounts {
		if container.VolumeMounts[i].Name == commonconsts.CheckpointVolumeName {
			return
		}
	}

	checkpointMount := corev1.VolumeMount{
		Name:      commonconsts.CheckpointVolumeName,
		MountPath: basePath,
		ReadOnly:  false,
	}
	container.VolumeMounts = append(container.VolumeMounts, checkpointMount)
}
// InjectPodInfoVolume adds the pod-info downward API volume to podSpec when
// no volume named commonconsts.PodInfoVolumeName exists yet.
//
// The projected files cover the pod name, UID and namespace as well as the
// Dynamo namespace, worker hash, component type and parent graph deployment
// labels.
func InjectPodInfoVolume(podSpec *corev1.PodSpec) {
	for i := range podSpec.Volumes {
		if podSpec.Volumes[i].Name == commonconsts.PodInfoVolumeName {
			return
		}
	}

	const labelPrefix, labelSuffix = "metadata.labels['", "']"

	// Each entry maps a file name inside the volume to the pod field it exposes.
	projections := []struct {
		path      string
		fieldPath string
	}{
		{"pod_name", commonconsts.PodInfoFieldPodName},
		{"pod_uid", commonconsts.PodInfoFieldPodUID},
		{"pod_namespace", commonconsts.PodInfoFieldPodNamespace},
		{commonconsts.PodInfoFileDynNamespace, labelPrefix + commonconsts.KubeLabelDynamoNamespace + labelSuffix},
		{commonconsts.PodInfoFileDynNamespaceWorkerSuffix, labelPrefix + commonconsts.KubeLabelDynamoWorkerHash + labelSuffix},
		{commonconsts.PodInfoFileDynComponent, labelPrefix + commonconsts.KubeLabelDynamoComponentType + labelSuffix},
		{commonconsts.PodInfoFileDynParentDGDName, labelPrefix + commonconsts.KubeLabelDynamoGraphDeploymentName + labelSuffix},
		{commonconsts.PodInfoFileDynParentDGDNamespace, commonconsts.PodInfoFieldPodNamespace},
	}

	files := make([]corev1.DownwardAPIVolumeFile, 0, len(projections))
	for _, projection := range projections {
		files = append(files, corev1.DownwardAPIVolumeFile{
			Path:     projection.path,
			FieldRef: &corev1.ObjectFieldSelector{FieldPath: projection.fieldPath},
		})
	}

	podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{
		Name: commonconsts.PodInfoVolumeName,
		VolumeSource: corev1.VolumeSource{
			DownwardAPI: &corev1.DownwardAPIVolumeSource{Items: files},
		},
	})
}
func InjectPodInfoVolumeMount(container *corev1.Container) {
for _, mount := range container.VolumeMounts {
if mount.Name == commonconsts.PodInfoVolumeName {
return
}
}
container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{
Name: commonconsts.PodInfoVolumeName,
MountPath: commonconsts.PodInfoMountPath,
ReadOnly: true,
})
enabled := checkpointInfo != nil && checkpointInfo.Enabled && checkpointInfo.Ready
hash := ""
artifactVersion := ""
if enabled {
hash = checkpointInfo.Hash
artifactVersion = checkpointInfo.ArtifactVersion
}
snapshotprotocol.ApplyRestoreTargetMetadata(labels, annotations, enabled, hash, artifactVersion)
}
func InjectCheckpointIntoPodSpec(
ctx context.Context,
reader ctrlclient.Reader,
namespace string,
podSpec *corev1.PodSpec,
checkpointInfo *CheckpointInfo,
checkpointConfig *configv1alpha1.CheckpointConfiguration,
) error {
if checkpointInfo == nil || !checkpointInfo.Enabled {
return nil
......@@ -207,6 +62,9 @@ func InjectCheckpointIntoPodSpec(
info.Hash = hash
}
if len(podSpec.Containers) == 0 {
return fmt.Errorf("no container found to inject checkpoint config")
}
var mainContainer *corev1.Container
for i := range podSpec.Containers {
if podSpec.Containers[i].Name == commonconsts.MainContainerName {
......@@ -214,83 +72,27 @@ func InjectCheckpointIntoPodSpec(
break
}
}
if mainContainer == nil && len(podSpec.Containers) > 0 {
mainContainer = &podSpec.Containers[0]
}
if mainContainer == nil {
return fmt.Errorf("no container found to inject checkpoint config")
}
if info.Ready {
mainContainer.Command = []string{"sleep", "infinity"}
mainContainer.Args = nil
}
if podSpec.SecurityContext == nil {
podSpec.SecurityContext = &corev1.PodSecurityContext{}
}
podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeLocalhost,
LocalhostProfile: ptr.To(commonconsts.SeccompProfilePath),
}
storageType := configv1alpha1.CheckpointStorageTypePVC
var storageConfig *configv1alpha1.CheckpointStorageConfiguration
if checkpointConfig != nil {
storageConfig = &checkpointConfig.Storage
if storageConfig.Type != "" {
storageType = storageConfig.Type
}
}
if err := injectCheckpointStorage(podSpec, mainContainer, info, storageType, storageConfig); err != nil {
return fmt.Errorf("main container not found in pod spec")
}
if reader == nil {
return fmt.Errorf("checkpoint client is required")
}
if err := snapshotprotocol.PrepareRestorePodSpecForCheckpoint(
ctx,
reader,
namespace,
podSpec,
mainContainer,
info.Hash,
info.ArtifactVersion,
commonconsts.SeccompProfilePath,
info.Ready,
); err != nil {
return err
}
InjectPodInfoVolume(podSpec)
InjectPodInfoVolumeMount(mainContainer)
EnsurePodInfoVolume(podSpec)
EnsurePodInfoMount(mainContainer)
return nil
}
// injectCheckpointStorage validates the storage configuration for
// storageType and fills in the checkpoint location on info.
//
// info.StorageType is set from storageType when it is empty, and
// info.Location is derived from the backend URI/base path and info.Hash only
// when it is not already set. For the S3 and OCI backends nothing is added to
// the pod spec. Any other storage type is treated as PVC: the checkpoint
// volume is added to podSpec and mounted into mainContainer at the configured
// base path.
//
// An error is returned when the setting required by the selected backend
// (S3 URI, OCI URI, or PVC name and base path) is missing.
func injectCheckpointStorage(
	podSpec *corev1.PodSpec,
	mainContainer *corev1.Container,
	info *CheckpointInfo,
	storageType string,
	storageConfig *configv1alpha1.CheckpointStorageConfiguration,
) error {
	if info.StorageType == "" {
		info.StorageType = nvidiacomv1alpha1.DynamoCheckpointStorageType(storageType)
	}
	locationUnset := info.Location == ""

	if storageType == configv1alpha1.CheckpointStorageTypeS3 {
		if storageConfig == nil || storageConfig.S3.URI == "" {
			return fmt.Errorf("S3 storage type selected but no S3 URI configured (set checkpoint.storage.s3.uri)")
		}
		if locationUnset {
			info.Location = fmt.Sprintf("%s/%s.tar", storageConfig.S3.URI, info.Hash)
		}
		return nil
	}

	if storageType == configv1alpha1.CheckpointStorageTypeOCI {
		if storageConfig == nil || storageConfig.OCI.URI == "" {
			return fmt.Errorf("OCI storage type selected but no OCI URI configured (set checkpoint.storage.oci.uri)")
		}
		if locationUnset {
			info.Location = fmt.Sprintf("%s:%s", storageConfig.OCI.URI, info.Hash)
		}
		return nil
	}

	// Every remaining storage type is handled as PVC.
	if storageConfig == nil || storageConfig.PVC.PVCName == "" {
		return fmt.Errorf("PVC storage type selected but no PVC name configured (set checkpoint.storage.pvc.pvcName)")
	}
	pvc := storageConfig.PVC
	if pvc.BasePath == "" {
		return fmt.Errorf("PVC storage type selected but no PVC base path configured (set checkpoint.storage.pvc.basePath)")
	}
	if locationUnset {
		info.Location = fmt.Sprintf("%s/%s", pvc.BasePath, info.Hash)
	}
	InjectCheckpointVolume(podSpec, pvc.PVCName)
	InjectCheckpointVolumeMount(mainContainer, pvc.BasePath)
	return nil
}
......@@ -20,24 +20,21 @@ package checkpoint
import (
"context"
"fmt"
"strings"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)
type CheckpointInfo struct {
Enabled bool
Exists bool
Identity *nvidiacomv1alpha1.DynamoCheckpointIdentity
Hash string
Location string
StorageType nvidiacomv1alpha1.DynamoCheckpointStorageType
CheckpointName string
Ready bool
Enabled bool
Exists bool
Identity *nvidiacomv1alpha1.DynamoCheckpointIdentity
Hash string
ArtifactVersion string
CheckpointName string
Ready bool
}
func checkpointInfoFromObject(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (*CheckpointInfo, error) {
......@@ -47,17 +44,23 @@ func checkpointInfoFromObject(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (*Checkp
}
return &CheckpointInfo{
Enabled: true,
Exists: true,
Identity: &ckpt.Spec.Identity,
Hash: hash,
Location: ckpt.Status.Location,
StorageType: ckpt.Status.StorageType,
CheckpointName: ckpt.Name,
Ready: ckpt.Status.Phase == nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
Enabled: true,
Exists: true,
Identity: &ckpt.Spec.Identity,
Hash: hash,
ArtifactVersion: checkpointArtifactVersion(ckpt),
CheckpointName: ckpt.Name,
Ready: ckpt.Status.Phase == nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
}, nil
}
// checkpointArtifactVersion returns the artifact version for ckpt, obtained
// by passing its artifact-version annotation value through
// snapshotprotocol.ArtifactVersion. A nil checkpoint yields
// snapshotprotocol.DefaultCheckpointArtifactVersion.
func checkpointArtifactVersion(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) string {
	if ckpt != nil {
		annotated := ckpt.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]
		return snapshotprotocol.ArtifactVersion(annotated)
	}
	return snapshotprotocol.DefaultCheckpointArtifactVersion
}
func ResolveCheckpointForService(
ctx context.Context,
c client.Client,
......@@ -105,37 +108,3 @@ func ResolveCheckpointForService(
info.Identity = config.Identity
return info, nil
}
// ResolveCheckpointStorage computes where the checkpoint artifact identified
// by hash and version is stored, and which storage backend holds it.
//
// version is trimmed of surrounding whitespace and falls back to
// consts.DefaultCheckpointArtifactVersion when empty. The backend comes from
// config.Storage.Type and defaults to PVC when config is nil or the type is
// unset. The resulting location is:
//
//   - S3:  <uri>/<hash>/versions/<version>.tar
//   - OCI: <uri>:<hash>-<version>
//   - PVC: <basePath>/<hash>/versions/<version>
//
// An error is returned when the URI or base path required by the selected
// backend is not configured.
func ResolveCheckpointStorage(
	hash string,
	version string,
	config *configv1alpha1.CheckpointConfiguration,
) (string, nvidiacomv1alpha1.DynamoCheckpointStorageType, error) {
	if version = strings.TrimSpace(version); version == "" {
		version = consts.DefaultCheckpointArtifactVersion
	}

	// A nil config behaves like an empty storage section.
	var storage configv1alpha1.CheckpointStorageConfiguration
	if config != nil {
		storage = config.Storage
	}

	backend := storage.Type
	if backend == "" {
		backend = configv1alpha1.CheckpointStorageTypePVC
	}
	resolvedType := nvidiacomv1alpha1.DynamoCheckpointStorageType(backend)

	if backend == configv1alpha1.CheckpointStorageTypeS3 {
		if storage.S3.URI == "" {
			return "", "", fmt.Errorf("S3 storage type selected but no S3 URI configured (set checkpoint.storage.s3.uri)")
		}
		return fmt.Sprintf("%s/%s/versions/%s.tar", storage.S3.URI, hash, version), resolvedType, nil
	}

	if backend == configv1alpha1.CheckpointStorageTypeOCI {
		if storage.OCI.URI == "" {
			return "", "", fmt.Errorf("OCI storage type selected but no OCI URI configured (set checkpoint.storage.oci.uri)")
		}
		return fmt.Sprintf("%s:%s-%s", storage.OCI.URI, hash, version), resolvedType, nil
	}

	// Any other backend value is resolved as PVC.
	if storage.PVC.BasePath == "" {
		return "", "", fmt.Errorf("PVC storage type selected but no PVC base path configured (set checkpoint.storage.pvc.basePath)")
	}
	return fmt.Sprintf("%s/%s/versions/%s", storage.PVC.BasePath, hash, version), resolvedType, nil
}
......@@ -23,6 +23,7 @@ import (
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
......@@ -55,7 +56,7 @@ func FindCheckpointByIdentityHash(
ctx,
checkpoints,
client.InNamespace(namespace),
client.MatchingLabels{consts.KubeLabelCheckpointHash: hash},
client.MatchingLabels{consts.KubeLabelCheckpointID: hash},
); err != nil {
return nil, fmt.Errorf("failed to list checkpoints by hash label: %w", err)
}
......@@ -118,10 +119,10 @@ func CreateOrGetAutoCheckpoint(
Name: fmt.Sprintf("checkpoint-%s", hash),
Namespace: namespace,
Labels: map[string]string{
consts.KubeLabelCheckpointHash: hash,
consts.KubeLabelCheckpointID: hash,
},
Annotations: map[string]string{
consts.KubeAnnotationCheckpointArtifactVersion: consts.DefaultCheckpointArtifactVersion,
snapshotprotocol.CheckpointArtifactVersionAnnotation: snapshotprotocol.DefaultCheckpointArtifactVersion,
},
},
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{
......
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package checkpointjob
import (
"fmt"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)
// DesiredCheckpointJobName returns the checkpoint job name for identityHash,
// in the form "checkpoint-job-<identityHash>-<artifactVersion>". The artifact
// version is derived from the artifact-version annotation via
// snapshotprotocol.ArtifactVersion; annotations may be nil.
func DesiredCheckpointJobName(identityHash string, annotations map[string]string) string {
	annotated := annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]
	artifactVersion := snapshotprotocol.ArtifactVersion(annotated)
	return fmt.Sprintf("checkpoint-job-%s-%s", identityHash, artifactVersion)
}
// buildCheckpointWorkerDefaultEnv returns the environment of the worker
// default base container for a checkpoint job pod.
//
// The component context starts from the worker component type and the global
// Dynamo namespace, both of which can be overridden by labels on podTemplate.
// The component type label is honoured only when it names a worker component.
// The parent graph deployment name and worker hash suffix are taken from the
// pod template labels, and the checkpoint's own namespace is used as the
// parent graph deployment namespace.
func buildCheckpointWorkerDefaultEnv(
	ckpt *nvidiacomv1alpha1.DynamoCheckpoint,
	podTemplate *corev1.PodTemplateSpec,
) []corev1.EnvVar {
	labels := podTemplate.Labels

	componentCtx := dynamo.ComponentContext{
		ComponentType:                  consts.ComponentTypeWorker,
		DynamoNamespace:                consts.GlobalDynamoNamespace,
		ParentGraphDeploymentName:      labels[consts.KubeLabelDynamoGraphDeploymentName],
		ParentGraphDeploymentNamespace: ckpt.Namespace,
		DiscoveryBackend:               configv1alpha1.DiscoveryBackendKubernetes,
		WorkerHashSuffix:               labels[consts.KubeLabelDynamoWorkerHash],
	}

	if namespaceLabel := labels[consts.KubeLabelDynamoNamespace]; namespaceLabel != "" {
		componentCtx.DynamoNamespace = namespaceLabel
	}
	if typeLabel := labels[consts.KubeLabelDynamoComponentType]; typeLabel != "" && dynamo.IsWorkerComponent(typeLabel) {
		componentCtx.ComponentType = typeLabel
	}

	// NOTE(review): the error from GetBaseContainer is discarded here; confirm
	// it cannot fail for worker defaults, or surface it to the caller.
	baseContainer, _ := dynamo.NewWorkerDefaults().GetBaseContainer(componentCtx)
	return baseContainer.Env
}
// BuildCheckpointJob assembles the batch Job that captures the checkpoint
// described by ckpt, using jobName as the Job name.
//
// The pod template is deep-copied from ckpt.Spec.Job.PodTemplateSpec, so the
// checkpoint object itself is not modified. The identity hash comes from the
// checkpoint status and is computed from the spec identity when the status
// does not carry one yet.
//
// When the template has at least one container, the first container is
// treated as the worker: it receives the worker default and standard
// environment variables, the ready-for-checkpoint file variable, an exec
// readiness probe on that file (liveness and startup probes are removed), the
// pod-info mount and the shared-memory volume. The launch is wrapped when the
// first container requests more than one NVIDIA GPU.
//
// The active deadline defaults to 3600 seconds when the checkpoint does not
// set one. An error is returned when the identity hash cannot be computed or
// when snapshotprotocol.NewCheckpointJob fails.
func BuildCheckpointJob(
	config *configv1alpha1.OperatorConfiguration,
	ckpt *nvidiacomv1alpha1.DynamoCheckpoint,
	jobName string,
) (*batchv1.Job, error) {
	template := ckpt.Spec.Job.PodTemplateSpec.DeepCopy()

	identityHash := ckpt.Status.IdentityHash
	if identityHash == "" {
		computed, err := checkpoint.ComputeIdentityHash(ckpt.Spec.Identity)
		if err != nil {
			return nil, fmt.Errorf("failed to compute identity hash: %w", err)
		}
		identityHash = computed
	}

	if template.Labels == nil {
		template.Labels = map[string]string{}
	}
	if template.Annotations == nil {
		template.Annotations = map[string]string{}
	}

	checkpoint.EnsurePodInfoVolume(&template.Spec)

	needsLaunchWrapper := false
	if len(template.Spec.Containers) > 0 {
		worker := &template.Spec.Containers[0]

		// Defaults come first so that values already set on the container are
		// passed as the overriding side of the merge.
		defaults := buildCheckpointWorkerDefaultEnv(ckpt, template)
		worker.Env = dynamo.MergeEnvs(defaults, worker.Env)
		dynamo.AddStandardEnvVars(worker, config)

		readyFile := config.Checkpoint.ReadyForCheckpointFilePath
		worker.Env = append(worker.Env, corev1.EnvVar{
			Name:  consts.EnvReadyForCheckpointFile,
			Value: readyFile,
		})

		// Readiness is signalled solely by the ready-for-checkpoint file.
		worker.ReadinessProbe = &corev1.Probe{
			ProbeHandler: corev1.ProbeHandler{
				Exec: &corev1.ExecAction{Command: []string{"cat", readyFile}},
			},
			InitialDelaySeconds: 15,
			PeriodSeconds:       2,
		}
		worker.LivenessProbe = nil
		worker.StartupProbe = nil

		checkpoint.EnsurePodInfoMount(worker)
		dynamo.ApplySharedMemoryVolumeAndMount(&template.Spec, worker, ckpt.Spec.Job.SharedMemory)

		// Wrap the launch only for multi-GPU workers (GPU limit strictly above 1).
		gpuLimit, hasGPULimit := worker.Resources.Limits[corev1.ResourceName(consts.KubeResourceGPUNvidia)]
		if hasGPULimit {
			singleGPU := resource.NewQuantity(1, resource.DecimalSI)
			needsLaunchWrapper = gpuLimit.Cmp(*singleGPU) > 0
		}
	}

	deadline := ckpt.Spec.Job.ActiveDeadlineSeconds
	if deadline == nil {
		fallbackDeadline := int64(3600)
		deadline = &fallbackDeadline
	}

	ttl := snapshotprotocol.DefaultCheckpointJobTTLSeconds
	artifactVersion := snapshotprotocol.ArtifactVersion(ckpt.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation])

	options := snapshotprotocol.CheckpointJobOptions{
		Namespace:             ckpt.Namespace,
		CheckpointID:          identityHash,
		ArtifactVersion:       artifactVersion,
		SeccompProfile:        consts.SeccompProfilePath,
		Name:                  jobName,
		ActiveDeadlineSeconds: deadline,
		TTLSecondsAfterFinish: &ttl,
		WrapLaunchJob:         needsLaunchWrapper,
	}
	return snapshotprotocol.NewCheckpointJob(template, options)
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package checkpointjob
import (
"testing"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
)
func TestDesiredCheckpointJobName(t *testing.T) {
name := DesiredCheckpointJobName("abc123def4567890", map[string]string{
snapshotprotocol.CheckpointArtifactVersionAnnotation: "2",
})
if name != "checkpoint-job-abc123def4567890-2" {
t.Fatalf("unexpected checkpoint job name: %s", name)
}
defaultName := DesiredCheckpointJobName("abc123def4567890", nil)
if defaultName != "checkpoint-job-abc123def4567890-"+snapshotprotocol.DefaultCheckpointArtifactVersion {
t.Fatalf("unexpected default checkpoint job name: %s", defaultName)
}
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package checkpointjob
import (
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
)
// ObservationPhase is the coarse state that Observe derives for a checkpoint job.
type ObservationPhase string

const (
	// ObservationPhaseRunning is reported when the job has neither completed
	// nor failed and no failed checkpoint status annotation is present.
	ObservationPhaseRunning ObservationPhase = "running"
	// ObservationPhaseWaitingForConfirmation is reported when the job has
	// completed without a checkpoint status annotation while the checkpoint
	// worker is still active.
	ObservationPhaseWaitingForConfirmation ObservationPhase = "waiting_for_confirmation"
	// ObservationPhaseReady is reported when the job has completed and the
	// checkpoint status annotation reports completion.
	ObservationPhaseReady ObservationPhase = "ready"
	// ObservationPhaseFailed is reported when the job failed, the checkpoint
	// status annotation reports failure, or the job completed without
	// confirmation after the checkpoint worker stopped being active.
	ObservationPhaseFailed ObservationPhase = "failed"
)

// Observation is the result of Observe: the derived phase plus a reason and
// message. Observe leaves Reason and Message empty for the running and
// waiting-for-confirmation phases.
type Observation struct {
	Phase ObservationPhase
	Reason string
	Message string
}
// Observe classifies a checkpoint job from its conditions and the checkpoint
// status annotation found on the job.
//
// Only conditions whose status is true are considered. The rules are applied
// in this order:
//
//  1. A failed status annotation always yields ObservationPhaseFailed; the
//     reason is CheckpointVerificationFailed when the job itself completed,
//     JobFailed otherwise.
//  2. A completed job is ready when the annotation reports completion, is
//     waiting for confirmation while checkpointWorkerActive is true, and
//     otherwise fails with CheckpointVerificationFailed.
//  3. A job with a failed condition yields ObservationPhaseFailed/JobFailed.
//  4. Anything else is still running.
func Observe(job *batchv1.Job, checkpointWorkerActive bool) Observation {
	var completed, failed bool
	for i := range job.Status.Conditions {
		cond := &job.Status.Conditions[i]
		if cond.Status != corev1.ConditionTrue {
			continue
		}
		switch cond.Type {
		case batchv1.JobComplete:
			completed = true
		case batchv1.JobFailed:
			failed = true
		}
	}

	agentStatus := job.Annotations[snapshotprotocol.CheckpointStatusAnnotation]
	agentReportedFailure := agentStatus == snapshotprotocol.CheckpointStatusFailed

	jobFailure := Observation{
		Phase:   ObservationPhaseFailed,
		Reason:  "JobFailed",
		Message: "Checkpoint job failed",
	}

	switch {
	case agentReportedFailure && completed:
		return Observation{
			Phase:   ObservationPhaseFailed,
			Reason:  "CheckpointVerificationFailed",
			Message: "Checkpoint job completed but snapshot-agent reported checkpoint failure",
		}
	case agentReportedFailure:
		return jobFailure
	case completed && agentStatus == snapshotprotocol.CheckpointStatusCompleted:
		return Observation{
			Phase:   ObservationPhaseReady,
			Reason:  "JobSucceeded",
			Message: "Checkpoint job completed successfully",
		}
	case completed && checkpointWorkerActive:
		// The job finished but no terminal annotation has been written yet.
		return Observation{Phase: ObservationPhaseWaitingForConfirmation}
	case completed:
		return Observation{
			Phase:   ObservationPhaseFailed,
			Reason:  "CheckpointVerificationFailed",
			Message: "Checkpoint job completed without snapshot-agent completion confirmation",
		}
	case failed:
		return jobFailure
	default:
		return Observation{Phase: ObservationPhaseRunning}
	}
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package checkpointjob
import (
"testing"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// TestObserve exercises every decision branch of Observe: running jobs,
// confirmed and unconfirmed completions, failures reported through the
// checkpoint status annotation, failures reported through the Job's own
// conditions, and conditions whose status is not true.
func TestObserve(t *testing.T) {
	// makeJob builds a Job carrying the given conditions and, when annotation
	// is non-empty, the checkpoint status annotation.
	makeJob := func(annotation string, conditions ...batchv1.JobCondition) *batchv1.Job {
		job := &batchv1.Job{
			ObjectMeta: metav1.ObjectMeta{
				Annotations: map[string]string{},
			},
			Status: batchv1.JobStatus{
				Conditions: conditions,
			},
		}
		if annotation != "" {
			job.Annotations[snapshotprotocol.CheckpointStatusAnnotation] = annotation
		}
		return job
	}

	tests := []struct {
		name                   string
		job                    *batchv1.Job
		checkpointWorkerActive bool
		wantPhase              ObservationPhase
		wantReason             string
		wantMessage            string
	}{
		{
			name:      "running job stays running",
			job:       makeJob(""),
			wantPhase: ObservationPhaseRunning,
		},
		{
			name: "conditions that are not true are ignored",
			job: makeJob(
				"",
				batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionFalse},
				batchv1.JobCondition{Type: batchv1.JobFailed, Status: corev1.ConditionFalse},
			),
			wantPhase: ObservationPhaseRunning,
		},
		{
			name: "completed job with completion annotation is ready",
			job: makeJob(
				snapshotprotocol.CheckpointStatusCompleted,
				batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
			),
			wantPhase:   ObservationPhaseReady,
			wantReason:  "JobSucceeded",
			wantMessage: "Checkpoint job completed successfully",
		},
		{
			name: "completed job waits for terminal confirmation while worker is active",
			job: makeJob(
				"",
				batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
			),
			checkpointWorkerActive: true,
			wantPhase:              ObservationPhaseWaitingForConfirmation,
		},
		{
			name: "completed job fails without confirmation once worker is inactive",
			job: makeJob(
				"",
				batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
			),
			wantPhase:   ObservationPhaseFailed,
			wantReason:  "CheckpointVerificationFailed",
			wantMessage: "Checkpoint job completed without snapshot-agent completion confirmation",
		},
		{
			name: "failed checkpoint annotation wins over completed job",
			job: makeJob(
				snapshotprotocol.CheckpointStatusFailed,
				batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
			),
			checkpointWorkerActive: true,
			wantPhase:              ObservationPhaseFailed,
			wantReason:             "CheckpointVerificationFailed",
			wantMessage:            "Checkpoint job completed but snapshot-agent reported checkpoint failure",
		},
		{
			name:        "failed checkpoint annotation on an unfinished job is a job failure",
			job:         makeJob(snapshotprotocol.CheckpointStatusFailed),
			wantPhase:   ObservationPhaseFailed,
			wantReason:  "JobFailed",
			wantMessage: "Checkpoint job failed",
		},
		{
			name: "failed job condition without annotation is failed",
			job: makeJob(
				"",
				batchv1.JobCondition{Type: batchv1.JobFailed, Status: corev1.ConditionTrue},
			),
			wantPhase:   ObservationPhaseFailed,
			wantReason:  "JobFailed",
			wantMessage: "Checkpoint job failed",
		},
		{
			name: "failed job condition is reported even while worker is active",
			job: makeJob(
				"",
				batchv1.JobCondition{Type: batchv1.JobFailed, Status: corev1.ConditionTrue},
			),
			checkpointWorkerActive: true,
			wantPhase:              ObservationPhaseFailed,
			wantReason:             "JobFailed",
			wantMessage:            "Checkpoint job failed",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			observation := Observe(tc.job, tc.checkpointWorkerActive)
			if observation.Phase != tc.wantPhase {
				t.Fatalf("phase = %q, want %q", observation.Phase, tc.wantPhase)
			}
			if observation.Reason != tc.wantReason {
				t.Fatalf("reason = %q, want %q", observation.Reason, tc.wantReason)
			}
			if observation.Message != tc.wantMessage {
				t.Fatalf("message = %q, want %q", observation.Message, tc.wantMessage)
			}
		})
	}
}
......@@ -140,17 +140,16 @@ const (
ResourceStateUnknown = "unknown"
// Checkpoint/restore constants
// CROSS-REFERENCE: Some constants below are duplicated in the snapshot package at
// deploy/snapshot/pkg/config/constants.go. If you change a value here, update there too.
// CROSS-REFERENCE: Some constants below are duplicated in deploy/snapshot/protocol.
// If you change a value here, update there too.
// Kubernetes labels
KubeLabelIsCheckpointSource = "nvidia.com/snapshot-is-checkpoint-source" // Pod label that triggers DaemonSet auto-checkpoint
KubeLabelCheckpointHash = "nvidia.com/snapshot-checkpoint-hash" // Checkpoint identity hash used for lookup/reuse (may differ from DynamoCheckpoint metadata.name)
KubeLabelCheckpointID = "nvidia.com/snapshot-checkpoint-id" // Checkpoint identity label; the operator stores the resolved identity hash as the value
KubeLabelIsRestoreTarget = "nvidia.com/snapshot-is-restore-target" // Pod label that triggers DaemonSet auto-restore
KubeAnnotationCheckpointArtifactVersion = "nvidia.com/snapshot-artifact-version" // Checkpoint artifact generation; changing it triggers a new immutable capture attempt
DefaultCheckpointArtifactVersion = "1"
KubeAnnotationCheckpointLocation = "nvidia.com/snapshot-checkpoint-location" // Pod annotation that tells snapshot-agent where the checkpoint lives
KubeAnnotationCheckpointStorageType = "nvidia.com/snapshot-checkpoint-storage-type" // Pod annotation that tells snapshot-agent which storage backend owns the checkpoint
DefaultCheckpointJobTTLSeconds = int32(300)
// Environment variables injected into pods
EnvReadyForCheckpointFile = "DYN_READY_FOR_CHECKPOINT_FILE" // Ready-for-checkpoint file path — checkpoint job pods
......
......@@ -20,19 +20,15 @@ package controller
import (
"context"
"fmt"
"strings"
"time"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo"
batchv1 "k8s.io/api/batch/v1"
coordinationv1 "k8s.io/api/coordination/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/record"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
......@@ -43,16 +39,11 @@ import (
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpointjob"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
)
// Checkpoint status annotation read from the checkpoint Job, together with the
// terminal values the reconciler compares it against.
const (
// checkpointStatusAnnotation is the Job annotation key that carries the
// checkpoint status (presumably written by the snapshot-agent — confirm).
checkpointStatusAnnotation = "nvidia.com/snapshot-checkpoint-status"
// checkpointStatusCompleted is the annotation value that confirms a finished
// Job actually produced a checkpoint.
checkpointStatusCompleted = "completed"
// checkpointStatusFailed is the annotation value that marks the checkpoint as
// failed, even when the Job itself completed.
checkpointStatusFailed = "failed"
)
// CheckpointReconciler reconciles a DynamoCheckpoint object
type CheckpointReconciler struct {
client.Client
......@@ -66,37 +57,6 @@ func (r *CheckpointReconciler) GetRecorder() record.EventRecorder {
return r.Recorder
}
// checkpointLeaseExpired reports whether lease is past its expiry as of now.
//
// The expiry is the lease's renew time (or, when that is unset, its acquire
// time) plus LeaseDurationSeconds. A lease that has no duration, or neither
// timestamp, is treated as expired.
func checkpointLeaseExpired(lease *coordinationv1.Lease, now time.Time) bool {
	spec := &lease.Spec
	if spec.LeaseDurationSeconds == nil {
		return true
	}
	ttl := time.Duration(*spec.LeaseDurationSeconds) * time.Second

	var expiry time.Time
	switch {
	case spec.RenewTime != nil:
		// Prefer the most recent renewal when the holder has renewed at least once.
		expiry = spec.RenewTime.Time.Add(ttl)
	case spec.AcquireTime != nil:
		expiry = spec.AcquireTime.Time.Add(ttl)
	default:
		return true
	}
	return now.After(expiry)
}
// desiredArtifactVersion returns the checkpoint artifact version requested via
// the artifact-version annotation on ckpt, with surrounding whitespace trimmed.
// When the annotation is absent or blank it returns
// consts.DefaultCheckpointArtifactVersion.
func desiredArtifactVersion(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) string {
	// Indexing a nil map yields "", so a missing annotation map needs no special case.
	requested := strings.TrimSpace(ckpt.Annotations[consts.KubeAnnotationCheckpointArtifactVersion])
	if requested == "" {
		return consts.DefaultCheckpointArtifactVersion
	}
	return requested
}
// desiredCheckpointJobName returns the name of the checkpoint Job for ckpt, in
// the form "checkpoint-job-<identityHash>-<artifact version>". The version
// suffix comes from desiredArtifactVersion.
func desiredCheckpointJobName(ckpt *nvidiacomv1alpha1.DynamoCheckpoint, identityHash string) string {
	artifactVersion := desiredArtifactVersion(ckpt)
	return "checkpoint-job-" + identityHash + "-" + artifactVersion
}
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamocheckpoints/finalizers,verbs=update
......@@ -126,8 +86,8 @@ func (r *CheckpointReconciler) Reconcile(ctx context.Context, req ctrl.Request)
if ckpt.Labels == nil {
ckpt.Labels = map[string]string{}
}
if ckpt.Labels[consts.KubeLabelCheckpointHash] != identityHash {
ckpt.Labels[consts.KubeLabelCheckpointHash] = identityHash
if ckpt.Labels[consts.KubeLabelCheckpointID] != identityHash {
ckpt.Labels[consts.KubeLabelCheckpointID] = identityHash
if err := r.Update(ctx, ckpt); err != nil {
return ctrl.Result{}, err
}
......@@ -157,7 +117,7 @@ func (r *CheckpointReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}
return ctrl.Result{}, nil
}
desiredJobName := desiredCheckpointJobName(ckpt, identityHash)
desiredJobName := checkpointjob.DesiredCheckpointJobName(identityHash, ckpt.Annotations)
switch ckpt.Status.Phase {
case "", nvidiacomv1alpha1.DynamoCheckpointPhasePending, nvidiacomv1alpha1.DynamoCheckpointPhaseCreating, nvidiacomv1alpha1.DynamoCheckpointPhaseReady, nvidiacomv1alpha1.DynamoCheckpointPhaseFailed:
default:
......@@ -221,17 +181,12 @@ func (r *CheckpointReconciler) handlePending(ctx context.Context, ckpt *nvidiaco
return ctrl.Result{}, fmt.Errorf("failed to compute checkpoint identity hash: %w", err)
}
}
version := desiredArtifactVersion(ckpt)
jobName := desiredCheckpointJobName(ckpt, hash)
location, storageType, err := checkpoint.ResolveCheckpointStorage(hash, version, &r.Config.Checkpoint)
if err != nil {
return ctrl.Result{}, err
}
jobName := checkpointjob.DesiredCheckpointJobName(hash, ckpt.Annotations)
// Use SyncResource to create/update the checkpoint Job
modified, _, err := commonController.SyncResource(ctx, r, ckpt, func(ctx context.Context) (*batchv1.Job, bool, error) {
job := r.buildCheckpointJob(ckpt, jobName)
return job, false, nil
job, err := checkpointjob.BuildCheckpointJob(r.Config, ckpt, jobName)
return job, false, err
})
if err != nil {
logger.Error(err, "Failed to sync checkpoint Job")
......@@ -245,8 +200,6 @@ func (r *CheckpointReconciler) handlePending(ctx context.Context, ckpt *nvidiaco
// Update status to Creating phase
ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseCreating
ckpt.Status.JobName = jobName
ckpt.Status.Location = location
ckpt.Status.StorageType = storageType
ckpt.Status.CreatedAt = nil
ckpt.Status.Message = ""
meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
......@@ -298,103 +251,39 @@ func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiac
return ctrl.Result{}, err
}
jobComplete := false
jobFailed := false
for _, condition := range job.Status.Conditions {
if condition.Status != corev1.ConditionTrue {
continue
var lease *coordinationv1.Lease
leaseKey := client.ObjectKey{Namespace: job.Namespace, Name: job.Name}
lease = &coordinationv1.Lease{}
if err := r.Get(ctx, leaseKey, lease); err != nil {
if !apierrors.IsNotFound(err) {
return ctrl.Result{}, err
}
if condition.Type == batchv1.JobComplete {
jobComplete = true
continue
lease = nil
}
now := time.Now()
checkpointWorkerActive := false
if lease != nil && lease.Spec.LeaseDurationSeconds != nil {
// The snapshot-agent owns and renews this lease while it is still finalizing
// checkpoint state. A Job can complete before the agent writes the terminal
// checkpoint annotation, so we keep requeuing until the lease is no longer active.
lastRenewal := lease.Spec.RenewTime
if lastRenewal == nil {
lastRenewal = lease.Spec.AcquireTime
}
if condition.Type == batchv1.JobFailed {
jobFailed = true
if lastRenewal != nil {
checkpointWorkerActive = !now.After(lastRenewal.Time.Add(time.Duration(*lease.Spec.LeaseDurationSeconds) * time.Second))
}
}
status := job.Annotations[checkpointStatusAnnotation]
if status == checkpointStatusFailed {
reason := "JobFailed"
message := "Checkpoint job failed"
if jobComplete {
reason = "CheckpointVerificationFailed"
message = "Checkpoint job completed but snapshot-agent reported checkpoint failure"
}
logger.Info("Checkpoint Job failed", "job", job.Name, "checkpoint_status", status)
r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", message)
ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseFailed
ckpt.Status.Message = message
meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted),
Status: metav1.ConditionFalse,
Reason: reason,
Message: message,
LastTransitionTime: metav1.Now(),
})
if err := r.Status().Update(ctx, ckpt); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
if jobComplete {
if status != checkpointStatusCompleted {
lease := &coordinationv1.Lease{}
leaseKey := client.ObjectKey{Namespace: job.Namespace, Name: job.Name}
if err := r.Get(ctx, leaseKey, lease); err != nil {
if !apierrors.IsNotFound(err) {
return ctrl.Result{}, err
}
} else if !checkpointLeaseExpired(lease, time.Now()) {
logger.V(1).Info("Checkpoint job is complete but checkpoint lease is still active; waiting for terminal watcher status", "job", job.Name)
return ctrl.Result{RequeueAfter: time.Second}, nil
}
reason := "CheckpointVerificationFailed"
message := "Checkpoint job completed without snapshot-agent completion confirmation"
if status == checkpointStatusFailed {
message = "Checkpoint job completed but snapshot-agent reported checkpoint failure"
}
logger.Info("Checkpoint Job completed without usable artifact", "job", job.Name, "checkpoint_status", status)
r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", message)
ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseFailed
ckpt.Status.Message = message
meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted),
Status: metav1.ConditionFalse,
Reason: reason,
Message: message,
LastTransitionTime: metav1.Now(),
})
if err := r.Status().Update(ctx, ckpt); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
observation := checkpointjob.Observe(job, checkpointWorkerActive)
switch observation.Phase {
case checkpointjob.ObservationPhaseWaitingForConfirmation:
logger.V(1).Info("Checkpoint job is complete but checkpoint worker is still active; waiting for terminal watcher status", "job", job.Name)
return ctrl.Result{RequeueAfter: time.Second}, nil
case checkpointjob.ObservationPhaseReady:
logger.Info("Checkpoint Job succeeded", "job", job.Name)
r.Recorder.Event(ckpt, corev1.EventTypeNormal, "CheckpointReady", "Checkpoint creation completed successfully")
if ckpt.Status.Location == "" || ckpt.Status.StorageType == "" {
version := desiredArtifactVersion(ckpt)
location, storageType, err := checkpoint.ResolveCheckpointStorage(
ckpt.Status.IdentityHash,
version,
&r.Config.Checkpoint,
)
if err != nil {
return ctrl.Result{}, err
}
ckpt.Status.Location = location
ckpt.Status.StorageType = storageType
}
r.Recorder.Event(ckpt, corev1.EventTypeNormal, "CheckpointReady", observation.Message)
now := metav1.Now()
ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseReady
......@@ -403,275 +292,34 @@ func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiac
meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted),
Status: metav1.ConditionTrue,
Reason: "JobSucceeded",
Message: fmt.Sprintf("Checkpoint job completed, available at %s", ckpt.Status.Location),
Reason: observation.Reason,
Message: observation.Message,
LastTransitionTime: metav1.Now(),
})
if err := r.Status().Update(ctx, ckpt); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
if jobFailed {
logger.Info("Checkpoint Job failed", "job", job.Name)
r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", "Checkpoint creation failed")
case checkpointjob.ObservationPhaseFailed:
logger.Info("Checkpoint Job failed", "job", job.Name, "message", observation.Message)
r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", observation.Message)
ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseFailed
ckpt.Status.Message = "Checkpoint job failed"
ckpt.Status.Message = observation.Message
meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{
Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted),
Status: metav1.ConditionFalse,
Reason: "JobFailed",
Message: "Checkpoint job failed",
Reason: observation.Reason,
Message: observation.Message,
LastTransitionTime: metav1.Now(),
})
if err := r.Status().Update(ctx, ckpt); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
default:
return ctrl.Result{}, nil
}
// Job is still running - we'll be notified via Update event when status changes
return ctrl.Result{}, nil
}
// buildCheckpointWorkerDefaultEnv returns the environment of the default worker
// base container for a checkpoint pod template.
//
// The component context starts from the worker component type, the global
// Dynamo namespace and the Kubernetes discovery backend, and is refined from
// podTemplate's labels: a non-empty Dynamo namespace label overrides the
// namespace, and a component-type label overrides the component type only when
// dynamo.IsWorkerComponent accepts it. The parent graph deployment namespace is
// ckpt.Namespace.
func (r *CheckpointReconciler) buildCheckpointWorkerDefaultEnv(
	ckpt *nvidiacomv1alpha1.DynamoCheckpoint,
	podTemplate *corev1.PodTemplateSpec,
) []corev1.EnvVar {
	labels := podTemplate.Labels

	workerCtx := dynamo.ComponentContext{
		ComponentType:                  consts.ComponentTypeWorker,
		DynamoNamespace:                consts.GlobalDynamoNamespace,
		ParentGraphDeploymentName:      labels[consts.KubeLabelDynamoGraphDeploymentName],
		ParentGraphDeploymentNamespace: ckpt.Namespace,
		DiscoveryBackend:               configv1alpha1.DiscoveryBackendKubernetes,
		WorkerHashSuffix:               labels[consts.KubeLabelDynamoWorkerHash],
	}
	if ns := labels[consts.KubeLabelDynamoNamespace]; ns != "" {
		workerCtx.DynamoNamespace = ns
	}
	if kind := labels[consts.KubeLabelDynamoComponentType]; kind != "" && dynamo.IsWorkerComponent(kind) {
		workerCtx.ComponentType = kind
	}

	// NOTE(review): the error from GetBaseContainer is discarded, as before;
	// confirm it cannot fail for a worker context.
	baseContainer, _ := dynamo.NewWorkerDefaults().GetBaseContainer(workerCtx)
	return baseContainer.Env
}
// buildCheckpointJob renders the batch Job that runs ckpt's pod template in
// checkpoint mode.
//
// Working on a deep copy of ckpt.Spec.Job.PodTemplateSpec, it:
//   - stamps checkpoint source metadata (identity hash, location, storage type)
//     onto the pod labels and annotations;
//   - adds the pod-info downward API volume unless one with that name exists;
//   - for the first container (if any): merges worker default env under the
//     container's own env, adds the standard env vars and the
//     ready-for-checkpoint file path, prefixes the command with
//     "cuda-checkpoint --launch-job" when the NVIDIA GPU limit is greater than
//     one, replaces the readiness probe with a `cat` of the ready file, drops
//     the liveness and startup probes, mounts the pod-info volume read-only,
//     and applies the shared-memory volume and mount;
//   - forces RestartPolicy=Never and the localhost seccomp profile.
//
// The Job is named jobName in ckpt.Namespace, has BackoffLimit 0, and falls
// back to a 3600 second active deadline and a 300 second TTL after finishing
// when the spec leaves those unset.
//
// NOTE(review): errors from ComputeIdentityHash and ResolveCheckpointStorage
// are swallowed here (an empty location and storage type are used on failure);
// confirm callers validate these beforehand.
func (r *CheckpointReconciler) buildCheckpointJob(ckpt *nvidiacomv1alpha1.DynamoCheckpoint, jobName string) *batchv1.Job {
	// Work on a copy so the template stored on the DynamoCheckpoint is untouched.
	tmpl := ckpt.Spec.Job.PodTemplateSpec.DeepCopy()

	identityHash := ckpt.Status.IdentityHash
	if identityHash == "" {
		identityHash, _ = checkpoint.ComputeIdentityHash(ckpt.Spec.Identity)
	}

	// Checkpoint source labels and annotations.
	if tmpl.Labels == nil {
		tmpl.Labels = map[string]string{}
	}
	if tmpl.Annotations == nil {
		tmpl.Annotations = map[string]string{}
	}
	location, storageType, err := checkpoint.ResolveCheckpointStorage(
		identityHash,
		desiredArtifactVersion(ckpt),
		&r.Config.Checkpoint,
	)
	if err != nil {
		location, storageType = "", ""
	}
	checkpoint.ApplyCheckpointSourcePodMetadata(tmpl.Labels, tmpl.Annotations, identityHash, location, storageType)

	// Pod-info downward API volume, added only when the template lacks one.
	podInfoVolumePresent := false
	for i := range tmpl.Spec.Volumes {
		if tmpl.Spec.Volumes[i].Name == consts.PodInfoVolumeName {
			podInfoVolumePresent = true
			break
		}
	}
	if !podInfoVolumePresent {
		// fieldFile projects an arbitrary pod field into the volume at path.
		fieldFile := func(path, fieldPath string) corev1.DownwardAPIVolumeFile {
			return corev1.DownwardAPIVolumeFile{
				Path: path,
				FieldRef: &corev1.ObjectFieldSelector{
					APIVersion: "v1",
					FieldPath:  fieldPath,
				},
			}
		}
		// labelFile projects the value of a pod label into the volume at path.
		labelFile := func(path, label string) corev1.DownwardAPIVolumeFile {
			return fieldFile(path, "metadata.labels['"+label+"']")
		}

		tmpl.Spec.Volumes = append(tmpl.Spec.Volumes, corev1.Volume{
			Name: consts.PodInfoVolumeName,
			VolumeSource: corev1.VolumeSource{
				DownwardAPI: &corev1.DownwardAPIVolumeSource{
					Items: []corev1.DownwardAPIVolumeFile{
						labelFile(consts.PodInfoFileDynNamespace, consts.KubeLabelDynamoNamespace),
						labelFile(consts.PodInfoFileDynNamespaceWorkerSuffix, consts.KubeLabelDynamoWorkerHash),
						labelFile(consts.PodInfoFileDynComponent, consts.KubeLabelDynamoComponentType),
						labelFile(consts.PodInfoFileDynParentDGDName, consts.KubeLabelDynamoGraphDeploymentName),
						fieldFile(consts.PodInfoFileDynParentDGDNamespace, "metadata.namespace"),
						fieldFile("pod_name", consts.PodInfoFieldPodName),
						fieldFile("pod_uid", consts.PodInfoFieldPodUID),
						fieldFile("pod_namespace", consts.PodInfoFieldPodNamespace),
					},
				},
			},
		})
	}

	// Put the first container into checkpoint mode.
	if len(tmpl.Spec.Containers) > 0 {
		worker := &tmpl.Spec.Containers[0]

		// Manual checkpoints begin from a raw pod template: lay the worker
		// runtime defaults underneath whatever env the template already sets,
		// then add the checkpoint-specific variables on top.
		worker.Env = dynamo.MergeEnvs(r.buildCheckpointWorkerDefaultEnv(ckpt, tmpl), worker.Env)
		dynamo.AddStandardEnvVars(worker, r.Config)

		readyFile := r.Config.Checkpoint.ReadyForCheckpointFilePath
		worker.Env = append(worker.Env, corev1.EnvVar{
			Name:  consts.EnvReadyForCheckpointFile,
			Value: readyFile,
		})

		// More than one GPU requested: wrap the command with cuda-checkpoint.
		gpuLimit, hasGPULimit := worker.Resources.Limits[corev1.ResourceName(consts.KubeResourceGPUNvidia)]
		if hasGPULimit && gpuLimit.Cmp(*resource.NewQuantity(1, resource.DecimalSI)) > 0 {
			worker.Command = append([]string{"cuda-checkpoint", "--launch-job"}, worker.Command...)
		}

		// Readiness means "the ready file exists", i.e. the model has loaded.
		// Liveness and startup probes are dropped so that slow model loading
		// cannot trigger a restart.
		worker.ReadinessProbe = &corev1.Probe{
			ProbeHandler: corev1.ProbeHandler{
				Exec: &corev1.ExecAction{
					Command: []string{"cat", readyFile},
				},
			},
			InitialDelaySeconds: 15,
			PeriodSeconds:       2,
		}
		worker.LivenessProbe = nil
		worker.StartupProbe = nil

		podInfoMounted := false
		for i := range worker.VolumeMounts {
			if worker.VolumeMounts[i].Name == consts.PodInfoVolumeName {
				podInfoMounted = true
				break
			}
		}
		if !podInfoMounted {
			worker.VolumeMounts = append(worker.VolumeMounts, corev1.VolumeMount{
				Name:      consts.PodInfoVolumeName,
				MountPath: consts.PodInfoMountPath,
				ReadOnly:  true,
			})
		}

		dynamo.ApplySharedMemoryVolumeAndMount(&tmpl.Spec, worker, ckpt.Spec.Job.SharedMemory)
	}

	// Jobs must not restart their pods in place.
	tmpl.Spec.RestartPolicy = corev1.RestartPolicyNever

	// CRIU cannot handle io_uring memory mappings, so the pod runs under a
	// localhost seccomp profile that blocks the io_uring syscalls.
	if tmpl.Spec.SecurityContext == nil {
		tmpl.Spec.SecurityContext = &corev1.PodSecurityContext{}
	}
	tmpl.Spec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{
		Type:             corev1.SeccompProfileTypeLocalhost,
		LocalhostProfile: ptr.To(consts.SeccompProfilePath),
	}

	deadline := ckpt.Spec.Job.ActiveDeadlineSeconds
	if deadline == nil {
		deadline = ptr.To(int64(3600)) // 1 hour
	}
	ttlAfterFinished := ckpt.Spec.Job.TTLSecondsAfterFinished
	if ttlAfterFinished == nil {
		ttlAfterFinished = ptr.To(int32(300)) // 5 minutes
	}

	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      jobName,
			Namespace: ckpt.Namespace,
			Labels: map[string]string{
				consts.KubeLabelCheckpointHash: identityHash,
			},
		},
		Spec: batchv1.JobSpec{
			ActiveDeadlineSeconds: deadline,
			// A single attempt keeps the snapshot-agent status terminal.
			BackoffLimit:            ptr.To[int32](0),
			TTLSecondsAfterFinished: ttlAfterFinished,
			Template:                *tmpl,
		},
	}
}
// SetupWithManager sets up the controller with the Manager.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment