"vscode:/vscode.git/clone" did not exist on "69e44e98dbbf05c8a69470dec4dff286cdd01358"
Unverified commit 43e810a4, authored by Schwinn Saereesitthipitak and committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
...@@ -21,7 +21,8 @@ import ( ...@@ -21,7 +21,8 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
) )
// Checkpoint storage type constants // Checkpoint storage type constants retained for compatibility with older
// operator configuration files.
const ( const (
CheckpointStorageTypePVC = "pvc" CheckpointStorageTypePVC = "pvc"
CheckpointStorageTypeS3 = "s3" CheckpointStorageTypeS3 = "s3"
...@@ -245,46 +246,49 @@ type CheckpointConfiguration struct { ...@@ -245,46 +246,49 @@ type CheckpointConfiguration struct {
// ReadyForCheckpointFilePath signals model readiness for checkpoint jobs // ReadyForCheckpointFilePath signals model readiness for checkpoint jobs
// +kubebuilder:default="/tmp/ready-for-checkpoint" // +kubebuilder:default="/tmp/ready-for-checkpoint"
ReadyForCheckpointFilePath string `json:"readyForCheckpointFilePath"` ReadyForCheckpointFilePath string `json:"readyForCheckpointFilePath"`
// Storage holds storage backend configuration // Deprecated: Storage is retained for compatibility and ignored by the
// current snapshot flow. Snapshot storage is discovered from the
// snapshot-agent DaemonSet instead.
Storage CheckpointStorageConfiguration `json:"storage"` Storage CheckpointStorageConfiguration `json:"storage"`
} }
// CheckpointStorageConfiguration holds storage backend configuration for checkpoints. // Deprecated: CheckpointStorageConfiguration is retained for compatibility and
// ignored by the current snapshot flow.
type CheckpointStorageConfiguration struct { type CheckpointStorageConfiguration struct {
// Type is the storage backend type: pvc, s3, or oci // Type is the legacy storage backend type: pvc, s3, or oci.
// +kubebuilder:default="pvc"
Type string `json:"type"` Type string `json:"type"`
// PVC configuration (used when Type=pvc) // PVC configuration for legacy pvc-based settings.
PVC CheckpointPVCConfig `json:"pvc"` PVC CheckpointPVCConfig `json:"pvc"`
// S3 configuration (used when Type=s3) // S3 configuration for legacy s3-based settings.
S3 CheckpointS3Config `json:"s3"` S3 CheckpointS3Config `json:"s3"`
// OCI configuration (used when Type=oci) // OCI configuration for legacy oci-based settings.
OCI CheckpointOCIConfig `json:"oci"` OCI CheckpointOCIConfig `json:"oci"`
} }
// CheckpointPVCConfig holds PVC storage configuration. // Deprecated: CheckpointPVCConfig is retained for compatibility and ignored by
// the current snapshot flow.
type CheckpointPVCConfig struct { type CheckpointPVCConfig struct {
// PVCName is the name of the PVC // PVCName is the legacy PVC name.
// +kubebuilder:default="snapshot-pvc"
PVCName string `json:"pvcName"` PVCName string `json:"pvcName"`
// BasePath is the base directory within the PVC // BasePath is the legacy base directory within the PVC.
// +kubebuilder:default="/checkpoints"
BasePath string `json:"basePath"` BasePath string `json:"basePath"`
} }
// CheckpointS3Config holds S3 storage configuration. // Deprecated: CheckpointS3Config is retained for compatibility and ignored by
// the current snapshot flow.
type CheckpointS3Config struct { type CheckpointS3Config struct {
// URI is the S3 URI (s3://[endpoint/]bucket/prefix) // URI is the legacy S3 URI (s3://[endpoint/]bucket/prefix).
URI string `json:"uri"` URI string `json:"uri"`
// CredentialsSecretRef is the name of the credentials secret // CredentialsSecretRef is the legacy credentials secret name.
CredentialsSecretRef string `json:"credentialsSecretRef"` CredentialsSecretRef string `json:"credentialsSecretRef"`
} }
// CheckpointOCIConfig holds OCI registry storage configuration. // Deprecated: CheckpointOCIConfig is retained for compatibility and ignored by
// the current snapshot flow.
type CheckpointOCIConfig struct { type CheckpointOCIConfig struct {
// URI is the OCI URI (oci://registry/repository) // URI is the legacy OCI URI (oci://registry/repository).
URI string `json:"uri"` URI string `json:"uri"`
// CredentialsSecretRef is the name of the docker config secret // CredentialsSecretRef is the legacy docker config secret name.
CredentialsSecretRef string `json:"credentialsSecretRef"` CredentialsSecretRef string `json:"credentialsSecretRef"`
} }
......
...@@ -37,7 +37,6 @@ func ValidateOperatorConfiguration(config *configv1alpha1.OperatorConfiguration) ...@@ -37,7 +37,6 @@ func ValidateOperatorConfiguration(config *configv1alpha1.OperatorConfiguration)
allErrs = append(allErrs, validateMPI(&config.MPI, field.NewPath("mpi"))...) allErrs = append(allErrs, validateMPI(&config.MPI, field.NewPath("mpi"))...)
allErrs = append(allErrs, validateInfrastructure(&config.Infrastructure, field.NewPath("infrastructure"))...) allErrs = append(allErrs, validateInfrastructure(&config.Infrastructure, field.NewPath("infrastructure"))...)
allErrs = append(allErrs, validateDiscovery(&config.Discovery, field.NewPath("discovery"))...) allErrs = append(allErrs, validateDiscovery(&config.Discovery, field.NewPath("discovery"))...)
allErrs = append(allErrs, validateCheckpoint(&config.Checkpoint, field.NewPath("checkpoint"))...)
allErrs = append(allErrs, validateRBAC(config)...) allErrs = append(allErrs, validateRBAC(config)...)
allErrs = append(allErrs, validateOrchestrators(&config.Orchestrators, field.NewPath("orchestrators"))...) allErrs = append(allErrs, validateOrchestrators(&config.Orchestrators, field.NewPath("orchestrators"))...)
allErrs = append(allErrs, validateIngress(&config.Ingress, field.NewPath("ingress"))...) allErrs = append(allErrs, validateIngress(&config.Ingress, field.NewPath("ingress"))...)
...@@ -127,33 +126,6 @@ func validateDiscovery(discovery *configv1alpha1.DiscoveryConfiguration, fldPath ...@@ -127,33 +126,6 @@ func validateDiscovery(discovery *configv1alpha1.DiscoveryConfiguration, fldPath
return allErrs return allErrs
} }
// validateCheckpoint validates the storage section of the checkpoint
// configuration. Nothing is checked while checkpointing is disabled. When it
// is enabled, the storage type must be pvc, s3, or oci; the s3 and oci types
// additionally require a non-empty URI, whereas pvc has no required fields.
// Reported errors are rooted at fldPath.Child("storage").
func validateCheckpoint(checkpoint *configv1alpha1.CheckpointConfiguration, fldPath *field.Path) field.ErrorList {
	if !checkpoint.Enabled {
		return field.ErrorList{}
	}

	storage := &checkpoint.Storage
	base := fldPath.Child("storage")

	// Every branch yields at most one problem, so track it as a single value
	// and build the list once at the end.
	var problem *field.Error
	switch storage.Type {
	case configv1alpha1.CheckpointStorageTypeS3:
		if storage.S3.URI == "" {
			problem = field.Required(base.Child("s3", "uri"), "S3 URI is required when storage type is s3")
		}
	case configv1alpha1.CheckpointStorageTypeOCI:
		if storage.OCI.URI == "" {
			problem = field.Required(base.Child("oci", "uri"), "OCI URI is required when storage type is oci")
		}
	case configv1alpha1.CheckpointStorageTypePVC:
		// PVC needs no additional fields.
	default:
		supported := []string{
			configv1alpha1.CheckpointStorageTypePVC,
			configv1alpha1.CheckpointStorageTypeS3,
			configv1alpha1.CheckpointStorageTypeOCI,
		}
		problem = field.NotSupported(base.Child("type"), storage.Type, supported)
	}

	if problem == nil {
		return field.ErrorList{}
	}
	return field.ErrorList{problem}
}
// validateRBAC is mode-aware: validates RBAC fields based on namespace mode. // validateRBAC is mode-aware: validates RBAC fields based on namespace mode.
func validateRBAC(config *configv1alpha1.OperatorConfiguration) field.ErrorList { func validateRBAC(config *configv1alpha1.OperatorConfiguration) field.ErrorList {
allErrs := field.ErrorList{} allErrs := field.ErrorList{}
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
package validation package validation
import ( import (
"encoding/json"
"testing" "testing"
"time" "time"
...@@ -31,7 +32,10 @@ func validConfig() *configv1alpha1.OperatorConfiguration { ...@@ -31,7 +32,10 @@ func validConfig() *configv1alpha1.OperatorConfiguration {
configv1alpha1.SetDefaultsOperatorConfiguration(cfg) configv1alpha1.SetDefaultsOperatorConfiguration(cfg)
cfg.MPI.SSHSecretName = "mpi-ssh" cfg.MPI.SSHSecretName = "mpi-ssh"
cfg.MPI.SSHSecretNamespace = "default" cfg.MPI.SSHSecretNamespace = "default"
// Cluster-wide validation requires chart-provided RBAC names.
cfg.RBAC.PlannerClusterRoleName = "planner-role" cfg.RBAC.PlannerClusterRoleName = "planner-role"
cfg.RBAC.DGDRProfilingClusterRoleName = "dgdr-profiling-role"
cfg.RBAC.EPPClusterRoleName = "epp-role"
return cfg return cfg
} }
...@@ -41,6 +45,8 @@ func validNamespaceScopedConfig() *configv1alpha1.OperatorConfiguration { ...@@ -41,6 +45,8 @@ func validNamespaceScopedConfig() *configv1alpha1.OperatorConfiguration {
cfg.Namespace.Restricted = "my-namespace" cfg.Namespace.Restricted = "my-namespace"
// RBAC not required in namespace mode // RBAC not required in namespace mode
cfg.RBAC.PlannerClusterRoleName = "" cfg.RBAC.PlannerClusterRoleName = ""
cfg.RBAC.DGDRProfilingClusterRoleName = ""
cfg.RBAC.EPPClusterRoleName = ""
return cfg return cfg
} }
...@@ -120,45 +126,42 @@ func TestValidateOperatorConfiguration_NamespaceScopedLeaseRenewExceedsDuration( ...@@ -120,45 +126,42 @@ func TestValidateOperatorConfiguration_NamespaceScopedLeaseRenewExceedsDuration(
} }
} }
func TestValidateOperatorConfiguration_CheckpointS3MissingURI(t *testing.T) { func TestValidateOperatorConfiguration_CheckpointEnabledRequiresNoStorageConfig(t *testing.T) {
cfg := validConfig() cfg := validConfig()
cfg.Checkpoint.Enabled = true cfg.Checkpoint.Enabled = true
cfg.Checkpoint.Storage.Type = configv1alpha1.CheckpointStorageTypeS3
cfg.Checkpoint.Storage.S3.URI = ""
errs := ValidateOperatorConfiguration(cfg) errs := ValidateOperatorConfiguration(cfg)
if len(errs) != 1 { if len(errs) != 0 {
t.Errorf("expected 1 error for missing S3 URI, got %d: %v", len(errs), errs) t.Errorf("expected no errors for checkpoint config without storage settings, got: %v", errs)
} }
} }
func TestValidateOperatorConfiguration_CheckpointOCIMissingURI(t *testing.T) { func TestValidateOperatorConfiguration_CheckpointDeprecatedStorageConfigIsAccepted(t *testing.T) {
cfg := validConfig() cfg := validConfig()
cfg.Checkpoint.Enabled = true rawConfig := []byte(`{
cfg.Checkpoint.Storage.Type = configv1alpha1.CheckpointStorageTypeOCI "checkpoint": {
cfg.Checkpoint.Storage.OCI.URI = "" "enabled": true,
"storage": {
errs := ValidateOperatorConfiguration(cfg) "type": "s3",
if len(errs) != 1 { "s3": {
t.Errorf("expected 1 error for missing OCI URI, got %d: %v", len(errs), errs) "uri": "s3://legacy-bucket/checkpoints"
}
}
}
}`)
if err := json.Unmarshal(rawConfig, cfg); err != nil {
t.Fatalf("failed to unmarshal compatibility config: %v", err)
} }
}
func TestValidateOperatorConfiguration_CheckpointInvalidStorageType(t *testing.T) {
cfg := validConfig()
cfg.Checkpoint.Enabled = true
cfg.Checkpoint.Storage.Type = "nfs"
errs := ValidateOperatorConfiguration(cfg) errs := ValidateOperatorConfiguration(cfg)
if len(errs) != 1 { if len(errs) != 0 {
t.Errorf("expected 1 error for invalid storage type, got %d: %v", len(errs), errs) t.Errorf("expected no errors for deprecated checkpoint storage config, got: %v", errs)
} }
} }
func TestValidateOperatorConfiguration_CheckpointDisabledSkipsValidation(t *testing.T) { func TestValidateOperatorConfiguration_CheckpointDisabledSkipsValidation(t *testing.T) {
cfg := validConfig() cfg := validConfig()
cfg.Checkpoint.Enabled = false cfg.Checkpoint.Enabled = false
cfg.Checkpoint.Storage.Type = "invalid"
errs := ValidateOperatorConfiguration(cfg) errs := ValidateOperatorConfiguration(cfg)
if len(errs) != 0 { if len(errs) != 0 {
......
...@@ -31,13 +31,15 @@ const ( ...@@ -31,13 +31,15 @@ const (
DynamoCheckpointPhasePending DynamoCheckpointPhase = "Pending" DynamoCheckpointPhasePending DynamoCheckpointPhase = "Pending"
// DynamoCheckpointPhaseCreating indicates the checkpoint Job is running // DynamoCheckpointPhaseCreating indicates the checkpoint Job is running
DynamoCheckpointPhaseCreating DynamoCheckpointPhase = "Creating" DynamoCheckpointPhaseCreating DynamoCheckpointPhase = "Creating"
// DynamoCheckpointPhaseReady indicates the checkpoint tar file is available on the PVC // DynamoCheckpointPhaseReady indicates the checkpoint artifact is available
DynamoCheckpointPhaseReady DynamoCheckpointPhase = "Ready" DynamoCheckpointPhaseReady DynamoCheckpointPhase = "Ready"
// DynamoCheckpointPhaseFailed indicates the checkpoint creation failed // DynamoCheckpointPhaseFailed indicates the checkpoint creation failed
DynamoCheckpointPhaseFailed DynamoCheckpointPhase = "Failed" DynamoCheckpointPhaseFailed DynamoCheckpointPhase = "Failed"
) )
// DynamoCheckpointStorageType defines the supported storage backends for checkpoints // Deprecated: StorageType is retained for compatibility with older
// DynamoCheckpoint status consumers. The current checkpoint flow publishes
// PVC-backed artifacts discovered from the snapshot-agent DaemonSet.
// +kubebuilder:validation:Enum=pvc;s3;oci // +kubebuilder:validation:Enum=pvc;s3;oci
type DynamoCheckpointStorageType string type DynamoCheckpointStorageType string
...@@ -109,10 +111,10 @@ type DynamoCheckpointJobConfig struct { ...@@ -109,10 +111,10 @@ type DynamoCheckpointJobConfig struct {
// +kubebuilder:validation:Minimum=0 // +kubebuilder:validation:Minimum=0
BackoffLimit *int32 `json:"backoffLimit,omitempty"` BackoffLimit *int32 `json:"backoffLimit,omitempty"`
// TTLSecondsAfterFinished specifies how long to keep the Job after completion // Deprecated: TTLSecondsAfterFinished is ignored. Checkpoint Jobs use a fixed
// 300 second TTL.
// +optional // +optional
// +kubebuilder:validation:Minimum=0 // +kubebuilder:validation:Minimum=0
// +kubebuilder:default=300
TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"` TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"`
} }
...@@ -148,14 +150,13 @@ type DynamoCheckpointStatus struct { ...@@ -148,14 +150,13 @@ type DynamoCheckpointStatus struct {
// +optional // +optional
IdentityHash string `json:"identityHash,omitempty"` IdentityHash string `json:"identityHash,omitempty"`
// Location is the full URI/path to the checkpoint in the storage backend // Deprecated: Location is ignored and no longer populated. It is retained
// For PVC: same as TarPath (e.g., /checkpoints/{hash}.tar) // only so older objects continue to validate.
// For S3: s3://bucket/prefix/{hash}.tar
// For OCI: oci://registry/repo:{hash}
// +optional // +optional
Location string `json:"location,omitempty"` Location string `json:"location,omitempty"`
// StorageType indicates the storage backend type used for this checkpoint // Deprecated: StorageType is ignored and no longer populated. It is retained
// only so older objects continue to validate.
// +optional // +optional
StorageType DynamoCheckpointStorageType `json:"storageType,omitempty"` StorageType DynamoCheckpointStorageType `json:"storageType,omitempty"`
...@@ -163,7 +164,7 @@ type DynamoCheckpointStatus struct { ...@@ -163,7 +164,7 @@ type DynamoCheckpointStatus struct {
// +optional // +optional
JobName string `json:"jobName,omitempty"` JobName string `json:"jobName,omitempty"`
// CreatedAt is the timestamp when the checkpoint tar was created // CreatedAt is the timestamp when the checkpoint became ready
// +optional // +optional
CreatedAt *metav1.Time `json:"createdAt,omitempty"` CreatedAt *metav1.Time `json:"createdAt,omitempty"`
......
...@@ -8173,8 +8173,9 @@ spec: ...@@ -8173,8 +8173,9 @@ spec:
- message: sharedMemory.size must not be set when sharedMemory.disabled is true - message: sharedMemory.size must not be set when sharedMemory.disabled is true
rule: '!(has(self.disabled) && self.disabled && has(self.size))' rule: '!(has(self.disabled) && self.disabled && has(self.size))'
ttlSecondsAfterFinished: ttlSecondsAfterFinished:
default: 300 description: |-
description: TTLSecondsAfterFinished specifies how long to keep the Job after completion Deprecated: TTLSecondsAfterFinished is ignored. Checkpoint Jobs use a fixed
300 second TTL.
format: int32 format: int32
minimum: 0 minimum: 0
type: integer type: integer
...@@ -8245,7 +8246,7 @@ spec: ...@@ -8245,7 +8246,7 @@ spec:
type: object type: object
type: array type: array
createdAt: createdAt:
description: CreatedAt is the timestamp when the checkpoint tar was created description: CreatedAt is the timestamp when the checkpoint became ready
format: date-time format: date-time
type: string type: string
identityHash: identityHash:
...@@ -8258,10 +8259,8 @@ spec: ...@@ -8258,10 +8259,8 @@ spec:
type: string type: string
location: location:
description: |- description: |-
Location is the full URI/path to the checkpoint in the storage backend Deprecated: Location is ignored and no longer populated. It is retained
For PVC: same as TarPath (e.g., /checkpoints/{hash}.tar) only so older objects continue to validate.
For S3: s3://bucket/prefix/{hash}.tar
For OCI: oci://registry/repo:{hash}
type: string type: string
message: message:
description: Message provides additional information about the current state description: Message provides additional information about the current state
...@@ -8275,7 +8274,9 @@ spec: ...@@ -8275,7 +8274,9 @@ spec:
- Failed - Failed
type: string type: string
storageType: storageType:
description: StorageType indicates the storage backend type used for this checkpoint description: |-
Deprecated: StorageType is ignored and no longer populated. It is retained
only so older objects continue to validate.
enum: enum:
- pvc - pvc
- s3 - s3
......
...@@ -38,6 +38,14 @@ rules: ...@@ -38,6 +38,14 @@ rules:
- get - get
- list - list
- watch - watch
- apiGroups:
- apps
resources:
- daemonsets
verbs:
- get
- list
- watch
- apiGroups: - apiGroups:
- apps - apps
resources: resources:
......
...@@ -16,46 +16,60 @@ ...@@ -16,46 +16,60 @@
apiVersion: nvidia.com/v1alpha1 apiVersion: nvidia.com/v1alpha1
kind: DynamoCheckpoint kind: DynamoCheckpoint
metadata: metadata:
name: llama3-8b-bf16 name: qwen3-06b-bf16
spec: spec:
# Identity - determines the checkpoint hash # Identity - determines the checkpoint hash
identity: identity:
model: "meta-llama/Meta-Llama-3-8B-Instruct" model: "Qwen/Qwen3-0.6B"
backendFramework: "vllm" backendFramework: "vllm"
dynamoVersion: "0.6.0"
tensorParallelSize: 1 tensorParallelSize: 1
pipelineParallelSize: 1 pipelineParallelSize: 1
dtype: "bfloat16" dtype: "bfloat16"
maxModelLen: 8192 maxModelLen: 2048
extraParameters:
enableChunkedPrefill: "true"
# Job configuration for checkpoint creation # Job configuration for checkpoint creation
job: job:
activeDeadlineSeconds: 3600 activeDeadlineSeconds: 3600
ttlSecondsAfterFinished: 300
podTemplateSpec: podTemplateSpec:
spec: spec:
restartPolicy: Never
imagePullSecrets:
- name: ngc-secret
volumes:
- name: hf-cache
persistentVolumeClaim:
claimName: hf-cache-pvc
containers: containers:
- name: checkpoint-worker - name: worker
image: nvcr.io/nvidia/ai-dynamo/dynamo-vllm:latest image: registry.example.com/dynamo/vllm-placeholder:1.0.0
command: ["python", "-m", "vllm.entrypoints.openai.api_server"] command:
- python3
args: args:
- "-m"
- "dynamo.vllm"
- "--model" - "--model"
- "meta-llama/Meta-Llama-3-8B-Instruct" - "Qwen/Qwen3-0.6B"
- "--tensor-parallel-size"
- "1"
- "--dtype" - "--dtype"
- "bfloat16" - "bfloat16"
- "--tensor-parallel-size"
- "1"
- "--max-model-len" - "--max-model-len"
- "8192" - "2048"
envFrom:
- secretRef:
name: hf-token-secret
env: env:
- name: HF_TOKEN - name: HF_HOME
valueFrom: value: /home/dynamo/.cache/huggingface
secretKeyRef: - name: NCCL_DEBUG
name: hf-secret value: ERROR
key: token - name: TORCH_CPP_LOG_LEVEL
value: ERROR
- name: TORCH_DISTRIBUTED_DEBUG
value: "OFF"
volumeMounts:
- name: hf-cache
mountPath: /home/dynamo/.cache/huggingface
resources: resources:
limits: limits:
nvidia.com/gpu: 1 nvidia.com/gpu: "1"
restartPolicy: Never
...@@ -5,6 +5,7 @@ go 1.25.0 ...@@ -5,6 +5,7 @@ go 1.25.0
require ( require (
emperror.dev/errors v0.8.1 emperror.dev/errors v0.8.1
github.com/Masterminds/semver/v3 v3.4.0 github.com/Masterminds/semver/v3 v3.4.0
github.com/ai-dynamo/dynamo/deploy/snapshot v0.0.0
github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.6 github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.6
github.com/bsm/gomega v1.27.10 github.com/bsm/gomega v1.27.10
github.com/go-logr/logr v1.4.3 github.com/go-logr/logr v1.4.3
...@@ -15,6 +16,8 @@ require ( ...@@ -15,6 +16,8 @@ require (
github.com/open-policy-agent/cert-controller v0.14.0 github.com/open-policy-agent/cert-controller v0.14.0
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2
github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_golang v1.23.2
github.com/prometheus/client_model v0.6.2
github.com/prometheus/common v0.67.5
github.com/stretchr/testify v1.11.1 github.com/stretchr/testify v1.11.1
golang.org/x/crypto v0.48.0 golang.org/x/crypto v0.48.0
istio.io/api v1.23.1 istio.io/api v1.23.1
...@@ -31,6 +34,8 @@ require ( ...@@ -31,6 +34,8 @@ require (
volcano.sh/apis v1.12.2 volcano.sh/apis v1.12.2
) )
replace github.com/ai-dynamo/dynamo/deploy/snapshot => ../snapshot
require ( require (
cel.dev/expr v0.25.1 // indirect cel.dev/expr v0.25.1 // indirect
github.com/antlr4-go/antlr/v4 v4.13.0 // indirect github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
...@@ -68,8 +73,6 @@ require ( ...@@ -68,8 +73,6 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.5 // indirect
github.com/prometheus/procfs v0.17.0 // indirect github.com/prometheus/procfs v0.17.0 // indirect
github.com/spf13/cobra v1.9.1 // indirect github.com/spf13/cobra v1.9.1 // indirect
github.com/spf13/pflag v1.0.10 // indirect github.com/spf13/pflag v1.0.10 // indirect
......
cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY=
cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw=
cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4=
cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4=
emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0= emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
...@@ -258,12 +256,8 @@ gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= ...@@ -258,12 +256,8 @@ gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 h1:fCvbg86sFXwdrl5LgVcTEvNC+2txB5mgROGmRL5mrls= google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 h1:fCvbg86sFXwdrl5LgVcTEvNC+2txB5mgROGmRL5mrls=
google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:+rXWjjaukWZun3mLfjmVnQi18E1AsFbDN9QdJ5YXLto= google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:+rXWjjaukWZun3mLfjmVnQi18E1AsFbDN9QdJ5YXLto=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 h1:Wgl1rcDNThT+Zn47YyCXOXyX/COgMTIdhJ717F0l4xk=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww= google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A=
google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c=
google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE=
google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
......
...@@ -21,11 +21,12 @@ import ( ...@@ -21,11 +21,12 @@ import (
"context" "context"
"testing" "testing"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime"
...@@ -39,19 +40,6 @@ const ( ...@@ -39,19 +40,6 @@ const (
testNamespace = "default" testNamespace = "default"
) )
// testPVCConfig returns an enabled checkpoint configuration whose storage
// type is pvc, using the "snapshot-pvc" claim with "/checkpoints" as the base
// path.
func testPVCConfig() *configv1alpha1.CheckpointConfiguration {
	cfg := &configv1alpha1.CheckpointConfiguration{Enabled: true}

	cfg.Storage.Type = configv1alpha1.CheckpointStorageTypePVC
	cfg.Storage.PVC.PVCName = "snapshot-pvc"
	cfg.Storage.PVC.BasePath = "/checkpoints"

	return cfg
}
func testIdentity() nvidiacomv1alpha1.DynamoCheckpointIdentity { func testIdentity() nvidiacomv1alpha1.DynamoCheckpointIdentity {
return nvidiacomv1alpha1.DynamoCheckpointIdentity{ return nvidiacomv1alpha1.DynamoCheckpointIdentity{
Model: "meta-llama/Llama-2-7b-hf", Model: "meta-llama/Llama-2-7b-hf",
...@@ -74,6 +62,7 @@ func testScheme() *runtime.Scheme { ...@@ -74,6 +62,7 @@ func testScheme() *runtime.Scheme {
s := runtime.NewScheme() s := runtime.NewScheme()
_ = nvidiacomv1alpha1.AddToScheme(s) _ = nvidiacomv1alpha1.AddToScheme(s)
_ = corev1.AddToScheme(s) _ = corev1.AddToScheme(s)
_ = appsv1.AddToScheme(s)
return s return s
} }
...@@ -81,6 +70,39 @@ func testInfo() *CheckpointInfo { ...@@ -81,6 +70,39 @@ func testInfo() *CheckpointInfo {
return &CheckpointInfo{Enabled: true, Hash: testHash} return &CheckpointInfo{Enabled: true, Hash: testHash}
} }
// testSnapshotAgentDaemonSet builds a DaemonSet fixture named "snapshot-agent"
// in testNamespace that carries the snapshot-agent label. Its pod template has
// one container, named with the snapshot-agent container name, that mounts the
// "checkpoints" volume at /checkpoints; that volume is backed by the
// "snapshot-pvc" PersistentVolumeClaim.
func testSnapshotAgentDaemonSet() *appsv1.DaemonSet {
	// The same volume name ties the container mount to the pod volume.
	volumeName := "checkpoints"

	agent := corev1.Container{
		Name: snapshotprotocol.SnapshotAgentContainerName,
		VolumeMounts: []corev1.VolumeMount{
			{Name: volumeName, MountPath: "/checkpoints"},
		},
	}

	pvcVolume := corev1.Volume{Name: volumeName}
	pvcVolume.PersistentVolumeClaim = &corev1.PersistentVolumeClaimVolumeSource{
		ClaimName: "snapshot-pvc",
	}

	ds := &appsv1.DaemonSet{}
	ds.Name = "snapshot-agent"
	ds.Namespace = testNamespace
	ds.Labels = map[string]string{
		snapshotprotocol.SnapshotAgentLabelKey: snapshotprotocol.SnapshotAgentLabelValue,
	}
	ds.Spec.Template.Spec.Containers = []corev1.Container{agent}
	ds.Spec.Template.Spec.Volumes = []corev1.Volume{pvcVolume}

	return ds
}
type createHookClient struct { type createHookClient struct {
client.Client client.Client
onCreate func(ctx context.Context, obj client.Object) error onCreate func(ctx context.Context, obj client.Object) error
...@@ -97,71 +119,6 @@ func (c *createHookClient) Create(ctx context.Context, obj client.Object, opts . ...@@ -97,71 +119,6 @@ func (c *createHookClient) Create(ctx context.Context, obj client.Object, opts .
return c.Client.Create(ctx, obj, opts...) return c.Client.Create(ctx, obj, opts...)
} }
// --- Resource helper tests ---
// TestHelpers checks checkpointInfoFromObject against a checkpoint named after
// its identity hash: in the Ready phase the returned info must be enabled and
// ready and must echo the hash, location, and checkpoint name; in the Creating
// phase it must not be ready.
func TestHelpers(t *testing.T) {
	wantHash, err := ComputeIdentityHash(testIdentity())
	require.NoError(t, err)
	wantLocation := "/checkpoints/" + wantHash

	checkpoint := &nvidiacomv1alpha1.DynamoCheckpoint{}
	checkpoint.Name = wantHash
	checkpoint.Spec.Identity = testIdentity()
	checkpoint.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseReady
	checkpoint.Status.IdentityHash = wantHash
	checkpoint.Status.Location = wantLocation
	checkpoint.Status.StorageType = "pvc"

	// Ready phase: every field of the info is populated from the object.
	readyInfo, err := checkpointInfoFromObject(checkpoint)
	require.NoError(t, err)
	assert.True(t, readyInfo.Enabled)
	assert.True(t, readyInfo.Ready)
	assert.Equal(t, wantHash, readyInfo.Hash)
	assert.Equal(t, wantLocation, readyInfo.Location)
	assert.Equal(t, checkpoint.Name, readyInfo.CheckpointName)

	// Creating phase: the same object must no longer be reported as ready.
	checkpoint.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseCreating
	creatingInfo, err := checkpointInfoFromObject(checkpoint)
	require.NoError(t, err)
	assert.False(t, creatingInfo.Ready)
}
// TestArtifactVersionHelpers exercises the artifact-version annotation on a
// DynamoCheckpoint and the "checkpoint-job-<hash>-<version>" string derived
// from it.
func TestArtifactVersionHelpers(t *testing.T) {
	jobPrefix := "checkpoint-job-" + testHash + "-"

	t.Run("new checkpoints default to version 1", func(t *testing.T) {
		fresh := &nvidiacomv1alpha1.DynamoCheckpoint{}
		assert.Nil(t, fresh.Annotations)

		// NOTE(review): expected and actual are the same expression, so this
		// assertion holds trivially and does not call any helper — confirm
		// which function it was meant to cover.
		defaultJobName := jobPrefix + consts.DefaultCheckpointArtifactVersion
		assert.Equal(t, defaultJobName, defaultJobName)
	})

	t.Run("annotation overrides desired version", func(t *testing.T) {
		annotated := &nvidiacomv1alpha1.DynamoCheckpoint{}
		annotated.Annotations = map[string]string{
			consts.KubeAnnotationCheckpointArtifactVersion: "3",
		}

		version := annotated.Annotations[consts.KubeAnnotationCheckpointArtifactVersion]
		assert.Equal(t, "3", version)
		assert.Equal(t, jobPrefix+"3", jobPrefix+version)
	})
}
// TestResolveCheckpointStorage checks that, for the PVC test configuration,
// ResolveCheckpointStorage returns the "pvc" storage type and a location of
// the form /checkpoints/<hash>/versions/<version>. An empty version argument
// is expected to resolve to the default artifact version.
func TestResolveCheckpointStorage(t *testing.T) {
	cfg := testPVCConfig()

	cases := []struct {
		version     string // version argument passed in
		wantVersion string // version expected in the resolved location
	}{
		{version: "", wantVersion: consts.DefaultCheckpointArtifactVersion},
		{version: "7", wantVersion: "7"},
	}

	for _, tc := range cases {
		location, storageType, err := ResolveCheckpointStorage(testHash, tc.version, cfg)
		require.NoError(t, err)
		assert.Equal(t, "/checkpoints/"+testHash+"/versions/"+tc.wantVersion, location)
		assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointStorageType("pvc"), storageType)
	}
}
func TestCreateOrGetAutoCheckpointDeduplicatesConcurrentSameHashCheckpoint(t *testing.T) { func TestCreateOrGetAutoCheckpointDeduplicatesConcurrentSameHashCheckpoint(t *testing.T) {
ctx := context.Background() ctx := context.Background()
s := testScheme() s := testScheme()
...@@ -175,7 +132,7 @@ func TestCreateOrGetAutoCheckpointDeduplicatesConcurrentSameHashCheckpoint(t *te ...@@ -175,7 +132,7 @@ func TestCreateOrGetAutoCheckpointDeduplicatesConcurrentSameHashCheckpoint(t *te
Name: "friendly-checkpoint", Name: "friendly-checkpoint",
Namespace: testNamespace, Namespace: testNamespace,
Labels: map[string]string{ Labels: map[string]string{
consts.KubeLabelCheckpointHash: hash, consts.KubeLabelCheckpointID: hash,
}, },
}, },
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{ Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{
...@@ -223,184 +180,59 @@ func TestCreateOrGetAutoCheckpointSetsDefaultArtifactVersion(t *testing.T) { ...@@ -223,184 +180,59 @@ func TestCreateOrGetAutoCheckpointSetsDefaultArtifactVersion(t *testing.T) {
assert.Equal(t, consts.DefaultCheckpointArtifactVersion, ckpt.Annotations[consts.KubeAnnotationCheckpointArtifactVersion]) assert.Equal(t, consts.DefaultCheckpointArtifactVersion, ckpt.Annotations[consts.KubeAnnotationCheckpointArtifactVersion])
} }
// --- Injection idempotency tests ---
// TestInjectionIdempotency verifies that the volume and mount injectors do not
// append a second entry when one with the same name is already present.
func TestInjectionIdempotency(t *testing.T) {
	// A pod spec that already carries both volumes must keep exactly two.
	spec := &corev1.PodSpec{
		Volumes: []corev1.Volume{
			{Name: consts.CheckpointVolumeName},
			{Name: consts.PodInfoVolumeName},
		},
	}
	InjectCheckpointVolume(spec, "snapshot-pvc")
	InjectPodInfoVolume(spec)
	assert.Len(t, spec.Volumes, 2)

	// Likewise, a container that already has both mounts must keep exactly two.
	ctr := &corev1.Container{
		VolumeMounts: []corev1.VolumeMount{
			{Name: consts.CheckpointVolumeName},
			{Name: consts.PodInfoVolumeName},
		},
	}
	InjectCheckpointVolumeMount(ctr, "/checkpoints")
	InjectPodInfoVolumeMount(ctr)
	assert.Len(t, ctr.VolumeMounts, 2)
}
func TestApplyCheckpointPodMetadata(t *testing.T) {
t.Run("checkpoint source metadata uses annotations for location and storage", func(t *testing.T) {
labels := map[string]string{}
annotations := map[string]string{}
ApplyCheckpointSourcePodMetadata(labels, annotations, testHash, "/checkpoints/"+testHash, "pvc")
assert.Equal(t, consts.KubeLabelValueTrue, labels[consts.KubeLabelIsCheckpointSource])
assert.Equal(t, testHash, labels[consts.KubeLabelCheckpointHash])
assert.Equal(t, "/checkpoints/"+testHash, annotations[consts.KubeAnnotationCheckpointLocation])
assert.Equal(t, "pvc", annotations[consts.KubeAnnotationCheckpointStorageType])
})
t.Run("restore metadata clears stale values when checkpoint is not ready", func(t *testing.T) {
labels := map[string]string{
consts.KubeLabelIsRestoreTarget: consts.KubeLabelValueTrue,
consts.KubeLabelCheckpointHash: "stale-hash",
}
annotations := map[string]string{
consts.KubeAnnotationCheckpointLocation: "/checkpoints/stale-hash",
consts.KubeAnnotationCheckpointStorageType: "pvc",
}
ApplyRestorePodMetadata(labels, annotations, &CheckpointInfo{Enabled: true, Ready: false})
_, hasRestoreTarget := labels[consts.KubeLabelIsRestoreTarget]
_, hasCheckpointHash := labels[consts.KubeLabelCheckpointHash]
_, hasLocation := annotations[consts.KubeAnnotationCheckpointLocation]
_, hasStorageType := annotations[consts.KubeAnnotationCheckpointStorageType]
assert.False(t, hasRestoreTarget)
assert.False(t, hasCheckpointHash)
assert.False(t, hasLocation)
assert.False(t, hasStorageType)
})
}
// --- InjectCheckpointIntoPodSpec tests --- // --- InjectCheckpointIntoPodSpec tests ---
func TestInjectCheckpointIntoPodSpec(t *testing.T) { func TestInjectCheckpointIntoPodSpec(t *testing.T) {
t.Run("nil or disabled info is a no-op", func(t *testing.T) { t.Run("ready checkpoint injects podinfo and overrides command", func(t *testing.T) {
for _, info := range []*CheckpointInfo{nil, {Enabled: false}} {
podSpec := testPodSpec()
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, testPVCConfig()))
assert.Equal(t, []string{"python3"}, podSpec.Containers[0].Command)
}
})
t.Run("ready checkpoint overrides command to sleep infinity", func(t *testing.T) {
podSpec := testPodSpec() podSpec := testPodSpec()
info := &CheckpointInfo{Enabled: true, Ready: true, Hash: testHash} info := &CheckpointInfo{Enabled: true, Ready: true, Identity: ptr.To(testIdentity())}
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, testPVCConfig())) reader := fake.NewClientBuilder().WithScheme(testScheme()).WithObjects(testSnapshotAgentDaemonSet()).Build()
require.NoError(t, InjectCheckpointIntoPodSpec(context.Background(), reader, testNamespace, podSpec, info))
assert.Equal(t, []string{"sleep", "infinity"}, podSpec.Containers[0].Command) assert.Equal(t, []string{"sleep", "infinity"}, podSpec.Containers[0].Command)
assert.Nil(t, podSpec.Containers[0].Args) assert.Nil(t, podSpec.Containers[0].Args)
}) assert.Len(t, info.Hash, 16)
t.Run("ready checkpoint preserves published versioned location", func(t *testing.T) { volumes := map[string]corev1.Volume{}
podSpec := testPodSpec() for _, volume := range podSpec.Volumes {
info := &CheckpointInfo{ volumes[volume.Name] = volume
Enabled: true,
Ready: true,
Hash: testHash,
Location: "/checkpoints/" + testHash + "/versions/2",
StorageType: "pvc",
} }
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, testPVCConfig())) require.Contains(t, volumes, consts.PodInfoVolumeName)
assert.Equal(t, "/checkpoints/"+testHash+"/versions/2", info.Location) require.NotNil(t, volumes[consts.PodInfoVolumeName].DownwardAPI)
assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointStorageType("pvc"), info.StorageType)
})
t.Run("not-ready checkpoint preserves original command", func(t *testing.T) {
podSpec := testPodSpec()
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, testInfo(), testPVCConfig()))
assert.Equal(t, []string{"python3"}, podSpec.Containers[0].Command)
})
t.Run("sets seccomp profile", func(t *testing.T) {
podSpec := testPodSpec()
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, testInfo(), testPVCConfig()))
require.NotNil(t, podSpec.SecurityContext)
require.NotNil(t, podSpec.SecurityContext.SeccompProfile)
assert.Equal(t, corev1.SeccompProfileTypeLocalhost, podSpec.SecurityContext.SeccompProfile.Type)
assert.Equal(t, consts.SeccompProfilePath, *podSpec.SecurityContext.SeccompProfile.LocalhostProfile)
})
t.Run("preserves existing security context", func(t *testing.T) { fields := map[string]string{}
podSpec := testPodSpec() for _, item := range volumes[consts.PodInfoVolumeName].DownwardAPI.Items {
podSpec.SecurityContext = &corev1.PodSecurityContext{RunAsUser: ptr.To(int64(1000))} if item.FieldRef != nil {
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, testInfo(), testPVCConfig())) fields[item.Path] = item.FieldRef.FieldPath
assert.Equal(t, int64(1000), *podSpec.SecurityContext.RunAsUser)
require.NotNil(t, podSpec.SecurityContext.SeccompProfile)
})
t.Run("PVC storage injects volumes and mounts", func(t *testing.T) {
podSpec := testPodSpec()
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, testInfo(), testPVCConfig()))
// Volumes
volNames := make(map[string]bool)
for _, v := range podSpec.Volumes {
volNames[v.Name] = true
if v.Name == consts.CheckpointVolumeName {
assert.Equal(t, "snapshot-pvc", v.PersistentVolumeClaim.ClaimName)
}
if v.Name == consts.PodInfoVolumeName {
require.NotNil(t, v.DownwardAPI)
fieldPaths := map[string]string{}
for _, item := range v.DownwardAPI.Items {
if item.FieldRef != nil {
fieldPaths[item.Path] = item.FieldRef.FieldPath
}
}
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoNamespace+"']", fieldPaths[consts.PodInfoFileDynNamespace])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoWorkerHash+"']", fieldPaths[consts.PodInfoFileDynNamespaceWorkerSuffix])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoComponentType+"']", fieldPaths[consts.PodInfoFileDynComponent])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoGraphDeploymentName+"']", fieldPaths[consts.PodInfoFileDynParentDGDName])
assert.Equal(t, consts.PodInfoFieldPodNamespace, fieldPaths[consts.PodInfoFileDynParentDGDNamespace])
} }
} }
assert.True(t, volNames[consts.CheckpointVolumeName]) assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoNamespace+"']", fields[consts.PodInfoFileDynNamespace])
assert.True(t, volNames[consts.PodInfoVolumeName]) assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoWorkerHash+"']", fields[consts.PodInfoFileDynNamespaceWorkerSuffix])
assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoComponentType+"']", fields[consts.PodInfoFileDynComponent])
// Mounts assert.Equal(t, "metadata.labels['"+consts.KubeLabelDynamoGraphDeploymentName+"']", fields[consts.PodInfoFileDynParentDGDName])
mountPaths := make(map[string]string) assert.Equal(t, consts.PodInfoFieldPodNamespace, fields[consts.PodInfoFileDynParentDGDNamespace])
for _, m := range podSpec.Containers[0].VolumeMounts {
mountPaths[m.Name] = m.MountPath mountPaths := map[string]string{}
for _, mount := range podSpec.Containers[0].VolumeMounts {
mountPaths[mount.Name] = mount.MountPath
} }
assert.Equal(t, "/checkpoints", mountPaths[consts.CheckpointVolumeName])
assert.Equal(t, consts.PodInfoMountPath, mountPaths[consts.PodInfoVolumeName]) assert.Equal(t, consts.PodInfoMountPath, mountPaths[consts.PodInfoVolumeName])
}) })
t.Run("computes hash from identity when hash is empty", func(t *testing.T) { t.Run("ready checkpoint targets the container named main", func(t *testing.T) {
podSpec := testPodSpec() podSpec := &corev1.PodSpec{
identity := testIdentity() Containers: []corev1.Container{
info := &CheckpointInfo{Enabled: true, Identity: &identity} {Name: "sidecar", Image: "sidecar:latest", Command: []string{"sidecar"}, Args: []string{"run"}},
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, testPVCConfig())) {Name: consts.MainContainerName, Image: "main:latest", Command: []string{"python3"}, Args: []string{"-m", "dynamo.vllm"}},
assert.Len(t, info.Hash, 16) },
})
t.Run("S3 and OCI storage set location", func(t *testing.T) {
for _, tc := range []struct {
storageType string
config configv1alpha1.CheckpointStorageConfiguration
wantLoc string
}{
{"s3", configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypeS3,
S3: configv1alpha1.CheckpointS3Config{URI: "s3://bucket/prefix"},
}, "s3://bucket/prefix/" + testHash + ".tar"},
{"oci", configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypeOCI,
OCI: configv1alpha1.CheckpointOCIConfig{URI: "oci://registry/repo"},
}, "oci://registry/repo:" + testHash},
} {
t.Run(tc.storageType, func(t *testing.T) {
podSpec := testPodSpec()
info := &CheckpointInfo{Enabled: true, Hash: testHash}
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, &configv1alpha1.CheckpointConfiguration{Storage: tc.config}))
assert.Equal(t, tc.wantLoc, info.Location)
})
} }
info := &CheckpointInfo{Enabled: true, Ready: true, Hash: testHash}
reader := fake.NewClientBuilder().WithScheme(testScheme()).WithObjects(testSnapshotAgentDaemonSet()).Build()
require.NoError(t, InjectCheckpointIntoPodSpec(context.Background(), reader, testNamespace, podSpec, info))
assert.Equal(t, []string{"sidecar"}, podSpec.Containers[0].Command)
assert.Equal(t, []string{"run"}, podSpec.Containers[0].Args)
assert.Equal(t, []string{"sleep", "infinity"}, podSpec.Containers[1].Command)
assert.Nil(t, podSpec.Containers[1].Args)
}) })
t.Run("error cases", func(t *testing.T) { t.Run("error cases", func(t *testing.T) {
...@@ -408,35 +240,21 @@ func TestInjectCheckpointIntoPodSpec(t *testing.T) { ...@@ -408,35 +240,21 @@ func TestInjectCheckpointIntoPodSpec(t *testing.T) {
name string name string
podSpec *corev1.PodSpec podSpec *corev1.PodSpec
info *CheckpointInfo info *CheckpointInfo
config *configv1alpha1.CheckpointConfiguration reader client.Reader
errMsg string errMsg string
}{ }{
{"hash empty and identity nil", testPodSpec(), &CheckpointInfo{Enabled: true}, testPVCConfig(), "identity is nil"}, {"hash empty and identity nil", testPodSpec(), &CheckpointInfo{Enabled: true}, fake.NewClientBuilder().WithScheme(testScheme()).WithObjects(testSnapshotAgentDaemonSet()).Build(), "identity is nil"},
{"no containers", &corev1.PodSpec{}, testInfo(), testPVCConfig(), "no container found"}, {"no containers", &corev1.PodSpec{}, testInfo(), fake.NewClientBuilder().WithScheme(testScheme()).WithObjects(testSnapshotAgentDaemonSet()).Build(), "no container found"},
{"PVC name missing", testPodSpec(), testInfo(), &configv1alpha1.CheckpointConfiguration{ {"main container missing", &corev1.PodSpec{Containers: []corev1.Container{{Name: "sidecar", Image: "img", Command: []string{"python3"}}}}, testInfo(), fake.NewClientBuilder().WithScheme(testScheme()).WithObjects(testSnapshotAgentDaemonSet()).Build(), "main container not found"},
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "pvc", PVC: configv1alpha1.CheckpointPVCConfig{BasePath: "/checkpoints"}}, {"snapshot daemonset missing", testPodSpec(), testInfo(), fake.NewClientBuilder().WithScheme(testScheme()).Build(), "no snapshot-agent daemonset found"},
}, "no PVC name"},
{"S3 URI missing", testPodSpec(), testInfo(), &configv1alpha1.CheckpointConfiguration{
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "s3"},
}, "S3"},
{"OCI URI missing", testPodSpec(), testInfo(), &configv1alpha1.CheckpointConfiguration{
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "oci"},
}, "OCI"},
} { } {
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
err := InjectCheckpointIntoPodSpec(tc.podSpec, tc.info, tc.config) err := InjectCheckpointIntoPodSpec(context.Background(), tc.reader, testNamespace, tc.podSpec, tc.info)
require.Error(t, err) require.Error(t, err)
assert.Contains(t, err.Error(), tc.errMsg) assert.Contains(t, err.Error(), tc.errMsg)
}) })
} }
}) })
t.Run("falls back to first container when main not found", func(t *testing.T) {
podSpec := &corev1.PodSpec{Containers: []corev1.Container{{Name: "sidecar", Image: "img", Command: []string{"python3"}}}}
info := &CheckpointInfo{Enabled: true, Ready: true, Hash: testHash}
require.NoError(t, InjectCheckpointIntoPodSpec(podSpec, info, testPVCConfig()))
assert.Equal(t, []string{"sleep", "infinity"}, podSpec.Containers[0].Command)
})
} }
// --- ResolveCheckpointForService tests --- // --- ResolveCheckpointForService tests ---
...@@ -463,8 +281,6 @@ func TestResolveCheckpointForService(t *testing.T) { ...@@ -463,8 +281,6 @@ func TestResolveCheckpointForService(t *testing.T) {
Status: nvidiacomv1alpha1.DynamoCheckpointStatus{ Status: nvidiacomv1alpha1.DynamoCheckpointStatus{
Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady, Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
IdentityHash: hash, IdentityHash: hash,
Location: "/checkpoints/" + hash,
StorageType: "pvc",
}, },
} }
c := fake.NewClientBuilder().WithScheme(s).WithObjects(ckpt).WithStatusSubresource(ckpt).Build() c := fake.NewClientBuilder().WithScheme(s).WithObjects(ckpt).WithStatusSubresource(ckpt).Build()
...@@ -477,7 +293,6 @@ func TestResolveCheckpointForService(t *testing.T) { ...@@ -477,7 +293,6 @@ func TestResolveCheckpointForService(t *testing.T) {
assert.True(t, info.Exists) assert.True(t, info.Exists)
assert.True(t, info.Ready) assert.True(t, info.Ready)
assert.Equal(t, hash, info.Hash) assert.Equal(t, hash, info.Hash)
assert.Equal(t, "/checkpoints/"+hash, info.Location)
assert.Equal(t, hash, info.CheckpointName) assert.Equal(t, hash, info.CheckpointName)
}) })
...@@ -541,8 +356,6 @@ func TestResolveCheckpointForService(t *testing.T) { ...@@ -541,8 +356,6 @@ func TestResolveCheckpointForService(t *testing.T) {
Status: nvidiacomv1alpha1.DynamoCheckpointStatus{ Status: nvidiacomv1alpha1.DynamoCheckpointStatus{
Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady, Phase: nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
IdentityHash: hash, IdentityHash: hash,
Location: "/checkpoints/" + hash,
StorageType: "pvc",
}, },
} }
c := fake.NewClientBuilder().WithScheme(s).WithObjects(ckpt).WithStatusSubresource(ckpt).Build() c := fake.NewClientBuilder().WithScheme(s).WithObjects(ckpt).WithStatusSubresource(ckpt).Build()
......
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package checkpoint
import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
)
// EnsurePodInfoVolume appends the pod-info downward API volume to podSpec
// unless a volume named commonconsts.PodInfoVolumeName is already present.
//
// The volume exposes the pod name, UID and namespace, plus the Dynamo
// namespace, worker hash, component type and parent graph deployment labels,
// each as a separate file.
func EnsurePodInfoVolume(podSpec *corev1.PodSpec) {
	for i := range podSpec.Volumes {
		if podSpec.Volumes[i].Name == commonconsts.PodInfoVolumeName {
			// Already present: leave the existing definition untouched.
			return
		}
	}

	// fieldFile projects a single pod field into a file at path.
	fieldFile := func(path, fieldPath string) corev1.DownwardAPIVolumeFile {
		return corev1.DownwardAPIVolumeFile{
			Path:     path,
			FieldRef: &corev1.ObjectFieldSelector{FieldPath: fieldPath},
		}
	}
	// labelFile projects the value of a pod label into a file at path.
	labelFile := func(path, label string) corev1.DownwardAPIVolumeFile {
		return fieldFile(path, "metadata.labels['"+label+"']")
	}

	podInfoFiles := []corev1.DownwardAPIVolumeFile{
		fieldFile("pod_name", commonconsts.PodInfoFieldPodName),
		fieldFile("pod_uid", commonconsts.PodInfoFieldPodUID),
		fieldFile("pod_namespace", commonconsts.PodInfoFieldPodNamespace),
		labelFile(commonconsts.PodInfoFileDynNamespace, commonconsts.KubeLabelDynamoNamespace),
		labelFile(commonconsts.PodInfoFileDynNamespaceWorkerSuffix, commonconsts.KubeLabelDynamoWorkerHash),
		labelFile(commonconsts.PodInfoFileDynComponent, commonconsts.KubeLabelDynamoComponentType),
		labelFile(commonconsts.PodInfoFileDynParentDGDName, commonconsts.KubeLabelDynamoGraphDeploymentName),
		// The parent graph deployment namespace is read from the pod's own namespace field.
		fieldFile(commonconsts.PodInfoFileDynParentDGDNamespace, commonconsts.PodInfoFieldPodNamespace),
	}

	podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{
		Name: commonconsts.PodInfoVolumeName,
		VolumeSource: corev1.VolumeSource{
			DownwardAPI: &corev1.DownwardAPIVolumeSource{Items: podInfoFiles},
		},
	})
}
// EnsurePodInfoMount mounts the pod-info volume read-only at
// commonconsts.PodInfoMountPath on container, unless the container already has
// a mount named commonconsts.PodInfoVolumeName.
func EnsurePodInfoMount(container *corev1.Container) {
	for i := range container.VolumeMounts {
		if container.VolumeMounts[i].Name == commonconsts.PodInfoVolumeName {
			// An existing mount wins, whatever path it uses.
			return
		}
	}

	podInfoMount := corev1.VolumeMount{
		Name:      commonconsts.PodInfoVolumeName,
		MountPath: commonconsts.PodInfoMountPath,
		ReadOnly:  true,
	}
	container.VolumeMounts = append(container.VolumeMounts, podInfoMount)
}
...@@ -18,177 +18,32 @@ ...@@ -18,177 +18,32 @@
package checkpoint package checkpoint
import ( import (
"context"
"fmt" "fmt"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
"k8s.io/utils/ptr" ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
) )
// ApplyCheckpointSourcePodMetadata marks a pod's metadata as belonging to a
// checkpoint source.
//
// It first removes the restore-target label, the checkpoint hash label and the
// checkpoint location / storage-type annotations, then sets the
// checkpoint-source label and re-adds hash, location and storageType for every
// value that is non-empty. Both maps are modified in place; labels is always
// written to, so it must be non-nil, and annotations must be non-nil whenever
// location or storageType is set.
func ApplyCheckpointSourcePodMetadata(
	labels map[string]string,
	annotations map[string]string,
	hash string,
	location string,
	storageType nvidiacomv1alpha1.DynamoCheckpointStorageType,
) {
	// Drop anything a previous restore/checkpoint pass may have left behind.
	for _, key := range []string{commonconsts.KubeLabelIsRestoreTarget, commonconsts.KubeLabelCheckpointHash} {
		delete(labels, key)
	}
	for _, key := range []string{commonconsts.KubeAnnotationCheckpointLocation, commonconsts.KubeAnnotationCheckpointStorageType} {
		delete(annotations, key)
	}

	labels[commonconsts.KubeLabelIsCheckpointSource] = commonconsts.KubeLabelValueTrue
	if len(hash) > 0 {
		labels[commonconsts.KubeLabelCheckpointHash] = hash
	}
	if len(location) > 0 {
		annotations[commonconsts.KubeAnnotationCheckpointLocation] = location
	}
	if len(storageType) > 0 {
		annotations[commonconsts.KubeAnnotationCheckpointStorageType] = string(storageType)
	}
}
func ApplyRestorePodMetadata(labels map[string]string, annotations map[string]string, checkpointInfo *CheckpointInfo) { func ApplyRestorePodMetadata(labels map[string]string, annotations map[string]string, checkpointInfo *CheckpointInfo) {
delete(labels, commonconsts.KubeLabelIsRestoreTarget) enabled := checkpointInfo != nil && checkpointInfo.Enabled && checkpointInfo.Ready
delete(labels, commonconsts.KubeLabelCheckpointHash) hash := ""
delete(annotations, commonconsts.KubeAnnotationCheckpointLocation) artifactVersion := ""
delete(annotations, commonconsts.KubeAnnotationCheckpointStorageType) if enabled {
hash = checkpointInfo.Hash
if checkpointInfo == nil || !checkpointInfo.Enabled || !checkpointInfo.Ready { artifactVersion = checkpointInfo.ArtifactVersion
return }
} snapshotprotocol.ApplyRestoreTargetMetadata(labels, annotations, enabled, hash, artifactVersion)
labels[commonconsts.KubeLabelIsRestoreTarget] = commonconsts.KubeLabelValueTrue
if checkpointInfo.Hash != "" {
labels[commonconsts.KubeLabelCheckpointHash] = checkpointInfo.Hash
}
if checkpointInfo.Location != "" {
annotations[commonconsts.KubeAnnotationCheckpointLocation] = checkpointInfo.Location
}
if checkpointInfo.StorageType != "" {
annotations[commonconsts.KubeAnnotationCheckpointStorageType] = string(checkpointInfo.StorageType)
}
}
// InjectCheckpointVolume appends a writable PVC-backed volume named
// commonconsts.CheckpointVolumeName, bound to the claim pvcName, to podSpec.
// It is a no-op when a volume with that name already exists.
func InjectCheckpointVolume(podSpec *corev1.PodSpec, pvcName string) {
	for i := range podSpec.Volumes {
		if podSpec.Volumes[i].Name == commonconsts.CheckpointVolumeName {
			// Keep whatever volume is already registered under this name.
			return
		}
	}

	claim := &corev1.PersistentVolumeClaimVolumeSource{
		ClaimName: pvcName,
		ReadOnly:  false,
	}
	podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{
		Name:         commonconsts.CheckpointVolumeName,
		VolumeSource: corev1.VolumeSource{PersistentVolumeClaim: claim},
	})
}
// InjectCheckpointVolumeMount mounts the checkpoint volume read-write at
// basePath on container. It is a no-op when the container already has a mount
// named commonconsts.CheckpointVolumeName.
func InjectCheckpointVolumeMount(container *corev1.Container, basePath string) {
	for i := range container.VolumeMounts {
		if container.VolumeMounts[i].Name == commonconsts.CheckpointVolumeName {
			// An existing mount wins, whatever path it uses.
			return
		}
	}

	checkpointMount := corev1.VolumeMount{
		Name:      commonconsts.CheckpointVolumeName,
		MountPath: basePath,
		ReadOnly:  false,
	}
	container.VolumeMounts = append(container.VolumeMounts, checkpointMount)
}
// InjectPodInfoVolume appends the pod-info downward API volume to podSpec. It
// is a no-op when a volume named commonconsts.PodInfoVolumeName already exists.
//
// The files cover the pod name, UID and namespace as well as the Dynamo
// namespace, worker hash, component type and parent graph deployment labels.
func InjectPodInfoVolume(podSpec *corev1.PodSpec) {
	for i := range podSpec.Volumes {
		if podSpec.Volumes[i].Name == commonconsts.PodInfoVolumeName {
			return
		}
	}

	// file builds one downward API entry that writes fieldPath into path.
	file := func(path, fieldPath string) corev1.DownwardAPIVolumeFile {
		selector := &corev1.ObjectFieldSelector{FieldPath: fieldPath}
		return corev1.DownwardAPIVolumeFile{Path: path, FieldRef: selector}
	}
	// label renders the downward API field path for a pod label key.
	label := func(key string) string {
		return "metadata.labels['" + key + "']"
	}

	source := &corev1.DownwardAPIVolumeSource{
		Items: []corev1.DownwardAPIVolumeFile{
			file("pod_name", commonconsts.PodInfoFieldPodName),
			file("pod_uid", commonconsts.PodInfoFieldPodUID),
			file("pod_namespace", commonconsts.PodInfoFieldPodNamespace),
			file(commonconsts.PodInfoFileDynNamespace, label(commonconsts.KubeLabelDynamoNamespace)),
			file(commonconsts.PodInfoFileDynNamespaceWorkerSuffix, label(commonconsts.KubeLabelDynamoWorkerHash)),
			file(commonconsts.PodInfoFileDynComponent, label(commonconsts.KubeLabelDynamoComponentType)),
			file(commonconsts.PodInfoFileDynParentDGDName, label(commonconsts.KubeLabelDynamoGraphDeploymentName)),
			// The parent graph deployment namespace comes from the pod's own namespace field.
			file(commonconsts.PodInfoFileDynParentDGDNamespace, commonconsts.PodInfoFieldPodNamespace),
		},
	}

	podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{
		Name:         commonconsts.PodInfoVolumeName,
		VolumeSource: corev1.VolumeSource{DownwardAPI: source},
	})
}
// InjectPodInfoVolumeMount mounts the pod-info volume read-only at
// commonconsts.PodInfoMountPath on container. It does nothing when the
// container already has a mount named commonconsts.PodInfoVolumeName.
func InjectPodInfoVolumeMount(container *corev1.Container) {
for _, mount := range container.VolumeMounts {
if mount.Name == commonconsts.PodInfoVolumeName {
// Already mounted: keep the existing mount as-is.
return
}
}
container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{
Name: commonconsts.PodInfoVolumeName,
MountPath: commonconsts.PodInfoMountPath,
ReadOnly: true,
})
} }
func InjectCheckpointIntoPodSpec( func InjectCheckpointIntoPodSpec(
ctx context.Context,
reader ctrlclient.Reader,
namespace string,
podSpec *corev1.PodSpec, podSpec *corev1.PodSpec,
checkpointInfo *CheckpointInfo, checkpointInfo *CheckpointInfo,
checkpointConfig *configv1alpha1.CheckpointConfiguration,
) error { ) error {
if checkpointInfo == nil || !checkpointInfo.Enabled { if checkpointInfo == nil || !checkpointInfo.Enabled {
return nil return nil
...@@ -207,6 +62,9 @@ func InjectCheckpointIntoPodSpec( ...@@ -207,6 +62,9 @@ func InjectCheckpointIntoPodSpec(
info.Hash = hash info.Hash = hash
} }
if len(podSpec.Containers) == 0 {
return fmt.Errorf("no container found to inject checkpoint config")
}
var mainContainer *corev1.Container var mainContainer *corev1.Container
for i := range podSpec.Containers { for i := range podSpec.Containers {
if podSpec.Containers[i].Name == commonconsts.MainContainerName { if podSpec.Containers[i].Name == commonconsts.MainContainerName {
...@@ -214,83 +72,27 @@ func InjectCheckpointIntoPodSpec( ...@@ -214,83 +72,27 @@ func InjectCheckpointIntoPodSpec(
break break
} }
} }
if mainContainer == nil && len(podSpec.Containers) > 0 {
mainContainer = &podSpec.Containers[0]
}
if mainContainer == nil { if mainContainer == nil {
return fmt.Errorf("no container found to inject checkpoint config") return fmt.Errorf("main container not found in pod spec")
} }
if reader == nil {
if info.Ready { return fmt.Errorf("checkpoint client is required")
mainContainer.Command = []string{"sleep", "infinity"} }
mainContainer.Args = nil if err := snapshotprotocol.PrepareRestorePodSpecForCheckpoint(
} ctx,
reader,
if podSpec.SecurityContext == nil { namespace,
podSpec.SecurityContext = &corev1.PodSecurityContext{} podSpec,
} mainContainer,
podSpec.SecurityContext.SeccompProfile = &corev1.SeccompProfile{ info.Hash,
Type: corev1.SeccompProfileTypeLocalhost, info.ArtifactVersion,
LocalhostProfile: ptr.To(commonconsts.SeccompProfilePath), commonconsts.SeccompProfilePath,
} info.Ready,
); err != nil {
storageType := configv1alpha1.CheckpointStorageTypePVC
var storageConfig *configv1alpha1.CheckpointStorageConfiguration
if checkpointConfig != nil {
storageConfig = &checkpointConfig.Storage
if storageConfig.Type != "" {
storageType = storageConfig.Type
}
}
if err := injectCheckpointStorage(podSpec, mainContainer, info, storageType, storageConfig); err != nil {
return err return err
} }
InjectPodInfoVolume(podSpec) EnsurePodInfoVolume(podSpec)
InjectPodInfoVolumeMount(mainContainer) EnsurePodInfoMount(mainContainer)
return nil return nil
} }
// injectCheckpointStorage fills in the storage details of info for the given
// backend and, for PVC storage, wires the checkpoint volume into the pod.
//
// info.StorageType is set to storageType when it is empty; this happens before
// validation, so it is set even when an error is returned. info.Location is
// only filled in when empty, using:
//
//	S3:  <uri>/<hash>.tar
//	OCI: <uri>:<hash>
//	PVC: <basePath>/<hash>
//
// Any storageType other than S3 or OCI is handled as PVC, which also injects
// the checkpoint volume into podSpec and its mount into mainContainer. An
// error is returned when the configuration needed by the selected backend is
// missing.
func injectCheckpointStorage(
	podSpec *corev1.PodSpec,
	mainContainer *corev1.Container,
	info *CheckpointInfo,
	storageType string,
	storageConfig *configv1alpha1.CheckpointStorageConfiguration,
) error {
	if info.StorageType == "" {
		info.StorageType = nvidiacomv1alpha1.DynamoCheckpointStorageType(storageType)
	}

	// derivedLocation is the location to use when info does not carry one yet.
	var derivedLocation string
	switch storageType {
	case configv1alpha1.CheckpointStorageTypeS3:
		if storageConfig == nil || storageConfig.S3.URI == "" {
			return fmt.Errorf("S3 storage type selected but no S3 URI configured (set checkpoint.storage.s3.uri)")
		}
		derivedLocation = fmt.Sprintf("%s/%s.tar", storageConfig.S3.URI, info.Hash)

	case configv1alpha1.CheckpointStorageTypeOCI:
		if storageConfig == nil || storageConfig.OCI.URI == "" {
			return fmt.Errorf("OCI storage type selected but no OCI URI configured (set checkpoint.storage.oci.uri)")
		}
		derivedLocation = fmt.Sprintf("%s:%s", storageConfig.OCI.URI, info.Hash)

	default:
		if storageConfig == nil || storageConfig.PVC.PVCName == "" {
			return fmt.Errorf("PVC storage type selected but no PVC name configured (set checkpoint.storage.pvc.pvcName)")
		}
		if storageConfig.PVC.BasePath == "" {
			return fmt.Errorf("PVC storage type selected but no PVC base path configured (set checkpoint.storage.pvc.basePath)")
		}
		derivedLocation = fmt.Sprintf("%s/%s", storageConfig.PVC.BasePath, info.Hash)
		// Only the PVC backend needs the checkpoint volume in the pod.
		InjectCheckpointVolume(podSpec, storageConfig.PVC.PVCName)
		InjectCheckpointVolumeMount(mainContainer, storageConfig.PVC.BasePath)
	}

	// A location already present on info takes precedence over the derived one.
	if info.Location == "" {
		info.Location = derivedLocation
	}
	return nil
}
...@@ -20,24 +20,21 @@ package checkpoint ...@@ -20,24 +20,21 @@ package checkpoint
import ( import (
"context" "context"
"fmt" "fmt"
"strings"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
"k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client"
) )
type CheckpointInfo struct { type CheckpointInfo struct {
Enabled bool Enabled bool
Exists bool Exists bool
Identity *nvidiacomv1alpha1.DynamoCheckpointIdentity Identity *nvidiacomv1alpha1.DynamoCheckpointIdentity
Hash string Hash string
Location string ArtifactVersion string
StorageType nvidiacomv1alpha1.DynamoCheckpointStorageType CheckpointName string
CheckpointName string Ready bool
Ready bool
} }
func checkpointInfoFromObject(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (*CheckpointInfo, error) { func checkpointInfoFromObject(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (*CheckpointInfo, error) {
...@@ -47,17 +44,23 @@ func checkpointInfoFromObject(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (*Checkp ...@@ -47,17 +44,23 @@ func checkpointInfoFromObject(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (*Checkp
} }
return &CheckpointInfo{ return &CheckpointInfo{
Enabled: true, Enabled: true,
Exists: true, Exists: true,
Identity: &ckpt.Spec.Identity, Identity: &ckpt.Spec.Identity,
Hash: hash, Hash: hash,
Location: ckpt.Status.Location, ArtifactVersion: checkpointArtifactVersion(ckpt),
StorageType: ckpt.Status.StorageType, CheckpointName: ckpt.Name,
CheckpointName: ckpt.Name, Ready: ckpt.Status.Phase == nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
Ready: ckpt.Status.Phase == nvidiacomv1alpha1.DynamoCheckpointPhaseReady,
}, nil }, nil
} }
// checkpointArtifactVersion returns the artifact version for ckpt: the value
// of its artifact-version annotation passed through
// snapshotprotocol.ArtifactVersion, or the protocol default when ckpt is nil.
// NOTE(review): ArtifactVersion presumably normalizes a missing or blank
// annotation to the default version — confirm in the snapshot protocol package.
func checkpointArtifactVersion(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) string {
	if ckpt != nil {
		annotated := ckpt.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]
		return snapshotprotocol.ArtifactVersion(annotated)
	}
	return snapshotprotocol.DefaultCheckpointArtifactVersion
}
func ResolveCheckpointForService( func ResolveCheckpointForService(
ctx context.Context, ctx context.Context,
c client.Client, c client.Client,
...@@ -105,37 +108,3 @@ func ResolveCheckpointForService( ...@@ -105,37 +108,3 @@ func ResolveCheckpointForService(
info.Identity = config.Identity info.Identity = config.Identity
return info, nil return info, nil
} }
// ResolveCheckpointStorage computes the versioned artifact location and the
// storage type for the checkpoint identified by hash.
//
// version is trimmed and falls back to consts.DefaultCheckpointArtifactVersion
// when blank. The backend is taken from config.Storage.Type and defaults to
// PVC when config is nil or the type is unset; any type other than S3 or OCI
// is resolved with the PVC layout. Locations are laid out as:
//
//	S3:  <uri>/<hash>/versions/<version>.tar
//	OCI: <uri>:<hash>-<version>
//	PVC: <basePath>/<hash>/versions/<version>
//
// An error is returned when the URI or base path required by the selected
// backend is not configured.
func ResolveCheckpointStorage(
	hash string,
	version string,
	config *configv1alpha1.CheckpointConfiguration,
) (string, nvidiacomv1alpha1.DynamoCheckpointStorageType, error) {
	artifactVersion := strings.TrimSpace(version)
	if artifactVersion == "" {
		artifactVersion = consts.DefaultCheckpointArtifactVersion
	}

	backend := configv1alpha1.CheckpointStorageTypePVC
	if config != nil && config.Storage.Type != "" {
		backend = config.Storage.Type
	}
	// The configured type is reported back verbatim, even when it is resolved
	// with the PVC layout below.
	resolvedType := nvidiacomv1alpha1.DynamoCheckpointStorageType(backend)

	if backend == configv1alpha1.CheckpointStorageTypeS3 {
		if config == nil || config.Storage.S3.URI == "" {
			return "", "", fmt.Errorf("S3 storage type selected but no S3 URI configured (set checkpoint.storage.s3.uri)")
		}
		location := fmt.Sprintf("%s/%s/versions/%s.tar", config.Storage.S3.URI, hash, artifactVersion)
		return location, resolvedType, nil
	}

	if backend == configv1alpha1.CheckpointStorageTypeOCI {
		if config == nil || config.Storage.OCI.URI == "" {
			return "", "", fmt.Errorf("OCI storage type selected but no OCI URI configured (set checkpoint.storage.oci.uri)")
		}
		location := fmt.Sprintf("%s:%s-%s", config.Storage.OCI.URI, hash, artifactVersion)
		return location, resolvedType, nil
	}

	// Everything else is treated as PVC storage.
	if config == nil || config.Storage.PVC.BasePath == "" {
		return "", "", fmt.Errorf("PVC storage type selected but no PVC base path configured (set checkpoint.storage.pvc.basePath)")
	}
	location := fmt.Sprintf("%s/%s/versions/%s", config.Storage.PVC.BasePath, hash, artifactVersion)
	return location, resolvedType, nil
}
...@@ -23,6 +23,7 @@ import ( ...@@ -23,6 +23,7 @@ import (
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors" apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
...@@ -55,7 +56,7 @@ func FindCheckpointByIdentityHash( ...@@ -55,7 +56,7 @@ func FindCheckpointByIdentityHash(
ctx, ctx,
checkpoints, checkpoints,
client.InNamespace(namespace), client.InNamespace(namespace),
client.MatchingLabels{consts.KubeLabelCheckpointHash: hash}, client.MatchingLabels{consts.KubeLabelCheckpointID: hash},
); err != nil { ); err != nil {
return nil, fmt.Errorf("failed to list checkpoints by hash label: %w", err) return nil, fmt.Errorf("failed to list checkpoints by hash label: %w", err)
} }
...@@ -118,10 +119,10 @@ func CreateOrGetAutoCheckpoint( ...@@ -118,10 +119,10 @@ func CreateOrGetAutoCheckpoint(
Name: fmt.Sprintf("checkpoint-%s", hash), Name: fmt.Sprintf("checkpoint-%s", hash),
Namespace: namespace, Namespace: namespace,
Labels: map[string]string{ Labels: map[string]string{
consts.KubeLabelCheckpointHash: hash, consts.KubeLabelCheckpointID: hash,
}, },
Annotations: map[string]string{ Annotations: map[string]string{
consts.KubeAnnotationCheckpointArtifactVersion: consts.DefaultCheckpointArtifactVersion, snapshotprotocol.CheckpointArtifactVersionAnnotation: snapshotprotocol.DefaultCheckpointArtifactVersion,
}, },
}, },
Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{ Spec: nvidiacomv1alpha1.DynamoCheckpointSpec{
......
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package checkpointjob
import (
"fmt"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/dynamo"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)
// DesiredCheckpointJobName returns the name of the checkpoint Job for the given
// identity hash, in the form "checkpoint-job-<identityHash>-<version>".
//
// <version> is the result of snapshotprotocol.ArtifactVersion applied to the
// artifact-version annotation value. A nil map or a missing key passes the
// empty string to that helper; the package test expects the default artifact
// version in that case.
func DesiredCheckpointJobName(identityHash string, annotations map[string]string) string {
	rawVersion := annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]
	return fmt.Sprintf("checkpoint-job-%s-%s", identityHash, snapshotprotocol.ArtifactVersion(rawVersion))
}
// buildCheckpointWorkerDefaultEnv returns the environment variables of the
// default worker base container, built from a component context derived from
// the checkpoint and the pod template labels.
//
// Label handling:
//   - the Dynamo namespace label overrides consts.GlobalDynamoNamespace when set;
//   - the component-type label overrides consts.ComponentTypeWorker only when it
//     is set and dynamo.IsWorkerComponent accepts it;
//   - the graph-deployment-name and worker-hash labels are passed through as-is
//     (empty when absent).
//
// The discovery backend is always configv1alpha1.DiscoveryBackendKubernetes.
func buildCheckpointWorkerDefaultEnv(
	ckpt *nvidiacomv1alpha1.DynamoCheckpoint,
	podTemplate *corev1.PodTemplateSpec,
) []corev1.EnvVar {
	// Reading from a nil label map is safe and yields empty strings.
	labels := podTemplate.Labels

	namespace := consts.GlobalDynamoNamespace
	if labelled := labels[consts.KubeLabelDynamoNamespace]; labelled != "" {
		namespace = labelled
	}

	component := consts.ComponentTypeWorker
	if labelled := labels[consts.KubeLabelDynamoComponentType]; labelled != "" && dynamo.IsWorkerComponent(labelled) {
		component = labelled
	}

	componentCtx := dynamo.ComponentContext{
		ComponentType:                  component,
		DynamoNamespace:                namespace,
		ParentGraphDeploymentName:      labels[consts.KubeLabelDynamoGraphDeploymentName],
		ParentGraphDeploymentNamespace: ckpt.Namespace,
		DiscoveryBackend:               configv1alpha1.DiscoveryBackendKubernetes,
		WorkerHashSuffix:               labels[consts.KubeLabelDynamoWorkerHash],
	}

	// NOTE(review): the error returned by GetBaseContainer is discarded here;
	// confirm it cannot fail for worker defaults, or surface it to the caller.
	base, _ := dynamo.NewWorkerDefaults().GetBaseContainer(componentCtx)
	return base.Env
}
// BuildCheckpointJob assembles the batch Job that runs the checkpoint workload
// described by ckpt.
//
// The pod template is deep-copied from ckpt.Spec.Job.PodTemplateSpec and then
// adjusted: label/annotation maps are initialized, the pod-info volume is
// ensured, and, when at least one container exists, the first container gets
// the worker default env merged with its own env, the standard env vars, the
// ready-for-checkpoint file env var, a readiness probe that cats that file,
// cleared liveness/startup probes, the pod-info mount and the shared-memory
// volume/mount.
//
// The checkpoint ID is ckpt.Status.IdentityHash, or the hash computed from
// ckpt.Spec.Identity when the status field is empty. An error is returned only
// when that computation fails or when snapshotprotocol.NewCheckpointJob fails.
func BuildCheckpointJob(
	config *configv1alpha1.OperatorConfiguration,
	ckpt *nvidiacomv1alpha1.DynamoCheckpoint,
	jobName string,
) (*batchv1.Job, error) {
	// All template mutations below are applied to this copy.
	tmpl := ckpt.Spec.Job.PodTemplateSpec.DeepCopy()

	checkpointID := ckpt.Status.IdentityHash
	if checkpointID == "" {
		computed, err := checkpoint.ComputeIdentityHash(ckpt.Spec.Identity)
		if err != nil {
			return nil, fmt.Errorf("failed to compute identity hash: %w", err)
		}
		checkpointID = computed
	}

	if tmpl.Labels == nil {
		tmpl.Labels = map[string]string{}
	}
	if tmpl.Annotations == nil {
		tmpl.Annotations = map[string]string{}
	}

	checkpoint.EnsurePodInfoVolume(&tmpl.Spec)

	// The launch job is wrapped only when the first container requests more
	// than one GPU in its resource limits.
	wrapLaunchJob := false
	if len(tmpl.Spec.Containers) > 0 {
		worker := &tmpl.Spec.Containers[0]

		// Defaults come first so the container's own env is merged on top of them.
		defaults := buildCheckpointWorkerDefaultEnv(ckpt, tmpl)
		worker.Env = dynamo.MergeEnvs(defaults, worker.Env)
		dynamo.AddStandardEnvVars(worker, config)

		readyFile := corev1.EnvVar{
			Name:  consts.EnvReadyForCheckpointFile,
			Value: config.Checkpoint.ReadyForCheckpointFilePath,
		}
		worker.Env = append(worker.Env, readyFile)

		// Readiness is driven solely by the presence of the ready-for-checkpoint
		// file; any liveness/startup probes from the template are dropped.
		readyCheck := &corev1.ExecAction{
			Command: []string{"cat", config.Checkpoint.ReadyForCheckpointFilePath},
		}
		worker.ReadinessProbe = &corev1.Probe{
			ProbeHandler:        corev1.ProbeHandler{Exec: readyCheck},
			InitialDelaySeconds: 15,
			PeriodSeconds:       2,
		}
		worker.LivenessProbe = nil
		worker.StartupProbe = nil

		checkpoint.EnsurePodInfoMount(worker)
		dynamo.ApplySharedMemoryVolumeAndMount(&tmpl.Spec, worker, ckpt.Spec.Job.SharedMemory)

		gpuLimit, hasGPULimit := worker.Resources.Limits[corev1.ResourceName(consts.KubeResourceGPUNvidia)]
		if hasGPULimit {
			singleGPU := resource.NewQuantity(1, resource.DecimalSI)
			wrapLaunchJob = gpuLimit.Cmp(*singleGPU) > 0
		}
	}

	// Fall back to a one-hour deadline when the spec does not set one.
	deadline := ckpt.Spec.Job.ActiveDeadlineSeconds
	if deadline == nil {
		fallback := int64(3600)
		deadline = &fallback
	}

	ttl := snapshotprotocol.DefaultCheckpointJobTTLSeconds
	artifactVersion := snapshotprotocol.ArtifactVersion(
		ckpt.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation],
	)

	opts := snapshotprotocol.CheckpointJobOptions{
		Namespace:             ckpt.Namespace,
		CheckpointID:          checkpointID,
		ArtifactVersion:       artifactVersion,
		SeccompProfile:        consts.SeccompProfilePath,
		Name:                  jobName,
		ActiveDeadlineSeconds: deadline,
		TTLSecondsAfterFinish: &ttl,
		WrapLaunchJob:         wrapLaunchJob,
	}
	return snapshotprotocol.NewCheckpointJob(tmpl, opts)
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package checkpointjob
import (
"testing"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
)
func TestDesiredCheckpointJobName(t *testing.T) {
name := DesiredCheckpointJobName("abc123def4567890", map[string]string{
snapshotprotocol.CheckpointArtifactVersionAnnotation: "2",
})
if name != "checkpoint-job-abc123def4567890-2" {
t.Fatalf("unexpected checkpoint job name: %s", name)
}
defaultName := DesiredCheckpointJobName("abc123def4567890", nil)
if defaultName != "checkpoint-job-abc123def4567890-"+snapshotprotocol.DefaultCheckpointArtifactVersion {
t.Fatalf("unexpected default checkpoint job name: %s", defaultName)
}
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package checkpointjob
import (
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
)
// ObservationPhase is the coarse state that Observe derives for a checkpoint
// Job from its conditions and checkpoint-status annotation.
type ObservationPhase string

const (
	// ObservationPhaseRunning is reported by Observe when the Job has no true
	// Complete or Failed condition and no failed checkpoint-status annotation.
	ObservationPhaseRunning ObservationPhase = "running"
	// ObservationPhaseWaitingForConfirmation is reported by Observe when the Job
	// is complete, the annotation is neither completed nor failed, and the
	// caller indicates the checkpoint worker is still active.
	ObservationPhaseWaitingForConfirmation ObservationPhase = "waiting_for_confirmation"
	// ObservationPhaseReady is reported by Observe when the Job is complete and
	// the annotation carries the completed status.
	ObservationPhaseReady ObservationPhase = "ready"
	// ObservationPhaseFailed is reported by Observe when the annotation carries
	// the failed status, when the Job has a true Failed condition, or when the
	// Job completed without confirmation and the worker is no longer active.
	ObservationPhaseFailed ObservationPhase = "failed"
)
// Observation is the result returned by Observe for a checkpoint Job.
type Observation struct {
	// Phase is the derived state of the checkpoint Job.
	Phase ObservationPhase
	// Reason is a short CamelCase identifier for the outcome (for example
	// "JobSucceeded" or "JobFailed"). Observe leaves it empty for the running
	// and waiting-for-confirmation phases.
	Reason string
	// Message is a human-readable explanation accompanying Reason. Observe
	// leaves it empty for the running and waiting-for-confirmation phases.
	Message string
}
// Observe classifies a checkpoint Job into an Observation using the Job's
// conditions and its checkpoint-status annotation.
//
// Only conditions whose status is true are considered. Precedence, highest
// first:
//  1. annotation == failed: Failed ("CheckpointVerificationFailed" when the Job
//     is also complete, otherwise "JobFailed");
//  2. Job complete and annotation == completed: Ready;
//  3. Job complete and checkpointWorkerActive: WaitingForConfirmation;
//  4. Job complete otherwise: Failed ("CheckpointVerificationFailed");
//  5. Job failed: Failed ("JobFailed");
//  6. anything else: Running.
//
// job must be non-nil.
func Observe(job *batchv1.Job, checkpointWorkerActive bool) Observation {
	var complete, failed bool
	for i := range job.Status.Conditions {
		cond := &job.Status.Conditions[i]
		if cond.Status != corev1.ConditionTrue {
			continue
		}
		switch cond.Type {
		case batchv1.JobComplete:
			complete = true
		case batchv1.JobFailed:
			failed = true
		}
	}

	agentStatus := job.Annotations[snapshotprotocol.CheckpointStatusAnnotation]
	agentFailed := agentStatus == snapshotprotocol.CheckpointStatusFailed
	agentCompleted := agentStatus == snapshotprotocol.CheckpointStatusCompleted

	switch {
	case agentFailed && complete:
		// The Job itself succeeded, but the agent rejected the checkpoint.
		return Observation{
			Phase:   ObservationPhaseFailed,
			Reason:  "CheckpointVerificationFailed",
			Message: "Checkpoint job completed but snapshot-agent reported checkpoint failure",
		}
	case agentFailed:
		return Observation{
			Phase:   ObservationPhaseFailed,
			Reason:  "JobFailed",
			Message: "Checkpoint job failed",
		}
	case complete && agentCompleted:
		return Observation{
			Phase:   ObservationPhaseReady,
			Reason:  "JobSucceeded",
			Message: "Checkpoint job completed successfully",
		}
	case complete && checkpointWorkerActive:
		// The annotation may still arrive while the worker is active.
		return Observation{Phase: ObservationPhaseWaitingForConfirmation}
	case complete:
		return Observation{
			Phase:   ObservationPhaseFailed,
			Reason:  "CheckpointVerificationFailed",
			Message: "Checkpoint job completed without snapshot-agent completion confirmation",
		}
	case failed:
		return Observation{
			Phase:   ObservationPhaseFailed,
			Reason:  "JobFailed",
			Message: "Checkpoint job failed",
		}
	}

	return Observation{Phase: ObservationPhaseRunning}
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package checkpointjob
import (
"testing"
snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// TestObserve is a table-driven test of Observe covering every decision branch:
// running, ready, waiting for confirmation, unconfirmed completion, failure
// reported via annotation (with and without Job completion), failure reported
// via the Job's Failed condition, and conditions whose status is not true.
func TestObserve(t *testing.T) {
	// makeJob builds a Job with the given conditions; a non-empty annotation is
	// stored as the checkpoint-status annotation.
	makeJob := func(annotation string, conditions ...batchv1.JobCondition) *batchv1.Job {
		job := &batchv1.Job{
			ObjectMeta: metav1.ObjectMeta{
				Annotations: map[string]string{},
			},
			Status: batchv1.JobStatus{
				Conditions: conditions,
			},
		}
		if annotation != "" {
			job.Annotations[snapshotprotocol.CheckpointStatusAnnotation] = annotation
		}
		return job
	}

	tests := []struct {
		name                   string
		job                    *batchv1.Job
		checkpointWorkerActive bool
		wantPhase              ObservationPhase
		wantReason             string
		wantMessage            string
	}{
		{
			name:      "running job stays running",
			job:       makeJob(""),
			wantPhase: ObservationPhaseRunning,
		},
		{
			name: "conditions that are not true are ignored",
			job: makeJob(
				"",
				batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionFalse},
				batchv1.JobCondition{Type: batchv1.JobFailed, Status: corev1.ConditionFalse},
			),
			wantPhase: ObservationPhaseRunning,
		},
		{
			name: "completed job with completion annotation is ready",
			job: makeJob(
				snapshotprotocol.CheckpointStatusCompleted,
				batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
			),
			wantPhase:   ObservationPhaseReady,
			wantReason:  "JobSucceeded",
			wantMessage: "Checkpoint job completed successfully",
		},
		{
			name: "completed job waits for terminal confirmation while worker is active",
			job: makeJob(
				"",
				batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
			),
			checkpointWorkerActive: true,
			wantPhase:              ObservationPhaseWaitingForConfirmation,
		},
		{
			name: "completed job fails without confirmation once worker is inactive",
			job: makeJob(
				"",
				batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
			),
			wantPhase:   ObservationPhaseFailed,
			wantReason:  "CheckpointVerificationFailed",
			wantMessage: "Checkpoint job completed without snapshot-agent completion confirmation",
		},
		{
			name: "failed checkpoint annotation wins over completed job",
			job: makeJob(
				snapshotprotocol.CheckpointStatusFailed,
				batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue},
			),
			checkpointWorkerActive: true,
			wantPhase:              ObservationPhaseFailed,
			wantReason:             "CheckpointVerificationFailed",
			wantMessage:            "Checkpoint job completed but snapshot-agent reported checkpoint failure",
		},
		{
			name:        "failed checkpoint annotation without job completion reports job failure",
			job:         makeJob(snapshotprotocol.CheckpointStatusFailed),
			wantPhase:   ObservationPhaseFailed,
			wantReason:  "JobFailed",
			wantMessage: "Checkpoint job failed",
		},
		{
			name: "failed job condition reports job failure",
			job: makeJob(
				"",
				batchv1.JobCondition{Type: batchv1.JobFailed, Status: corev1.ConditionTrue},
			),
			checkpointWorkerActive: true,
			wantPhase:              ObservationPhaseFailed,
			wantReason:             "JobFailed",
			wantMessage:            "Checkpoint job failed",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			observation := Observe(tc.job, tc.checkpointWorkerActive)
			if observation.Phase != tc.wantPhase {
				t.Fatalf("phase = %q, want %q", observation.Phase, tc.wantPhase)
			}
			if observation.Reason != tc.wantReason {
				t.Fatalf("reason = %q, want %q", observation.Reason, tc.wantReason)
			}
			if observation.Message != tc.wantMessage {
				t.Fatalf("message = %q, want %q", observation.Message, tc.wantMessage)
			}
		})
	}
}
...@@ -140,17 +140,16 @@ const ( ...@@ -140,17 +140,16 @@ const (
ResourceStateUnknown = "unknown" ResourceStateUnknown = "unknown"
// Checkpoint/restore constants // Checkpoint/restore constants
// CROSS-REFERENCE: Some constants below are duplicated in the snapshot package at // CROSS-REFERENCE: Some constants below are duplicated in deploy/snapshot/protocol.
// deploy/snapshot/pkg/config/constants.go. If you change a value here, update there too. // If you change a value here, update there too.
// Kubernetes labels // Kubernetes labels
KubeLabelIsCheckpointSource = "nvidia.com/snapshot-is-checkpoint-source" // Pod label that triggers DaemonSet auto-checkpoint KubeLabelIsCheckpointSource = "nvidia.com/snapshot-is-checkpoint-source" // Pod label that triggers DaemonSet auto-checkpoint
KubeLabelCheckpointHash = "nvidia.com/snapshot-checkpoint-hash" // Checkpoint identity hash used for lookup/reuse (may differ from DynamoCheckpoint metadata.name) KubeLabelCheckpointID = "nvidia.com/snapshot-checkpoint-id" // Checkpoint identity label; the operator stores the resolved identity hash as the value
KubeLabelIsRestoreTarget = "nvidia.com/snapshot-is-restore-target" // Pod label that triggers DaemonSet auto-restore KubeLabelIsRestoreTarget = "nvidia.com/snapshot-is-restore-target" // Pod label that triggers DaemonSet auto-restore
KubeAnnotationCheckpointArtifactVersion = "nvidia.com/snapshot-artifact-version" // Checkpoint artifact generation; changing it triggers a new immutable capture attempt KubeAnnotationCheckpointArtifactVersion = "nvidia.com/snapshot-artifact-version" // Checkpoint artifact generation; changing it triggers a new immutable capture attempt
DefaultCheckpointArtifactVersion = "1" DefaultCheckpointArtifactVersion = "1"
KubeAnnotationCheckpointLocation = "nvidia.com/snapshot-checkpoint-location" // Pod annotation that tells snapshot-agent where the checkpoint lives DefaultCheckpointJobTTLSeconds = int32(300)
KubeAnnotationCheckpointStorageType = "nvidia.com/snapshot-checkpoint-storage-type" // Pod annotation that tells snapshot-agent which storage backend owns the checkpoint
// Environment variables injected into pods // Environment variables injected into pods
EnvReadyForCheckpointFile = "DYN_READY_FOR_CHECKPOINT_FILE" // Ready-for-checkpoint file path — checkpoint job pods EnvReadyForCheckpointFile = "DYN_READY_FOR_CHECKPOINT_FILE" // Ready-for-checkpoint file path — checkpoint job pods
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment