/* * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package checkpoint import ( "context" "fmt" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" ) type CheckpointInfo struct { Enabled bool Exists bool Identity *nvidiacomv1alpha1.DynamoCheckpointIdentity Hash string ArtifactVersion string CheckpointName string Ready bool } func checkpointInfoFromObject(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (*CheckpointInfo, error) { hash, err := checkpointIdentityHash(ckpt) if err != nil { return nil, err } return &CheckpointInfo{ Enabled: true, Exists: true, Identity: &ckpt.Spec.Identity, Hash: hash, ArtifactVersion: checkpointArtifactVersion(ckpt), CheckpointName: ckpt.Name, Ready: ckpt.Status.Phase == nvidiacomv1alpha1.DynamoCheckpointPhaseReady, }, nil } func checkpointArtifactVersion(ckpt *nvidiacomv1alpha1.DynamoCheckpoint) string { if ckpt == nil { return snapshotprotocol.DefaultCheckpointArtifactVersion } return snapshotprotocol.ArtifactVersion(ckpt.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]) } func ResolveCheckpointForService( ctx context.Context, c client.Client, namespace string, config *nvidiacomv1alpha1.ServiceCheckpointConfig, ) (*CheckpointInfo, error) { switch { case config == nil || !config.Enabled: return &CheckpointInfo{Enabled: false}, nil case config.CheckpointRef != nil && *config.CheckpointRef != "": ckpt := &nvidiacomv1alpha1.DynamoCheckpoint{} if err := c.Get(ctx, types.NamespacedName{ Namespace: namespace, Name: *config.CheckpointRef, }, ckpt); err != nil { return nil, fmt.Errorf("failed to get referenced checkpoint %s: %w", *config.CheckpointRef, err) } return checkpointInfoFromObject(ckpt) case config.Identity == nil: return nil, fmt.Errorf("checkpoint enabled but no checkpointRef or identity provided") } hash, err := ComputeIdentityHash(*config.Identity) if err != nil { return nil, fmt.Errorf("failed to compute identity hash: %w", err) } existing, err := FindCheckpointByIdentityHash(ctx, c, namespace, hash, "") if err != nil { return nil, err } if existing == nil { return &CheckpointInfo{ Enabled: true, Identity: config.Identity, Hash: hash, }, nil } info, err := checkpointInfoFromObject(existing) if err != nil { return nil, err } info.Identity = config.Identity return info, nil }