Unverified commit 43e810a4, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
......@@ -11,8 +11,8 @@ import (
"github.com/go-logr/logr"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// RestoreLogFilename is the CRIU restore log filename (also used by executor/restore.go).
......
......@@ -9,7 +9,7 @@ import (
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// parseManageCgroupsMode normalizes and validates the CRIU cgroup mode setting.
......
......@@ -5,7 +5,7 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
func TestParseManageCgroupsMode(t *testing.T) {
......
......@@ -10,7 +10,7 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
)
const (
......@@ -57,7 +57,15 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr.
args = append(args, "--device-map", deviceMap)
}
cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, args...)
details := common.ReadProcessDetailsOrDefault("/proc", pid)
details := snapshotruntime.ProcessDetails{
ObservedPID: pid,
OutermostPID: pid,
InnermostPID: pid,
NamespacePIDs: []int{pid},
}
if process, err := snapshotruntime.ReadProcessDetails("/proc", pid); err == nil {
details = process
}
start := time.Now()
output, err := cmd.CombinedOutput()
duration := time.Since(start)
......
......@@ -7,27 +7,27 @@ import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/containerd/containerd"
"github.com/go-logr/logr"
"k8s.io/client-go/kubernetes"
"github.com/google/uuid"
"k8s.io/client-go/kubernetes"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// CheckpointRequest holds per-checkpoint identifiers for a checkpoint operation.
type CheckpointRequest struct {
ContainerID string
ContainerName string
CheckpointHash string
CheckpointID string
CheckpointLocation string
CheckpointStorageType string
NodeName string
PodName string
PodNamespace string
......@@ -44,8 +44,8 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r
checkpointStart := time.Now()
log.Info("=== Starting checkpoint operation ===")
if req.CheckpointStorageType != "pvc" {
return fmt.Errorf("checkpoint storage type %q is not supported", req.CheckpointStorageType)
if strings.TrimSpace(req.CheckpointID) == "" {
return fmt.Errorf("checkpoint ID is required")
}
if req.CheckpointLocation == "" {
return fmt.Errorf("checkpoint location is required")
......@@ -99,33 +99,33 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r
func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req CheckpointRequest) (*types.CheckpointContainerSnapshot, error) {
containerID := req.ContainerID
pid, ociSpec, err := common.ResolveContainer(ctx, ctrd, containerID)
pid, ociSpec, err := snapshotruntime.ResolveContainer(ctx, ctrd, containerID)
if err != nil {
return nil, fmt.Errorf("failed to resolve container: %w", err)
}
var hostCgroupPath string
if cgPath, err := common.ResolveCgroupRootFromHostPID(pid); err == nil && cgPath != "" {
hostCgroupPath = filepath.Join(common.HostCgroupPath, cgPath)
if cgPath, err := snapshotruntime.ResolveCgroupRootFromHostPID(pid); err == nil && cgPath != "" {
hostCgroupPath = filepath.Join(snapshotruntime.HostCgroupPath, cgPath)
}
rootFS, err := common.GetRootFS(pid)
rootFS, err := snapshotruntime.GetRootFS(pid)
if err != nil {
return nil, fmt.Errorf("failed to get rootfs: %w", err)
}
upperDir, err := common.GetOverlayUpperDir(pid)
upperDir, err := snapshotruntime.GetOverlayUpperDir(pid)
if err != nil {
return nil, fmt.Errorf("failed to get overlay upperdir: %w", err)
}
mountInfo, err := common.ReadMountInfo(pid)
mountInfo, err := snapshotruntime.ReadMountInfo(pid)
if err != nil {
return nil, fmt.Errorf("failed to parse mountinfo: %w", err)
}
mounts := common.ClassifyMounts(mountInfo, ociSpec, rootFS)
mounts := snapshotruntime.ClassifyMounts(mountInfo, ociSpec, rootFS)
netNSInode, err := common.GetNetNSInode(pid)
netNSInode, err := snapshotruntime.GetNetNSInode(pid)
if err != nil {
return nil, fmt.Errorf("failed to get net namespace inode: %w", err)
}
......@@ -133,7 +133,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
// Read stdio FD targets (like runc's getPipeFds / descriptors.json).
stdioFDs := make([]string, 3)
for i := range 3 {
target, err := os.Readlink(fmt.Sprintf("%s/%d/fd/%d", common.HostProcPath, pid, i))
target, err := os.Readlink(fmt.Sprintf("%s/%d/fd/%d", snapshotruntime.HostProcPath, pid, i))
if err != nil {
log.V(1).Info("Failed to readlink stdio FD", "fd", i, "error", err)
continue
......@@ -142,11 +142,11 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
}
// Discover CUDA processes and GPU UUIDs
allPIDs := common.ProcessTreePIDs(pid)
allPIDs := snapshotruntime.ProcessTreePIDs(pid)
cudaHostPIDs := cuda.FilterProcesses(ctx, allPIDs, log)
cudaNamespacePIDs := make([]int, 0, len(cudaHostPIDs))
for _, cudaHostPID := range cudaHostPIDs {
process, err := common.ReadProcessDetails(common.HostProcPath, cudaHostPID)
process, err := snapshotruntime.ReadProcessDetails(snapshotruntime.HostProcPath, cudaHostPID)
if err != nil {
return nil, fmt.Errorf("failed to read process details for CUDA process %d: %w", cudaHostPID, err)
}
......@@ -166,7 +166,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
}
if len(gpuUUIDs) == 0 {
log.Info("PodResources API returned no GPU UUIDs, falling back to nvidia-smi", "pid", pid)
gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, common.HostProcPath, pid)
gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, pid)
if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err)
}
......@@ -202,7 +202,7 @@ func configureCheckpoint(
}
m := types.NewCheckpointManifest(
req.CheckpointHash,
req.CheckpointID,
types.NewCRIUDumpManifest(criuOpts, cfg.CRIU),
types.NewSourcePodManifest(req.ContainerID, state.PID, req.NodeName, req.PodName, req.PodNamespace, state.StdioFDs),
types.NewOverlayManifest(cfg.Overlay, state.UpperDir, state.OCISpec),
......@@ -235,10 +235,10 @@ func captureCheckpoint(ctx context.Context, criuOpts *criurpc.CriuOpts, criuSett
// propagated — a checkpoint without overlay diffs is still valid for restore
// (the base container image provides the filesystem).
if state.UpperDir != "" {
if _, err := common.CaptureRootfsDiff(state.UpperDir, checkpointDir, data.Overlay.Exclusions, data.Overlay.BindMountDests); err != nil {
if _, err := snapshotruntime.CaptureRootfsDiff(state.UpperDir, checkpointDir, data.Overlay.Exclusions, data.Overlay.BindMountDests); err != nil {
log.Error(err, "Failed to capture rootfs diff")
}
if _, err := common.CaptureDeletedFiles(state.UpperDir, checkpointDir); err != nil {
if _, err := snapshotruntime.CaptureDeletedFiles(state.UpperDir, checkpointDir); err != nil {
log.Error(err, "Failed to capture deleted files")
}
}
......
......@@ -9,7 +9,7 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
)
// LogProcessDiagnostics logs process state and CRIU restore log for debugging a failed restore.
......@@ -30,7 +30,7 @@ func LogProcessDiagnostics(procRoot string, pid int, restoreLogPath string, log
// Exit code from /proc/stat
if data, err := os.ReadFile(filepath.Join(procRoot, pidStr, "stat")); err == nil {
if ws, err := common.ParseProcExitCode(string(data)); err == nil {
if ws, err := snapshotruntime.ParseProcExitCode(string(data)); err == nil {
entry.Info("Process exit code", "exit_status", ws.ExitStatus(), "term_signal", ws.Signal(), "core_dumped", ws.CoreDump())
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment