Unverified commit 43e810a4, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
......@@ -11,8 +11,8 @@ import (
"github.com/go-logr/logr"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// RestoreLogFilename is the CRIU restore log filename (also used by executor/restore.go).
......
......@@ -9,7 +9,7 @@ import (
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// parseManageCgroupsMode normalizes and validates the CRIU cgroup mode setting.
......
......@@ -5,7 +5,7 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
func TestParseManageCgroupsMode(t *testing.T) {
......
......@@ -10,7 +10,7 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
)
const (
......@@ -57,7 +57,15 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr.
args = append(args, "--device-map", deviceMap)
}
cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, args...)
details := common.ReadProcessDetailsOrDefault("/proc", pid)
details := snapshotruntime.ProcessDetails{
ObservedPID: pid,
OutermostPID: pid,
InnermostPID: pid,
NamespacePIDs: []int{pid},
}
if process, err := snapshotruntime.ReadProcessDetails("/proc", pid); err == nil {
details = process
}
start := time.Now()
output, err := cmd.CombinedOutput()
duration := time.Since(start)
......
......@@ -7,27 +7,27 @@ import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/containerd/containerd"
"github.com/go-logr/logr"
"k8s.io/client-go/kubernetes"
"github.com/google/uuid"
"k8s.io/client-go/kubernetes"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// CheckpointRequest holds per-checkpoint identifiers for a checkpoint operation.
type CheckpointRequest struct {
ContainerID string
ContainerName string
CheckpointHash string
CheckpointID string
CheckpointLocation string
CheckpointStorageType string
NodeName string
PodName string
PodNamespace string
......@@ -44,8 +44,8 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r
checkpointStart := time.Now()
log.Info("=== Starting checkpoint operation ===")
if req.CheckpointStorageType != "pvc" {
return fmt.Errorf("checkpoint storage type %q is not supported", req.CheckpointStorageType)
if strings.TrimSpace(req.CheckpointID) == "" {
return fmt.Errorf("checkpoint ID is required")
}
if req.CheckpointLocation == "" {
return fmt.Errorf("checkpoint location is required")
......@@ -99,33 +99,33 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r
func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req CheckpointRequest) (*types.CheckpointContainerSnapshot, error) {
containerID := req.ContainerID
pid, ociSpec, err := common.ResolveContainer(ctx, ctrd, containerID)
pid, ociSpec, err := snapshotruntime.ResolveContainer(ctx, ctrd, containerID)
if err != nil {
return nil, fmt.Errorf("failed to resolve container: %w", err)
}
var hostCgroupPath string
if cgPath, err := common.ResolveCgroupRootFromHostPID(pid); err == nil && cgPath != "" {
hostCgroupPath = filepath.Join(common.HostCgroupPath, cgPath)
if cgPath, err := snapshotruntime.ResolveCgroupRootFromHostPID(pid); err == nil && cgPath != "" {
hostCgroupPath = filepath.Join(snapshotruntime.HostCgroupPath, cgPath)
}
rootFS, err := common.GetRootFS(pid)
rootFS, err := snapshotruntime.GetRootFS(pid)
if err != nil {
return nil, fmt.Errorf("failed to get rootfs: %w", err)
}
upperDir, err := common.GetOverlayUpperDir(pid)
upperDir, err := snapshotruntime.GetOverlayUpperDir(pid)
if err != nil {
return nil, fmt.Errorf("failed to get overlay upperdir: %w", err)
}
mountInfo, err := common.ReadMountInfo(pid)
mountInfo, err := snapshotruntime.ReadMountInfo(pid)
if err != nil {
return nil, fmt.Errorf("failed to parse mountinfo: %w", err)
}
mounts := common.ClassifyMounts(mountInfo, ociSpec, rootFS)
mounts := snapshotruntime.ClassifyMounts(mountInfo, ociSpec, rootFS)
netNSInode, err := common.GetNetNSInode(pid)
netNSInode, err := snapshotruntime.GetNetNSInode(pid)
if err != nil {
return nil, fmt.Errorf("failed to get net namespace inode: %w", err)
}
......@@ -133,7 +133,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
// Read stdio FD targets (like runc's getPipeFds / descriptors.json).
stdioFDs := make([]string, 3)
for i := range 3 {
target, err := os.Readlink(fmt.Sprintf("%s/%d/fd/%d", common.HostProcPath, pid, i))
target, err := os.Readlink(fmt.Sprintf("%s/%d/fd/%d", snapshotruntime.HostProcPath, pid, i))
if err != nil {
log.V(1).Info("Failed to readlink stdio FD", "fd", i, "error", err)
continue
......@@ -142,11 +142,11 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
}
// Discover CUDA processes and GPU UUIDs
allPIDs := common.ProcessTreePIDs(pid)
allPIDs := snapshotruntime.ProcessTreePIDs(pid)
cudaHostPIDs := cuda.FilterProcesses(ctx, allPIDs, log)
cudaNamespacePIDs := make([]int, 0, len(cudaHostPIDs))
for _, cudaHostPID := range cudaHostPIDs {
process, err := common.ReadProcessDetails(common.HostProcPath, cudaHostPID)
process, err := snapshotruntime.ReadProcessDetails(snapshotruntime.HostProcPath, cudaHostPID)
if err != nil {
return nil, fmt.Errorf("failed to read process details for CUDA process %d: %w", cudaHostPID, err)
}
......@@ -166,7 +166,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
}
if len(gpuUUIDs) == 0 {
log.Info("PodResources API returned no GPU UUIDs, falling back to nvidia-smi", "pid", pid)
gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, common.HostProcPath, pid)
gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, pid)
if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err)
}
......@@ -202,7 +202,7 @@ func configureCheckpoint(
}
m := types.NewCheckpointManifest(
req.CheckpointHash,
req.CheckpointID,
types.NewCRIUDumpManifest(criuOpts, cfg.CRIU),
types.NewSourcePodManifest(req.ContainerID, state.PID, req.NodeName, req.PodName, req.PodNamespace, state.StdioFDs),
types.NewOverlayManifest(cfg.Overlay, state.UpperDir, state.OCISpec),
......@@ -235,10 +235,10 @@ func captureCheckpoint(ctx context.Context, criuOpts *criurpc.CriuOpts, criuSett
// propagated — a checkpoint without overlay diffs is still valid for restore
// (the base container image provides the filesystem).
if state.UpperDir != "" {
if _, err := common.CaptureRootfsDiff(state.UpperDir, checkpointDir, data.Overlay.Exclusions, data.Overlay.BindMountDests); err != nil {
if _, err := snapshotruntime.CaptureRootfsDiff(state.UpperDir, checkpointDir, data.Overlay.Exclusions, data.Overlay.BindMountDests); err != nil {
log.Error(err, "Failed to capture rootfs diff")
}
if _, err := common.CaptureDeletedFiles(state.UpperDir, checkpointDir); err != nil {
if _, err := snapshotruntime.CaptureDeletedFiles(state.UpperDir, checkpointDir); err != nil {
log.Error(err, "Failed to capture deleted files")
}
}
......
......@@ -9,10 +9,10 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// RestoreOptions holds configuration for an in-namespace restore.
......@@ -59,10 +59,10 @@ func RestoreInNamespace(ctx context.Context, opts RestoreOptions, log logr.Logge
func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.CheckpointManifest, opts RestoreOptions, log logr.Logger) (int, error) {
// Apply rootfs diff inside the namespace (target root is /)
if err := common.ApplyRootfsDiff(opts.CheckpointPath, "/", log); err != nil {
if err := snapshotruntime.ApplyRootfsDiff(opts.CheckpointPath, "/", log); err != nil {
return 0, fmt.Errorf("rootfs diff failed: %w", err)
}
if err := common.ApplyDeletedFiles(opts.CheckpointPath, "/", log); err != nil {
if err := snapshotruntime.ApplyDeletedFiles(opts.CheckpointPath, "/", log); err != nil {
log.Error(err, "Failed to apply deleted files")
}
......@@ -71,11 +71,11 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
return 0, fmt.Errorf("failed to unmount /dev/shm before restore: %w", err)
}
if err := common.RemountProcSys(true); err != nil {
if err := snapshotruntime.RemountProcSys(true); err != nil {
return 0, fmt.Errorf("failed to remount /proc/sys read-write for restore: %w", err)
}
defer func() {
if err := common.RemountProcSys(false); err != nil {
if err := snapshotruntime.RemountProcSys(false); err != nil {
log.Error(err, "Failed to remount /proc/sys read-only after restore")
}
}()
......@@ -85,7 +85,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
if err != nil {
return 0, err
}
processes, err := common.ReadProcessTable("/proc")
processes, err := snapshotruntime.ReadProcessTable("/proc")
if err != nil {
return 0, fmt.Errorf("failed to read restored process table: %w", err)
}
......@@ -109,7 +109,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
// CUDA restore — remap checkpoint-time innermost namespace PIDs onto the
// current visible restored PIDs before invoking cuda-checkpoint.
if !m.CUDA.IsEmpty() {
restorePIDs, err := common.ResolveManifestPIDsToObservedPIDs(processes, int(restoredPID), m.CUDA.PIDs)
restorePIDs, err := snapshotruntime.ResolveManifestPIDsToObservedPIDs(processes, int(restoredPID), m.CUDA.PIDs)
if err != nil {
return 0, fmt.Errorf("failed to resolve restored CUDA PIDs: %w", err)
}
......
......@@ -16,18 +16,17 @@ import (
"github.com/go-logr/logr"
"k8s.io/client-go/kubernetes"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// RestoreRequest holds the parameters for a restore operation.
type RestoreRequest struct {
CheckpointHash string
CheckpointID string
CheckpointLocation string
CheckpointStorageType string
NSRestorePath string
PodName string
PodNamespace string
......@@ -42,7 +41,7 @@ type RestoreRequest struct {
func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req RestoreRequest) (int, error) {
restoreStart := time.Now()
log.Info("=== Starting external restore ===",
"checkpoint_hash", req.CheckpointHash,
"checkpoint_id", req.CheckpointID,
"pod", req.PodName,
"namespace", req.PodNamespace,
"container", req.ContainerName,
......@@ -63,7 +62,7 @@ func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req
// Validate restored process from the host side
procRoot := filepath.Join(snap.TargetRoot, "proc")
if err := common.ValidateProcessState(procRoot, restoredPID); err != nil {
if err := snapshotruntime.ValidateProcessState(procRoot, restoredPID); err != nil {
restoreLogPath := filepath.Join(snap.TargetRoot, "var", "criu-work", criu.RestoreLogFilename)
logging.LogProcessDiagnostics(procRoot, restoredPID, restoreLogPath, log)
return 0, fmt.Errorf("restored process failed post-restore validation: %w", err)
......@@ -75,9 +74,6 @@ func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req
}
func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req RestoreRequest) (*types.RestoreContainerSnapshot, error) {
if req.CheckpointStorageType != "pvc" {
return nil, fmt.Errorf("checkpoint storage type %q is not supported", req.CheckpointStorageType)
}
if req.CheckpointLocation == "" {
return nil, fmt.Errorf("checkpoint location is required")
}
......@@ -92,7 +88,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
return nil, fmt.Errorf("failed to resolve checkpoint path: %w", err)
}
if checkpointAbs != baseAbs && !strings.HasPrefix(checkpointAbs, baseAbs+string(os.PathSeparator)) {
return nil, fmt.Errorf("invalid checkpoint hash %q", req.CheckpointHash)
return nil, fmt.Errorf("invalid checkpoint id %q", req.CheckpointID)
}
m, err := types.ReadManifest(checkpointPath)
......@@ -105,13 +101,13 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
containerName = "main"
}
placeholderPID, _, err := common.ResolveContainerByPod(ctx, ctrd, req.PodName, req.PodNamespace, containerName)
placeholderPID, _, err := snapshotruntime.ResolveContainerByPod(ctx, ctrd, req.PodName, req.PodNamespace, containerName)
if err != nil {
return nil, fmt.Errorf("failed to resolve placeholder container: %w", err)
}
log.Info("Resolved placeholder container", "pid", placeholderPID)
cgroupRoot, err := common.ResolveCgroupRootFromHostPID(placeholderPID)
cgroupRoot, err := snapshotruntime.ResolveCgroupRootFromHostPID(placeholderPID)
if err != nil {
log.Error(err, "Failed to resolve placeholder cgroup root; proceeding without explicit cgroup remap")
cgroupRoot = ""
......@@ -128,7 +124,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
}
if len(targetGPUUUIDs) == 0 {
log.Info("PodResources API returned no target GPU UUIDs, falling back to nvidia-smi", "pid", placeholderPID)
targetGPUUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, common.HostProcPath, placeholderPID)
targetGPUUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, placeholderPID)
if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed for restore target: %w", err)
}
......@@ -151,7 +147,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
return &types.RestoreContainerSnapshot{
CheckpointPath: checkpointPath,
PlaceholderPID: placeholderPID,
TargetRoot: fmt.Sprintf("%s/%d/root", common.HostProcPath, placeholderPID),
TargetRoot: fmt.Sprintf("%s/%d/root", snapshotruntime.HostProcPath, placeholderPID),
CgroupRoot: cgroupRoot,
CUDADeviceMap: cudaDeviceMap,
}, nil
......
......@@ -9,7 +9,7 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
)
// LogProcessDiagnostics logs process state and CRIU restore log for debugging a failed restore.
......@@ -30,7 +30,7 @@ func LogProcessDiagnostics(procRoot string, pid int, restoreLogPath string, log
// Exit code from <procRoot>/<pid>/stat
if data, err := os.ReadFile(filepath.Join(procRoot, pidStr, "stat")); err == nil {
if ws, err := common.ParseProcExitCode(string(data)); err == nil {
if ws, err := snapshotruntime.ParseProcExitCode(string(data)); err == nil {
entry.Info("Process exit code", "exit_status", ws.ExitStatus(), "term_signal", ws.Signal(), "core_dumped", ws.CoreDump())
}
}
......
package common
package runtime
import (
"fmt"
......@@ -10,7 +10,7 @@ import (
"github.com/moby/sys/mountinfo"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// ReadMountInfo reads and parses mountinfo for a container process via /host/proc.
......
package common
package runtime
import (
"os"
......@@ -7,7 +7,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
func TestClassifyMounts(t *testing.T) {
......
// Package common provides low-level container, process, and device primitives
// shared across snapshot packages.
package common
// Package runtime provides low-level host and container-runtime primitives for snapshot execution.
package runtime
import (
"context"
......
package common
package runtime
import (
"encoding/json"
......@@ -10,7 +10,7 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
const (
......
package common
package runtime
import (
"encoding/json"
......@@ -8,7 +8,7 @@ import (
"github.com/go-logr/logr/testr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
func TestBuildExclusions(t *testing.T) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment