Unverified commit 43e810a4, authored by Schwinn Saereesitthipitak and committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
......@@ -11,8 +11,8 @@ import (
"github.com/go-logr/logr"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// RestoreLogFilename is the CRIU restore log filename (also used by executor/restore.go).
......
......@@ -9,7 +9,7 @@ import (
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// parseManageCgroupsMode normalizes and validates the CRIU cgroup mode setting.
......
......@@ -5,7 +5,7 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
func TestParseManageCgroupsMode(t *testing.T) {
......
......@@ -10,7 +10,7 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
)
const (
......@@ -57,7 +57,15 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr.
args = append(args, "--device-map", deviceMap)
}
cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, args...)
details := common.ReadProcessDetailsOrDefault("/proc", pid)
details := snapshotruntime.ProcessDetails{
ObservedPID: pid,
OutermostPID: pid,
InnermostPID: pid,
NamespacePIDs: []int{pid},
}
if process, err := snapshotruntime.ReadProcessDetails("/proc", pid); err == nil {
details = process
}
start := time.Now()
output, err := cmd.CombinedOutput()
duration := time.Since(start)
......
......@@ -7,31 +7,31 @@ import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/containerd/containerd"
"github.com/go-logr/logr"
"k8s.io/client-go/kubernetes"
"github.com/google/uuid"
"k8s.io/client-go/kubernetes"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// CheckpointRequest holds per-checkpoint identifiers for a checkpoint operation.
type CheckpointRequest struct {
ContainerID string
ContainerName string
CheckpointHash string
CheckpointLocation string
CheckpointStorageType string
NodeName string
PodName string
PodNamespace string
Clientset kubernetes.Interface
ContainerID string
ContainerName string
CheckpointID string
CheckpointLocation string
NodeName string
PodName string
PodNamespace string
Clientset kubernetes.Interface
}
// Checkpoint performs a CRIU dump of a container.
......@@ -44,8 +44,8 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r
checkpointStart := time.Now()
log.Info("=== Starting checkpoint operation ===")
if req.CheckpointStorageType != "pvc" {
return fmt.Errorf("checkpoint storage type %q is not supported", req.CheckpointStorageType)
if strings.TrimSpace(req.CheckpointID) == "" {
return fmt.Errorf("checkpoint ID is required")
}
if req.CheckpointLocation == "" {
return fmt.Errorf("checkpoint location is required")
......@@ -99,33 +99,33 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r
func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req CheckpointRequest) (*types.CheckpointContainerSnapshot, error) {
containerID := req.ContainerID
pid, ociSpec, err := common.ResolveContainer(ctx, ctrd, containerID)
pid, ociSpec, err := snapshotruntime.ResolveContainer(ctx, ctrd, containerID)
if err != nil {
return nil, fmt.Errorf("failed to resolve container: %w", err)
}
var hostCgroupPath string
if cgPath, err := common.ResolveCgroupRootFromHostPID(pid); err == nil && cgPath != "" {
hostCgroupPath = filepath.Join(common.HostCgroupPath, cgPath)
if cgPath, err := snapshotruntime.ResolveCgroupRootFromHostPID(pid); err == nil && cgPath != "" {
hostCgroupPath = filepath.Join(snapshotruntime.HostCgroupPath, cgPath)
}
rootFS, err := common.GetRootFS(pid)
rootFS, err := snapshotruntime.GetRootFS(pid)
if err != nil {
return nil, fmt.Errorf("failed to get rootfs: %w", err)
}
upperDir, err := common.GetOverlayUpperDir(pid)
upperDir, err := snapshotruntime.GetOverlayUpperDir(pid)
if err != nil {
return nil, fmt.Errorf("failed to get overlay upperdir: %w", err)
}
mountInfo, err := common.ReadMountInfo(pid)
mountInfo, err := snapshotruntime.ReadMountInfo(pid)
if err != nil {
return nil, fmt.Errorf("failed to parse mountinfo: %w", err)
}
mounts := common.ClassifyMounts(mountInfo, ociSpec, rootFS)
mounts := snapshotruntime.ClassifyMounts(mountInfo, ociSpec, rootFS)
netNSInode, err := common.GetNetNSInode(pid)
netNSInode, err := snapshotruntime.GetNetNSInode(pid)
if err != nil {
return nil, fmt.Errorf("failed to get net namespace inode: %w", err)
}
......@@ -133,7 +133,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
// Read stdio FD targets (like runc's getPipeFds / descriptors.json).
stdioFDs := make([]string, 3)
for i := range 3 {
target, err := os.Readlink(fmt.Sprintf("%s/%d/fd/%d", common.HostProcPath, pid, i))
target, err := os.Readlink(fmt.Sprintf("%s/%d/fd/%d", snapshotruntime.HostProcPath, pid, i))
if err != nil {
log.V(1).Info("Failed to readlink stdio FD", "fd", i, "error", err)
continue
......@@ -142,11 +142,11 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
}
// Discover CUDA processes and GPU UUIDs
allPIDs := common.ProcessTreePIDs(pid)
allPIDs := snapshotruntime.ProcessTreePIDs(pid)
cudaHostPIDs := cuda.FilterProcesses(ctx, allPIDs, log)
cudaNamespacePIDs := make([]int, 0, len(cudaHostPIDs))
for _, cudaHostPID := range cudaHostPIDs {
process, err := common.ReadProcessDetails(common.HostProcPath, cudaHostPID)
process, err := snapshotruntime.ReadProcessDetails(snapshotruntime.HostProcPath, cudaHostPID)
if err != nil {
return nil, fmt.Errorf("failed to read process details for CUDA process %d: %w", cudaHostPID, err)
}
......@@ -166,7 +166,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
}
if len(gpuUUIDs) == 0 {
log.Info("PodResources API returned no GPU UUIDs, falling back to nvidia-smi", "pid", pid)
gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, common.HostProcPath, pid)
gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, pid)
if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err)
}
......@@ -202,7 +202,7 @@ func configureCheckpoint(
}
m := types.NewCheckpointManifest(
req.CheckpointHash,
req.CheckpointID,
types.NewCRIUDumpManifest(criuOpts, cfg.CRIU),
types.NewSourcePodManifest(req.ContainerID, state.PID, req.NodeName, req.PodName, req.PodNamespace, state.StdioFDs),
types.NewOverlayManifest(cfg.Overlay, state.UpperDir, state.OCISpec),
......@@ -235,10 +235,10 @@ func captureCheckpoint(ctx context.Context, criuOpts *criurpc.CriuOpts, criuSett
// propagated — a checkpoint without overlay diffs is still valid for restore
// (the base container image provides the filesystem).
if state.UpperDir != "" {
if _, err := common.CaptureRootfsDiff(state.UpperDir, checkpointDir, data.Overlay.Exclusions, data.Overlay.BindMountDests); err != nil {
if _, err := snapshotruntime.CaptureRootfsDiff(state.UpperDir, checkpointDir, data.Overlay.Exclusions, data.Overlay.BindMountDests); err != nil {
log.Error(err, "Failed to capture rootfs diff")
}
if _, err := common.CaptureDeletedFiles(state.UpperDir, checkpointDir); err != nil {
if _, err := snapshotruntime.CaptureDeletedFiles(state.UpperDir, checkpointDir); err != nil {
log.Error(err, "Failed to capture deleted files")
}
}
......
......@@ -9,10 +9,10 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// RestoreOptions holds configuration for an in-namespace restore.
......@@ -59,10 +59,10 @@ func RestoreInNamespace(ctx context.Context, opts RestoreOptions, log logr.Logge
func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.CheckpointManifest, opts RestoreOptions, log logr.Logger) (int, error) {
// Apply rootfs diff inside the namespace (target root is /)
if err := common.ApplyRootfsDiff(opts.CheckpointPath, "/", log); err != nil {
if err := snapshotruntime.ApplyRootfsDiff(opts.CheckpointPath, "/", log); err != nil {
return 0, fmt.Errorf("rootfs diff failed: %w", err)
}
if err := common.ApplyDeletedFiles(opts.CheckpointPath, "/", log); err != nil {
if err := snapshotruntime.ApplyDeletedFiles(opts.CheckpointPath, "/", log); err != nil {
log.Error(err, "Failed to apply deleted files")
}
......@@ -71,11 +71,11 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
return 0, fmt.Errorf("failed to unmount /dev/shm before restore: %w", err)
}
if err := common.RemountProcSys(true); err != nil {
if err := snapshotruntime.RemountProcSys(true); err != nil {
return 0, fmt.Errorf("failed to remount /proc/sys read-write for restore: %w", err)
}
defer func() {
if err := common.RemountProcSys(false); err != nil {
if err := snapshotruntime.RemountProcSys(false); err != nil {
log.Error(err, "Failed to remount /proc/sys read-only after restore")
}
}()
......@@ -85,7 +85,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
if err != nil {
return 0, err
}
processes, err := common.ReadProcessTable("/proc")
processes, err := snapshotruntime.ReadProcessTable("/proc")
if err != nil {
return 0, fmt.Errorf("failed to read restored process table: %w", err)
}
......@@ -109,7 +109,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
// CUDA restore — remap checkpoint-time innermost namespace PIDs onto the
// current visible restored PIDs before invoking cuda-checkpoint.
if !m.CUDA.IsEmpty() {
restorePIDs, err := common.ResolveManifestPIDsToObservedPIDs(processes, int(restoredPID), m.CUDA.PIDs)
restorePIDs, err := snapshotruntime.ResolveManifestPIDsToObservedPIDs(processes, int(restoredPID), m.CUDA.PIDs)
if err != nil {
return 0, fmt.Errorf("failed to resolve restored CUDA PIDs: %w", err)
}
......
......@@ -16,23 +16,22 @@ import (
"github.com/go-logr/logr"
"k8s.io/client-go/kubernetes"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// RestoreRequest holds the parameters for a restore operation.
type RestoreRequest struct {
CheckpointHash string
CheckpointLocation string
CheckpointStorageType string
NSRestorePath string
PodName string
PodNamespace string
ContainerName string
Clientset kubernetes.Interface
CheckpointID string
CheckpointLocation string
NSRestorePath string
PodName string
PodNamespace string
ContainerName string
Clientset kubernetes.Interface
}
// Restore performs external restore for the given request.
......@@ -42,7 +41,7 @@ type RestoreRequest struct {
func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req RestoreRequest) (int, error) {
restoreStart := time.Now()
log.Info("=== Starting external restore ===",
"checkpoint_hash", req.CheckpointHash,
"checkpoint_id", req.CheckpointID,
"pod", req.PodName,
"namespace", req.PodNamespace,
"container", req.ContainerName,
......@@ -63,7 +62,7 @@ func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req
// Validate restored process from the host side
procRoot := filepath.Join(snap.TargetRoot, "proc")
if err := common.ValidateProcessState(procRoot, restoredPID); err != nil {
if err := snapshotruntime.ValidateProcessState(procRoot, restoredPID); err != nil {
restoreLogPath := filepath.Join(snap.TargetRoot, "var", "criu-work", criu.RestoreLogFilename)
logging.LogProcessDiagnostics(procRoot, restoredPID, restoreLogPath, log)
return 0, fmt.Errorf("restored process failed post-restore validation: %w", err)
......@@ -75,9 +74,6 @@ func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req
}
func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req RestoreRequest) (*types.RestoreContainerSnapshot, error) {
if req.CheckpointStorageType != "pvc" {
return nil, fmt.Errorf("checkpoint storage type %q is not supported", req.CheckpointStorageType)
}
if req.CheckpointLocation == "" {
return nil, fmt.Errorf("checkpoint location is required")
}
......@@ -92,7 +88,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
return nil, fmt.Errorf("failed to resolve checkpoint path: %w", err)
}
if checkpointAbs != baseAbs && !strings.HasPrefix(checkpointAbs, baseAbs+string(os.PathSeparator)) {
return nil, fmt.Errorf("invalid checkpoint hash %q", req.CheckpointHash)
return nil, fmt.Errorf("invalid checkpoint id %q", req.CheckpointID)
}
m, err := types.ReadManifest(checkpointPath)
......@@ -105,13 +101,13 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
containerName = "main"
}
placeholderPID, _, err := common.ResolveContainerByPod(ctx, ctrd, req.PodName, req.PodNamespace, containerName)
placeholderPID, _, err := snapshotruntime.ResolveContainerByPod(ctx, ctrd, req.PodName, req.PodNamespace, containerName)
if err != nil {
return nil, fmt.Errorf("failed to resolve placeholder container: %w", err)
}
log.Info("Resolved placeholder container", "pid", placeholderPID)
cgroupRoot, err := common.ResolveCgroupRootFromHostPID(placeholderPID)
cgroupRoot, err := snapshotruntime.ResolveCgroupRootFromHostPID(placeholderPID)
if err != nil {
log.Error(err, "Failed to resolve placeholder cgroup root; proceeding without explicit cgroup remap")
cgroupRoot = ""
......@@ -128,7 +124,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
}
if len(targetGPUUUIDs) == 0 {
log.Info("PodResources API returned no target GPU UUIDs, falling back to nvidia-smi", "pid", placeholderPID)
targetGPUUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, common.HostProcPath, placeholderPID)
targetGPUUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, placeholderPID)
if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed for restore target: %w", err)
}
......@@ -151,7 +147,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
return &types.RestoreContainerSnapshot{
CheckpointPath: checkpointPath,
PlaceholderPID: placeholderPID,
TargetRoot: fmt.Sprintf("%s/%d/root", common.HostProcPath, placeholderPID),
TargetRoot: fmt.Sprintf("%s/%d/root", snapshotruntime.HostProcPath, placeholderPID),
CgroupRoot: cgroupRoot,
CUDADeviceMap: cudaDeviceMap,
}, nil
......
......@@ -9,7 +9,7 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
)
// LogProcessDiagnostics logs process state and CRIU restore log for debugging a failed restore.
......@@ -30,7 +30,7 @@ func LogProcessDiagnostics(procRoot string, pid int, restoreLogPath string, log
// Exit code from /proc/stat
if data, err := os.ReadFile(filepath.Join(procRoot, pidStr, "stat")); err == nil {
if ws, err := common.ParseProcExitCode(string(data)); err == nil {
if ws, err := snapshotruntime.ParseProcExitCode(string(data)); err == nil {
entry.Info("Process exit code", "exit_status", ws.ExitStatus(), "term_signal", ws.Signal(), "core_dumped", ws.CoreDump())
}
}
......
package common
package runtime
import (
"fmt"
......@@ -10,7 +10,7 @@ import (
"github.com/moby/sys/mountinfo"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// ReadMountInfo reads and parses mountinfo for a container process via /host/proc.
......
package common
package runtime
import (
"os"
......@@ -7,7 +7,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
func TestClassifyMounts(t *testing.T) {
......
// Package common provides low-level container, process, and device primitives
// shared across snapshot packages.
package common
// Package runtime provides low-level host and container-runtime primitives for snapshot execution.
package runtime
import (
"context"
......
package common
package runtime
import (
"encoding/json"
......@@ -10,7 +10,7 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
const (
......
package common
package runtime
import (
"encoding/json"
......@@ -8,7 +8,7 @@ import (
"github.com/go-logr/logr/testr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
func TestBuildExclusions(t *testing.T) {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment