"vscode:/vscode.git/clone" did not exist on "8392e7a190505b01d9e07981691b72659d394e3c"
Unverified commit 43e810a4, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
...@@ -11,8 +11,8 @@ import ( ...@@ -11,8 +11,8 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"google.golang.org/protobuf/proto" "google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// RestoreLogFilename is the CRIU restore log filename (also used by executor/restore.go). // RestoreLogFilename is the CRIU restore log filename (also used by executor/restore.go).
......
...@@ -9,7 +9,7 @@ import ( ...@@ -9,7 +9,7 @@ import (
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
"google.golang.org/protobuf/proto" "google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// parseManageCgroupsMode normalizes and validates the CRIU cgroup mode setting. // parseManageCgroupsMode normalizes and validates the CRIU cgroup mode setting.
......
...@@ -5,7 +5,7 @@ import ( ...@@ -5,7 +5,7 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc" criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
func TestParseManageCgroupsMode(t *testing.T) { func TestParseManageCgroupsMode(t *testing.T) {
......
...@@ -10,7 +10,7 @@ import ( ...@@ -10,7 +10,7 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
) )
const ( const (
...@@ -57,7 +57,15 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr. ...@@ -57,7 +57,15 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr.
args = append(args, "--device-map", deviceMap) args = append(args, "--device-map", deviceMap)
} }
cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, args...) cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, args...)
details := common.ReadProcessDetailsOrDefault("/proc", pid) details := snapshotruntime.ProcessDetails{
ObservedPID: pid,
OutermostPID: pid,
InnermostPID: pid,
NamespacePIDs: []int{pid},
}
if process, err := snapshotruntime.ReadProcessDetails("/proc", pid); err == nil {
details = process
}
start := time.Now() start := time.Now()
output, err := cmd.CombinedOutput() output, err := cmd.CombinedOutput()
duration := time.Since(start) duration := time.Since(start)
......
...@@ -7,31 +7,31 @@ import ( ...@@ -7,31 +7,31 @@ import (
"fmt" "fmt"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"time" "time"
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc" criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/containerd/containerd" "github.com/containerd/containerd"
"github.com/go-logr/logr" "github.com/go-logr/logr"
"k8s.io/client-go/kubernetes"
"github.com/google/uuid" "github.com/google/uuid"
"k8s.io/client-go/kubernetes"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// CheckpointRequest holds per-checkpoint identifiers for a checkpoint operation. // CheckpointRequest holds per-checkpoint identifiers for a checkpoint operation.
type CheckpointRequest struct { type CheckpointRequest struct {
ContainerID string ContainerID string
ContainerName string ContainerName string
CheckpointHash string CheckpointID string
CheckpointLocation string CheckpointLocation string
CheckpointStorageType string NodeName string
NodeName string PodName string
PodName string PodNamespace string
PodNamespace string Clientset kubernetes.Interface
Clientset kubernetes.Interface
} }
// Checkpoint performs a CRIU dump of a container. // Checkpoint performs a CRIU dump of a container.
...@@ -44,8 +44,8 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r ...@@ -44,8 +44,8 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r
checkpointStart := time.Now() checkpointStart := time.Now()
log.Info("=== Starting checkpoint operation ===") log.Info("=== Starting checkpoint operation ===")
if req.CheckpointStorageType != "pvc" { if strings.TrimSpace(req.CheckpointID) == "" {
return fmt.Errorf("checkpoint storage type %q is not supported", req.CheckpointStorageType) return fmt.Errorf("checkpoint ID is required")
} }
if req.CheckpointLocation == "" { if req.CheckpointLocation == "" {
return fmt.Errorf("checkpoint location is required") return fmt.Errorf("checkpoint location is required")
...@@ -99,33 +99,33 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r ...@@ -99,33 +99,33 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r
func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req CheckpointRequest) (*types.CheckpointContainerSnapshot, error) { func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req CheckpointRequest) (*types.CheckpointContainerSnapshot, error) {
containerID := req.ContainerID containerID := req.ContainerID
pid, ociSpec, err := common.ResolveContainer(ctx, ctrd, containerID) pid, ociSpec, err := snapshotruntime.ResolveContainer(ctx, ctrd, containerID)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to resolve container: %w", err) return nil, fmt.Errorf("failed to resolve container: %w", err)
} }
var hostCgroupPath string var hostCgroupPath string
if cgPath, err := common.ResolveCgroupRootFromHostPID(pid); err == nil && cgPath != "" { if cgPath, err := snapshotruntime.ResolveCgroupRootFromHostPID(pid); err == nil && cgPath != "" {
hostCgroupPath = filepath.Join(common.HostCgroupPath, cgPath) hostCgroupPath = filepath.Join(snapshotruntime.HostCgroupPath, cgPath)
} }
rootFS, err := common.GetRootFS(pid) rootFS, err := snapshotruntime.GetRootFS(pid)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get rootfs: %w", err) return nil, fmt.Errorf("failed to get rootfs: %w", err)
} }
upperDir, err := common.GetOverlayUpperDir(pid) upperDir, err := snapshotruntime.GetOverlayUpperDir(pid)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get overlay upperdir: %w", err) return nil, fmt.Errorf("failed to get overlay upperdir: %w", err)
} }
mountInfo, err := common.ReadMountInfo(pid) mountInfo, err := snapshotruntime.ReadMountInfo(pid)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to parse mountinfo: %w", err) return nil, fmt.Errorf("failed to parse mountinfo: %w", err)
} }
mounts := common.ClassifyMounts(mountInfo, ociSpec, rootFS) mounts := snapshotruntime.ClassifyMounts(mountInfo, ociSpec, rootFS)
netNSInode, err := common.GetNetNSInode(pid) netNSInode, err := snapshotruntime.GetNetNSInode(pid)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get net namespace inode: %w", err) return nil, fmt.Errorf("failed to get net namespace inode: %w", err)
} }
...@@ -133,7 +133,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log ...@@ -133,7 +133,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
// Read stdio FD targets (like runc's getPipeFds / descriptors.json). // Read stdio FD targets (like runc's getPipeFds / descriptors.json).
stdioFDs := make([]string, 3) stdioFDs := make([]string, 3)
for i := range 3 { for i := range 3 {
target, err := os.Readlink(fmt.Sprintf("%s/%d/fd/%d", common.HostProcPath, pid, i)) target, err := os.Readlink(fmt.Sprintf("%s/%d/fd/%d", snapshotruntime.HostProcPath, pid, i))
if err != nil { if err != nil {
log.V(1).Info("Failed to readlink stdio FD", "fd", i, "error", err) log.V(1).Info("Failed to readlink stdio FD", "fd", i, "error", err)
continue continue
...@@ -142,11 +142,11 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log ...@@ -142,11 +142,11 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
} }
// Discover CUDA processes and GPU UUIDs // Discover CUDA processes and GPU UUIDs
allPIDs := common.ProcessTreePIDs(pid) allPIDs := snapshotruntime.ProcessTreePIDs(pid)
cudaHostPIDs := cuda.FilterProcesses(ctx, allPIDs, log) cudaHostPIDs := cuda.FilterProcesses(ctx, allPIDs, log)
cudaNamespacePIDs := make([]int, 0, len(cudaHostPIDs)) cudaNamespacePIDs := make([]int, 0, len(cudaHostPIDs))
for _, cudaHostPID := range cudaHostPIDs { for _, cudaHostPID := range cudaHostPIDs {
process, err := common.ReadProcessDetails(common.HostProcPath, cudaHostPID) process, err := snapshotruntime.ReadProcessDetails(snapshotruntime.HostProcPath, cudaHostPID)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to read process details for CUDA process %d: %w", cudaHostPID, err) return nil, fmt.Errorf("failed to read process details for CUDA process %d: %w", cudaHostPID, err)
} }
...@@ -166,7 +166,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log ...@@ -166,7 +166,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
} }
if len(gpuUUIDs) == 0 { if len(gpuUUIDs) == 0 {
log.Info("PodResources API returned no GPU UUIDs, falling back to nvidia-smi", "pid", pid) log.Info("PodResources API returned no GPU UUIDs, falling back to nvidia-smi", "pid", pid)
gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, common.HostProcPath, pid) gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, pid)
if err != nil { if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err) return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err)
} }
...@@ -202,7 +202,7 @@ func configureCheckpoint( ...@@ -202,7 +202,7 @@ func configureCheckpoint(
} }
m := types.NewCheckpointManifest( m := types.NewCheckpointManifest(
req.CheckpointHash, req.CheckpointID,
types.NewCRIUDumpManifest(criuOpts, cfg.CRIU), types.NewCRIUDumpManifest(criuOpts, cfg.CRIU),
types.NewSourcePodManifest(req.ContainerID, state.PID, req.NodeName, req.PodName, req.PodNamespace, state.StdioFDs), types.NewSourcePodManifest(req.ContainerID, state.PID, req.NodeName, req.PodName, req.PodNamespace, state.StdioFDs),
types.NewOverlayManifest(cfg.Overlay, state.UpperDir, state.OCISpec), types.NewOverlayManifest(cfg.Overlay, state.UpperDir, state.OCISpec),
...@@ -235,10 +235,10 @@ func captureCheckpoint(ctx context.Context, criuOpts *criurpc.CriuOpts, criuSett ...@@ -235,10 +235,10 @@ func captureCheckpoint(ctx context.Context, criuOpts *criurpc.CriuOpts, criuSett
// propagated — a checkpoint without overlay diffs is still valid for restore // propagated — a checkpoint without overlay diffs is still valid for restore
// (the base container image provides the filesystem). // (the base container image provides the filesystem).
if state.UpperDir != "" { if state.UpperDir != "" {
if _, err := common.CaptureRootfsDiff(state.UpperDir, checkpointDir, data.Overlay.Exclusions, data.Overlay.BindMountDests); err != nil { if _, err := snapshotruntime.CaptureRootfsDiff(state.UpperDir, checkpointDir, data.Overlay.Exclusions, data.Overlay.BindMountDests); err != nil {
log.Error(err, "Failed to capture rootfs diff") log.Error(err, "Failed to capture rootfs diff")
} }
if _, err := common.CaptureDeletedFiles(state.UpperDir, checkpointDir); err != nil { if _, err := snapshotruntime.CaptureDeletedFiles(state.UpperDir, checkpointDir); err != nil {
log.Error(err, "Failed to capture deleted files") log.Error(err, "Failed to capture deleted files")
} }
} }
......
...@@ -9,10 +9,10 @@ import ( ...@@ -9,10 +9,10 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc" criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/go-logr/logr" "github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// RestoreOptions holds configuration for an in-namespace restore. // RestoreOptions holds configuration for an in-namespace restore.
...@@ -59,10 +59,10 @@ func RestoreInNamespace(ctx context.Context, opts RestoreOptions, log logr.Logge ...@@ -59,10 +59,10 @@ func RestoreInNamespace(ctx context.Context, opts RestoreOptions, log logr.Logge
func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.CheckpointManifest, opts RestoreOptions, log logr.Logger) (int, error) { func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.CheckpointManifest, opts RestoreOptions, log logr.Logger) (int, error) {
// Apply rootfs diff inside the namespace (target root is /) // Apply rootfs diff inside the namespace (target root is /)
if err := common.ApplyRootfsDiff(opts.CheckpointPath, "/", log); err != nil { if err := snapshotruntime.ApplyRootfsDiff(opts.CheckpointPath, "/", log); err != nil {
return 0, fmt.Errorf("rootfs diff failed: %w", err) return 0, fmt.Errorf("rootfs diff failed: %w", err)
} }
if err := common.ApplyDeletedFiles(opts.CheckpointPath, "/", log); err != nil { if err := snapshotruntime.ApplyDeletedFiles(opts.CheckpointPath, "/", log); err != nil {
log.Error(err, "Failed to apply deleted files") log.Error(err, "Failed to apply deleted files")
} }
...@@ -71,11 +71,11 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch ...@@ -71,11 +71,11 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
return 0, fmt.Errorf("failed to unmount /dev/shm before restore: %w", err) return 0, fmt.Errorf("failed to unmount /dev/shm before restore: %w", err)
} }
if err := common.RemountProcSys(true); err != nil { if err := snapshotruntime.RemountProcSys(true); err != nil {
return 0, fmt.Errorf("failed to remount /proc/sys read-write for restore: %w", err) return 0, fmt.Errorf("failed to remount /proc/sys read-write for restore: %w", err)
} }
defer func() { defer func() {
if err := common.RemountProcSys(false); err != nil { if err := snapshotruntime.RemountProcSys(false); err != nil {
log.Error(err, "Failed to remount /proc/sys read-only after restore") log.Error(err, "Failed to remount /proc/sys read-only after restore")
} }
}() }()
...@@ -85,7 +85,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch ...@@ -85,7 +85,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
if err != nil { if err != nil {
return 0, err return 0, err
} }
processes, err := common.ReadProcessTable("/proc") processes, err := snapshotruntime.ReadProcessTable("/proc")
if err != nil { if err != nil {
return 0, fmt.Errorf("failed to read restored process table: %w", err) return 0, fmt.Errorf("failed to read restored process table: %w", err)
} }
...@@ -109,7 +109,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch ...@@ -109,7 +109,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
// CUDA restore — remap checkpoint-time innermost namespace PIDs onto the // CUDA restore — remap checkpoint-time innermost namespace PIDs onto the
// current visible restored PIDs before invoking cuda-checkpoint. // current visible restored PIDs before invoking cuda-checkpoint.
if !m.CUDA.IsEmpty() { if !m.CUDA.IsEmpty() {
restorePIDs, err := common.ResolveManifestPIDsToObservedPIDs(processes, int(restoredPID), m.CUDA.PIDs) restorePIDs, err := snapshotruntime.ResolveManifestPIDsToObservedPIDs(processes, int(restoredPID), m.CUDA.PIDs)
if err != nil { if err != nil {
return 0, fmt.Errorf("failed to resolve restored CUDA PIDs: %w", err) return 0, fmt.Errorf("failed to resolve restored CUDA PIDs: %w", err)
} }
......
...@@ -16,23 +16,22 @@ import ( ...@@ -16,23 +16,22 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// RestoreRequest holds the parameters for a restore operation. // RestoreRequest holds the parameters for a restore operation.
type RestoreRequest struct { type RestoreRequest struct {
CheckpointHash string CheckpointID string
CheckpointLocation string CheckpointLocation string
CheckpointStorageType string NSRestorePath string
NSRestorePath string PodName string
PodName string PodNamespace string
PodNamespace string ContainerName string
ContainerName string Clientset kubernetes.Interface
Clientset kubernetes.Interface
} }
// Restore performs external restore for the given request. // Restore performs external restore for the given request.
...@@ -42,7 +41,7 @@ type RestoreRequest struct { ...@@ -42,7 +41,7 @@ type RestoreRequest struct {
func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req RestoreRequest) (int, error) { func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req RestoreRequest) (int, error) {
restoreStart := time.Now() restoreStart := time.Now()
log.Info("=== Starting external restore ===", log.Info("=== Starting external restore ===",
"checkpoint_hash", req.CheckpointHash, "checkpoint_id", req.CheckpointID,
"pod", req.PodName, "pod", req.PodName,
"namespace", req.PodNamespace, "namespace", req.PodNamespace,
"container", req.ContainerName, "container", req.ContainerName,
...@@ -63,7 +62,7 @@ func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req ...@@ -63,7 +62,7 @@ func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req
// Validate restored process from the host side // Validate restored process from the host side
procRoot := filepath.Join(snap.TargetRoot, "proc") procRoot := filepath.Join(snap.TargetRoot, "proc")
if err := common.ValidateProcessState(procRoot, restoredPID); err != nil { if err := snapshotruntime.ValidateProcessState(procRoot, restoredPID); err != nil {
restoreLogPath := filepath.Join(snap.TargetRoot, "var", "criu-work", criu.RestoreLogFilename) restoreLogPath := filepath.Join(snap.TargetRoot, "var", "criu-work", criu.RestoreLogFilename)
logging.LogProcessDiagnostics(procRoot, restoredPID, restoreLogPath, log) logging.LogProcessDiagnostics(procRoot, restoredPID, restoreLogPath, log)
return 0, fmt.Errorf("restored process failed post-restore validation: %w", err) return 0, fmt.Errorf("restored process failed post-restore validation: %w", err)
...@@ -75,9 +74,6 @@ func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req ...@@ -75,9 +74,6 @@ func Restore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req
} }
func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req RestoreRequest) (*types.RestoreContainerSnapshot, error) { func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req RestoreRequest) (*types.RestoreContainerSnapshot, error) {
if req.CheckpointStorageType != "pvc" {
return nil, fmt.Errorf("checkpoint storage type %q is not supported", req.CheckpointStorageType)
}
if req.CheckpointLocation == "" { if req.CheckpointLocation == "" {
return nil, fmt.Errorf("checkpoint location is required") return nil, fmt.Errorf("checkpoint location is required")
} }
...@@ -92,7 +88,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge ...@@ -92,7 +88,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
return nil, fmt.Errorf("failed to resolve checkpoint path: %w", err) return nil, fmt.Errorf("failed to resolve checkpoint path: %w", err)
} }
if checkpointAbs != baseAbs && !strings.HasPrefix(checkpointAbs, baseAbs+string(os.PathSeparator)) { if checkpointAbs != baseAbs && !strings.HasPrefix(checkpointAbs, baseAbs+string(os.PathSeparator)) {
return nil, fmt.Errorf("invalid checkpoint hash %q", req.CheckpointHash) return nil, fmt.Errorf("invalid checkpoint id %q", req.CheckpointID)
} }
m, err := types.ReadManifest(checkpointPath) m, err := types.ReadManifest(checkpointPath)
...@@ -105,13 +101,13 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge ...@@ -105,13 +101,13 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
containerName = "main" containerName = "main"
} }
placeholderPID, _, err := common.ResolveContainerByPod(ctx, ctrd, req.PodName, req.PodNamespace, containerName) placeholderPID, _, err := snapshotruntime.ResolveContainerByPod(ctx, ctrd, req.PodName, req.PodNamespace, containerName)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to resolve placeholder container: %w", err) return nil, fmt.Errorf("failed to resolve placeholder container: %w", err)
} }
log.Info("Resolved placeholder container", "pid", placeholderPID) log.Info("Resolved placeholder container", "pid", placeholderPID)
cgroupRoot, err := common.ResolveCgroupRootFromHostPID(placeholderPID) cgroupRoot, err := snapshotruntime.ResolveCgroupRootFromHostPID(placeholderPID)
if err != nil { if err != nil {
log.Error(err, "Failed to resolve placeholder cgroup root; proceeding without explicit cgroup remap") log.Error(err, "Failed to resolve placeholder cgroup root; proceeding without explicit cgroup remap")
cgroupRoot = "" cgroupRoot = ""
...@@ -128,7 +124,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge ...@@ -128,7 +124,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
} }
if len(targetGPUUUIDs) == 0 { if len(targetGPUUUIDs) == 0 {
log.Info("PodResources API returned no target GPU UUIDs, falling back to nvidia-smi", "pid", placeholderPID) log.Info("PodResources API returned no target GPU UUIDs, falling back to nvidia-smi", "pid", placeholderPID)
targetGPUUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, common.HostProcPath, placeholderPID) targetGPUUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, placeholderPID)
if err != nil { if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed for restore target: %w", err) return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed for restore target: %w", err)
} }
...@@ -151,7 +147,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge ...@@ -151,7 +147,7 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
return &types.RestoreContainerSnapshot{ return &types.RestoreContainerSnapshot{
CheckpointPath: checkpointPath, CheckpointPath: checkpointPath,
PlaceholderPID: placeholderPID, PlaceholderPID: placeholderPID,
TargetRoot: fmt.Sprintf("%s/%d/root", common.HostProcPath, placeholderPID), TargetRoot: fmt.Sprintf("%s/%d/root", snapshotruntime.HostProcPath, placeholderPID),
CgroupRoot: cgroupRoot, CgroupRoot: cgroupRoot,
CUDADeviceMap: cudaDeviceMap, CUDADeviceMap: cudaDeviceMap,
}, nil }, nil
......
...@@ -9,7 +9,7 @@ import ( ...@@ -9,7 +9,7 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
) )
// LogProcessDiagnostics logs process state and CRIU restore log for debugging a failed restore. // LogProcessDiagnostics logs process state and CRIU restore log for debugging a failed restore.
...@@ -30,7 +30,7 @@ func LogProcessDiagnostics(procRoot string, pid int, restoreLogPath string, log ...@@ -30,7 +30,7 @@ func LogProcessDiagnostics(procRoot string, pid int, restoreLogPath string, log
// Exit code from /proc/stat // Exit code from /proc/stat
if data, err := os.ReadFile(filepath.Join(procRoot, pidStr, "stat")); err == nil { if data, err := os.ReadFile(filepath.Join(procRoot, pidStr, "stat")); err == nil {
if ws, err := common.ParseProcExitCode(string(data)); err == nil { if ws, err := snapshotruntime.ParseProcExitCode(string(data)); err == nil {
entry.Info("Process exit code", "exit_status", ws.ExitStatus(), "term_signal", ws.Signal(), "core_dumped", ws.CoreDump()) entry.Info("Process exit code", "exit_status", ws.ExitStatus(), "term_signal", ws.Signal(), "core_dumped", ws.CoreDump())
} }
} }
......
package common package runtime
import ( import (
"fmt" "fmt"
...@@ -10,7 +10,7 @@ import ( ...@@ -10,7 +10,7 @@ import (
"github.com/moby/sys/mountinfo" "github.com/moby/sys/mountinfo"
specs "github.com/opencontainers/runtime-spec/specs-go" specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// ReadMountInfo reads and parses mountinfo for a container process via /host/proc. // ReadMountInfo reads and parses mountinfo for a container process via /host/proc.
......
package common package runtime
import ( import (
"os" "os"
...@@ -7,7 +7,7 @@ import ( ...@@ -7,7 +7,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go" specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
func TestClassifyMounts(t *testing.T) { func TestClassifyMounts(t *testing.T) {
......
package common package runtime
import ( import (
"context" "context"
......
// Package common provides low-level container, process, and device primitives // Package runtime provides low-level host and container-runtime primitives for snapshot execution.
// shared across snapshot packages. package runtime
package common
import ( import (
"context" "context"
......
package common package runtime
import ( import (
"encoding/json" "encoding/json"
...@@ -10,7 +10,7 @@ import ( ...@@ -10,7 +10,7 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
const ( const (
......
package common package runtime
import ( import (
"encoding/json" "encoding/json"
...@@ -8,7 +8,7 @@ import ( ...@@ -8,7 +8,7 @@ import (
"github.com/go-logr/logr/testr" "github.com/go-logr/logr/testr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
func TestBuildExclusions(t *testing.T) { func TestBuildExclusions(t *testing.T) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment