"lib/llm/vscode:/vscode.git/clone" did not exist on "a3cf35c3c881d36a0d80b64b3796a4462d34c9a9"
Unverified commit 43e810a4, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
...@@ -11,8 +11,8 @@ import ( ...@@ -11,8 +11,8 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"google.golang.org/protobuf/proto" "google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// RestoreLogFilename is the CRIU restore log filename (also used by executor/restore.go). // RestoreLogFilename is the CRIU restore log filename (also used by executor/restore.go).
......
...@@ -9,7 +9,7 @@ import ( ...@@ -9,7 +9,7 @@ import (
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
"google.golang.org/protobuf/proto" "google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// parseManageCgroupsMode normalizes and validates the CRIU cgroup mode setting. // parseManageCgroupsMode normalizes and validates the CRIU cgroup mode setting.
......
...@@ -5,7 +5,7 @@ import ( ...@@ -5,7 +5,7 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc" criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
func TestParseManageCgroupsMode(t *testing.T) { func TestParseManageCgroupsMode(t *testing.T) {
......
...@@ -10,7 +10,7 @@ import ( ...@@ -10,7 +10,7 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
) )
const ( const (
...@@ -57,7 +57,15 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr. ...@@ -57,7 +57,15 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr.
args = append(args, "--device-map", deviceMap) args = append(args, "--device-map", deviceMap)
} }
cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, args...) cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, args...)
details := common.ReadProcessDetailsOrDefault("/proc", pid) details := snapshotruntime.ProcessDetails{
ObservedPID: pid,
OutermostPID: pid,
InnermostPID: pid,
NamespacePIDs: []int{pid},
}
if process, err := snapshotruntime.ReadProcessDetails("/proc", pid); err == nil {
details = process
}
start := time.Now() start := time.Now()
output, err := cmd.CombinedOutput() output, err := cmd.CombinedOutput()
duration := time.Since(start) duration := time.Since(start)
......
...@@ -7,27 +7,27 @@ import ( ...@@ -7,27 +7,27 @@ import (
"fmt" "fmt"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"time" "time"
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc" criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/containerd/containerd" "github.com/containerd/containerd"
"github.com/go-logr/logr" "github.com/go-logr/logr"
"k8s.io/client-go/kubernetes"
"github.com/google/uuid" "github.com/google/uuid"
"k8s.io/client-go/kubernetes"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// CheckpointRequest holds per-checkpoint identifiers for a checkpoint operation. // CheckpointRequest holds per-checkpoint identifiers for a checkpoint operation.
type CheckpointRequest struct { type CheckpointRequest struct {
ContainerID string ContainerID string
ContainerName string ContainerName string
CheckpointHash string CheckpointID string
CheckpointLocation string CheckpointLocation string
CheckpointStorageType string
NodeName string NodeName string
PodName string PodName string
PodNamespace string PodNamespace string
...@@ -44,8 +44,8 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r ...@@ -44,8 +44,8 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r
checkpointStart := time.Now() checkpointStart := time.Now()
log.Info("=== Starting checkpoint operation ===") log.Info("=== Starting checkpoint operation ===")
if req.CheckpointStorageType != "pvc" { if strings.TrimSpace(req.CheckpointID) == "" {
return fmt.Errorf("checkpoint storage type %q is not supported", req.CheckpointStorageType) return fmt.Errorf("checkpoint ID is required")
} }
if req.CheckpointLocation == "" { if req.CheckpointLocation == "" {
return fmt.Errorf("checkpoint location is required") return fmt.Errorf("checkpoint location is required")
...@@ -99,33 +99,33 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r ...@@ -99,33 +99,33 @@ func Checkpoint(ctx context.Context, ctrd *containerd.Client, log logr.Logger, r
func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req CheckpointRequest) (*types.CheckpointContainerSnapshot, error) { func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Logger, req CheckpointRequest) (*types.CheckpointContainerSnapshot, error) {
containerID := req.ContainerID containerID := req.ContainerID
pid, ociSpec, err := common.ResolveContainer(ctx, ctrd, containerID) pid, ociSpec, err := snapshotruntime.ResolveContainer(ctx, ctrd, containerID)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to resolve container: %w", err) return nil, fmt.Errorf("failed to resolve container: %w", err)
} }
var hostCgroupPath string var hostCgroupPath string
if cgPath, err := common.ResolveCgroupRootFromHostPID(pid); err == nil && cgPath != "" { if cgPath, err := snapshotruntime.ResolveCgroupRootFromHostPID(pid); err == nil && cgPath != "" {
hostCgroupPath = filepath.Join(common.HostCgroupPath, cgPath) hostCgroupPath = filepath.Join(snapshotruntime.HostCgroupPath, cgPath)
} }
rootFS, err := common.GetRootFS(pid) rootFS, err := snapshotruntime.GetRootFS(pid)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get rootfs: %w", err) return nil, fmt.Errorf("failed to get rootfs: %w", err)
} }
upperDir, err := common.GetOverlayUpperDir(pid) upperDir, err := snapshotruntime.GetOverlayUpperDir(pid)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get overlay upperdir: %w", err) return nil, fmt.Errorf("failed to get overlay upperdir: %w", err)
} }
mountInfo, err := common.ReadMountInfo(pid) mountInfo, err := snapshotruntime.ReadMountInfo(pid)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to parse mountinfo: %w", err) return nil, fmt.Errorf("failed to parse mountinfo: %w", err)
} }
mounts := common.ClassifyMounts(mountInfo, ociSpec, rootFS) mounts := snapshotruntime.ClassifyMounts(mountInfo, ociSpec, rootFS)
netNSInode, err := common.GetNetNSInode(pid) netNSInode, err := snapshotruntime.GetNetNSInode(pid)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get net namespace inode: %w", err) return nil, fmt.Errorf("failed to get net namespace inode: %w", err)
} }
...@@ -133,7 +133,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log ...@@ -133,7 +133,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
// Read stdio FD targets (like runc's getPipeFds / descriptors.json). // Read stdio FD targets (like runc's getPipeFds / descriptors.json).
stdioFDs := make([]string, 3) stdioFDs := make([]string, 3)
for i := range 3 { for i := range 3 {
target, err := os.Readlink(fmt.Sprintf("%s/%d/fd/%d", common.HostProcPath, pid, i)) target, err := os.Readlink(fmt.Sprintf("%s/%d/fd/%d", snapshotruntime.HostProcPath, pid, i))
if err != nil { if err != nil {
log.V(1).Info("Failed to readlink stdio FD", "fd", i, "error", err) log.V(1).Info("Failed to readlink stdio FD", "fd", i, "error", err)
continue continue
...@@ -142,11 +142,11 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log ...@@ -142,11 +142,11 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
} }
// Discover CUDA processes and GPU UUIDs // Discover CUDA processes and GPU UUIDs
allPIDs := common.ProcessTreePIDs(pid) allPIDs := snapshotruntime.ProcessTreePIDs(pid)
cudaHostPIDs := cuda.FilterProcesses(ctx, allPIDs, log) cudaHostPIDs := cuda.FilterProcesses(ctx, allPIDs, log)
cudaNamespacePIDs := make([]int, 0, len(cudaHostPIDs)) cudaNamespacePIDs := make([]int, 0, len(cudaHostPIDs))
for _, cudaHostPID := range cudaHostPIDs { for _, cudaHostPID := range cudaHostPIDs {
process, err := common.ReadProcessDetails(common.HostProcPath, cudaHostPID) process, err := snapshotruntime.ReadProcessDetails(snapshotruntime.HostProcPath, cudaHostPID)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to read process details for CUDA process %d: %w", cudaHostPID, err) return nil, fmt.Errorf("failed to read process details for CUDA process %d: %w", cudaHostPID, err)
} }
...@@ -166,7 +166,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log ...@@ -166,7 +166,7 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
} }
if len(gpuUUIDs) == 0 { if len(gpuUUIDs) == 0 {
log.Info("PodResources API returned no GPU UUIDs, falling back to nvidia-smi", "pid", pid) log.Info("PodResources API returned no GPU UUIDs, falling back to nvidia-smi", "pid", pid)
gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, common.HostProcPath, pid) gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, pid)
if err != nil { if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err) return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err)
} }
...@@ -202,7 +202,7 @@ func configureCheckpoint( ...@@ -202,7 +202,7 @@ func configureCheckpoint(
} }
m := types.NewCheckpointManifest( m := types.NewCheckpointManifest(
req.CheckpointHash, req.CheckpointID,
types.NewCRIUDumpManifest(criuOpts, cfg.CRIU), types.NewCRIUDumpManifest(criuOpts, cfg.CRIU),
types.NewSourcePodManifest(req.ContainerID, state.PID, req.NodeName, req.PodName, req.PodNamespace, state.StdioFDs), types.NewSourcePodManifest(req.ContainerID, state.PID, req.NodeName, req.PodName, req.PodNamespace, state.StdioFDs),
types.NewOverlayManifest(cfg.Overlay, state.UpperDir, state.OCISpec), types.NewOverlayManifest(cfg.Overlay, state.UpperDir, state.OCISpec),
...@@ -235,10 +235,10 @@ func captureCheckpoint(ctx context.Context, criuOpts *criurpc.CriuOpts, criuSett ...@@ -235,10 +235,10 @@ func captureCheckpoint(ctx context.Context, criuOpts *criurpc.CriuOpts, criuSett
// propagated — a checkpoint without overlay diffs is still valid for restore // propagated — a checkpoint without overlay diffs is still valid for restore
// (the base container image provides the filesystem). // (the base container image provides the filesystem).
if state.UpperDir != "" { if state.UpperDir != "" {
if _, err := common.CaptureRootfsDiff(state.UpperDir, checkpointDir, data.Overlay.Exclusions, data.Overlay.BindMountDests); err != nil { if _, err := snapshotruntime.CaptureRootfsDiff(state.UpperDir, checkpointDir, data.Overlay.Exclusions, data.Overlay.BindMountDests); err != nil {
log.Error(err, "Failed to capture rootfs diff") log.Error(err, "Failed to capture rootfs diff")
} }
if _, err := common.CaptureDeletedFiles(state.UpperDir, checkpointDir); err != nil { if _, err := snapshotruntime.CaptureDeletedFiles(state.UpperDir, checkpointDir); err != nil {
log.Error(err, "Failed to capture deleted files") log.Error(err, "Failed to capture deleted files")
} }
} }
......
...@@ -9,7 +9,7 @@ import ( ...@@ -9,7 +9,7 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
) )
// LogProcessDiagnostics logs process state and CRIU restore log for debugging a failed restore. // LogProcessDiagnostics logs process state and CRIU restore log for debugging a failed restore.
...@@ -30,7 +30,7 @@ func LogProcessDiagnostics(procRoot string, pid int, restoreLogPath string, log ...@@ -30,7 +30,7 @@ func LogProcessDiagnostics(procRoot string, pid int, restoreLogPath string, log
// Exit code from /proc/stat // Exit code from /proc/stat
if data, err := os.ReadFile(filepath.Join(procRoot, pidStr, "stat")); err == nil { if data, err := os.ReadFile(filepath.Join(procRoot, pidStr, "stat")); err == nil {
if ws, err := common.ParseProcExitCode(string(data)); err == nil { if ws, err := snapshotruntime.ParseProcExitCode(string(data)); err == nil {
entry.Info("Process exit code", "exit_status", ws.ExitStatus(), "term_signal", ws.Signal(), "core_dumped", ws.CoreDump()) entry.Info("Process exit code", "exit_status", ws.ExitStatus(), "term_signal", ws.Signal(), "core_dumped", ws.CoreDump())
} }
} }
......
package common package runtime
import ( import (
"context" "context"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment