Unverified commit 43e810a4, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
...@@ -11,8 +11,8 @@ import ( ...@@ -11,8 +11,8 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"google.golang.org/protobuf/proto" "google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// RestoreLogFilename is the CRIU restore log filename (also used by executor/restore.go). // RestoreLogFilename is the CRIU restore log filename (also used by executor/restore.go).
......
...@@ -9,7 +9,7 @@ import ( ...@@ -9,7 +9,7 @@ import (
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
"google.golang.org/protobuf/proto" "google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// parseManageCgroupsMode normalizes and validates the CRIU cgroup mode setting. // parseManageCgroupsMode normalizes and validates the CRIU cgroup mode setting.
......
...@@ -5,7 +5,7 @@ import ( ...@@ -5,7 +5,7 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc" criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
func TestParseManageCgroupsMode(t *testing.T) { func TestParseManageCgroupsMode(t *testing.T) {
......
...@@ -10,7 +10,7 @@ import ( ...@@ -10,7 +10,7 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
) )
const ( const (
...@@ -57,7 +57,15 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr. ...@@ -57,7 +57,15 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr.
args = append(args, "--device-map", deviceMap) args = append(args, "--device-map", deviceMap)
} }
cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, args...) cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, args...)
details := common.ReadProcessDetailsOrDefault("/proc", pid) details := snapshotruntime.ProcessDetails{
ObservedPID: pid,
OutermostPID: pid,
InnermostPID: pid,
NamespacePIDs: []int{pid},
}
if process, err := snapshotruntime.ReadProcessDetails("/proc", pid); err == nil {
details = process
}
start := time.Now() start := time.Now()
output, err := cmd.CombinedOutput() output, err := cmd.CombinedOutput()
duration := time.Since(start) duration := time.Since(start)
......
...@@ -9,10 +9,10 @@ import ( ...@@ -9,10 +9,10 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc" criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/go-logr/logr" "github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// RestoreOptions holds configuration for an in-namespace restore. // RestoreOptions holds configuration for an in-namespace restore.
...@@ -59,10 +59,10 @@ func RestoreInNamespace(ctx context.Context, opts RestoreOptions, log logr.Logge ...@@ -59,10 +59,10 @@ func RestoreInNamespace(ctx context.Context, opts RestoreOptions, log logr.Logge
func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.CheckpointManifest, opts RestoreOptions, log logr.Logger) (int, error) { func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.CheckpointManifest, opts RestoreOptions, log logr.Logger) (int, error) {
// Apply rootfs diff inside the namespace (target root is /) // Apply rootfs diff inside the namespace (target root is /)
if err := common.ApplyRootfsDiff(opts.CheckpointPath, "/", log); err != nil { if err := snapshotruntime.ApplyRootfsDiff(opts.CheckpointPath, "/", log); err != nil {
return 0, fmt.Errorf("rootfs diff failed: %w", err) return 0, fmt.Errorf("rootfs diff failed: %w", err)
} }
if err := common.ApplyDeletedFiles(opts.CheckpointPath, "/", log); err != nil { if err := snapshotruntime.ApplyDeletedFiles(opts.CheckpointPath, "/", log); err != nil {
log.Error(err, "Failed to apply deleted files") log.Error(err, "Failed to apply deleted files")
} }
...@@ -71,11 +71,11 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch ...@@ -71,11 +71,11 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
return 0, fmt.Errorf("failed to unmount /dev/shm before restore: %w", err) return 0, fmt.Errorf("failed to unmount /dev/shm before restore: %w", err)
} }
if err := common.RemountProcSys(true); err != nil { if err := snapshotruntime.RemountProcSys(true); err != nil {
return 0, fmt.Errorf("failed to remount /proc/sys read-write for restore: %w", err) return 0, fmt.Errorf("failed to remount /proc/sys read-write for restore: %w", err)
} }
defer func() { defer func() {
if err := common.RemountProcSys(false); err != nil { if err := snapshotruntime.RemountProcSys(false); err != nil {
log.Error(err, "Failed to remount /proc/sys read-only after restore") log.Error(err, "Failed to remount /proc/sys read-only after restore")
} }
}() }()
...@@ -85,7 +85,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch ...@@ -85,7 +85,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
if err != nil { if err != nil {
return 0, err return 0, err
} }
processes, err := common.ReadProcessTable("/proc") processes, err := snapshotruntime.ReadProcessTable("/proc")
if err != nil { if err != nil {
return 0, fmt.Errorf("failed to read restored process table: %w", err) return 0, fmt.Errorf("failed to read restored process table: %w", err)
} }
...@@ -109,7 +109,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch ...@@ -109,7 +109,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
// CUDA restore — remap checkpoint-time innermost namespace PIDs onto the // CUDA restore — remap checkpoint-time innermost namespace PIDs onto the
// current visible restored PIDs before invoking cuda-checkpoint. // current visible restored PIDs before invoking cuda-checkpoint.
if !m.CUDA.IsEmpty() { if !m.CUDA.IsEmpty() {
restorePIDs, err := common.ResolveManifestPIDsToObservedPIDs(processes, int(restoredPID), m.CUDA.PIDs) restorePIDs, err := snapshotruntime.ResolveManifestPIDsToObservedPIDs(processes, int(restoredPID), m.CUDA.PIDs)
if err != nil { if err != nil {
return 0, fmt.Errorf("failed to resolve restored CUDA PIDs: %w", err) return 0, fmt.Errorf("failed to resolve restored CUDA PIDs: %w", err)
} }
......
...@@ -9,7 +9,7 @@ import ( ...@@ -9,7 +9,7 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
) )
// LogProcessDiagnostics logs process state and CRIU restore log for debugging a failed restore. // LogProcessDiagnostics logs process state and CRIU restore log for debugging a failed restore.
...@@ -30,7 +30,7 @@ func LogProcessDiagnostics(procRoot string, pid int, restoreLogPath string, log ...@@ -30,7 +30,7 @@ func LogProcessDiagnostics(procRoot string, pid int, restoreLogPath string, log
// Exit code from /proc/stat // Exit code from /proc/stat
if data, err := os.ReadFile(filepath.Join(procRoot, pidStr, "stat")); err == nil { if data, err := os.ReadFile(filepath.Join(procRoot, pidStr, "stat")); err == nil {
if ws, err := common.ParseProcExitCode(string(data)); err == nil { if ws, err := snapshotruntime.ParseProcExitCode(string(data)); err == nil {
entry.Info("Process exit code", "exit_status", ws.ExitStatus(), "term_signal", ws.Signal(), "core_dumped", ws.CoreDump()) entry.Info("Process exit code", "exit_status", ws.ExitStatus(), "term_signal", ws.Signal(), "core_dumped", ws.CoreDump())
} }
} }
......
package common package runtime
import ( import (
"fmt" "fmt"
...@@ -10,7 +10,7 @@ import ( ...@@ -10,7 +10,7 @@ import (
"github.com/moby/sys/mountinfo" "github.com/moby/sys/mountinfo"
specs "github.com/opencontainers/runtime-spec/specs-go" specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
// ReadMountInfo reads and parses mountinfo for a container process via /host/proc. // ReadMountInfo reads and parses mountinfo for a container process via /host/proc.
......
package common package runtime
import ( import (
"os" "os"
...@@ -7,7 +7,7 @@ import ( ...@@ -7,7 +7,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go" specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
) )
func TestClassifyMounts(t *testing.T) { func TestClassifyMounts(t *testing.T) {
......
package common package runtime
import ( import (
"context" "context"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment