"...controller/dynamographdeploymentrequest_controller.go" did not exist on "22d910a5cdd557e9457b81b7084cded9d7f93604"
Unverified commit 43e810a4, authored by Schwinn Saereesitthipitak and committed by GitHub
Browse files

refactor(snapshot): add manifest-based snapshotctl flow and shared workload builders (#7671)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 23144df5
......@@ -11,8 +11,8 @@ import (
"github.com/go-logr/logr"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// RestoreLogFilename is the CRIU restore log filename (also used by executor/restore.go).
......
......@@ -9,7 +9,7 @@ import (
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// parseManageCgroupsMode normalizes and validates the CRIU cgroup mode setting.
......
......@@ -5,7 +5,7 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
func TestParseManageCgroupsMode(t *testing.T) {
......
......@@ -10,7 +10,7 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
)
const (
......@@ -57,7 +57,15 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr.
args = append(args, "--device-map", deviceMap)
}
cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, args...)
details := common.ReadProcessDetailsOrDefault("/proc", pid)
details := snapshotruntime.ProcessDetails{
ObservedPID: pid,
OutermostPID: pid,
InnermostPID: pid,
NamespacePIDs: []int{pid},
}
if process, err := snapshotruntime.ReadProcessDetails("/proc", pid); err == nil {
details = process
}
start := time.Now()
output, err := cmd.CombinedOutput()
duration := time.Since(start)
......
......@@ -9,10 +9,10 @@ import (
criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/cuda"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/criu"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/cuda"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// RestoreOptions holds configuration for an in-namespace restore.
......@@ -59,10 +59,10 @@ func RestoreInNamespace(ctx context.Context, opts RestoreOptions, log logr.Logge
func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.CheckpointManifest, opts RestoreOptions, log logr.Logger) (int, error) {
// Apply rootfs diff inside the namespace (target root is /)
if err := common.ApplyRootfsDiff(opts.CheckpointPath, "/", log); err != nil {
if err := snapshotruntime.ApplyRootfsDiff(opts.CheckpointPath, "/", log); err != nil {
return 0, fmt.Errorf("rootfs diff failed: %w", err)
}
if err := common.ApplyDeletedFiles(opts.CheckpointPath, "/", log); err != nil {
if err := snapshotruntime.ApplyDeletedFiles(opts.CheckpointPath, "/", log); err != nil {
log.Error(err, "Failed to apply deleted files")
}
......@@ -71,11 +71,11 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
return 0, fmt.Errorf("failed to unmount /dev/shm before restore: %w", err)
}
if err := common.RemountProcSys(true); err != nil {
if err := snapshotruntime.RemountProcSys(true); err != nil {
return 0, fmt.Errorf("failed to remount /proc/sys read-write for restore: %w", err)
}
defer func() {
if err := common.RemountProcSys(false); err != nil {
if err := snapshotruntime.RemountProcSys(false); err != nil {
log.Error(err, "Failed to remount /proc/sys read-only after restore")
}
}()
......@@ -85,7 +85,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
if err != nil {
return 0, err
}
processes, err := common.ReadProcessTable("/proc")
processes, err := snapshotruntime.ReadProcessTable("/proc")
if err != nil {
return 0, fmt.Errorf("failed to read restored process table: %w", err)
}
......@@ -109,7 +109,7 @@ func executeRestore(ctx context.Context, criuOpts *criurpc.CriuOpts, m *types.Ch
// CUDA restore — remap checkpoint-time innermost namespace PIDs onto the
// current visible restored PIDs before invoking cuda-checkpoint.
if !m.CUDA.IsEmpty() {
restorePIDs, err := common.ResolveManifestPIDsToObservedPIDs(processes, int(restoredPID), m.CUDA.PIDs)
restorePIDs, err := snapshotruntime.ResolveManifestPIDsToObservedPIDs(processes, int(restoredPID), m.CUDA.PIDs)
if err != nil {
return 0, fmt.Errorf("failed to resolve restored CUDA PIDs: %w", err)
}
......
......@@ -9,7 +9,7 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
)
// LogProcessDiagnostics logs process state and CRIU restore log for debugging a failed restore.
......@@ -30,7 +30,7 @@ func LogProcessDiagnostics(procRoot string, pid int, restoreLogPath string, log
// Exit code from /proc/stat
if data, err := os.ReadFile(filepath.Join(procRoot, pidStr, "stat")); err == nil {
if ws, err := common.ParseProcExitCode(string(data)); err == nil {
if ws, err := snapshotruntime.ParseProcExitCode(string(data)); err == nil {
entry.Info("Process exit code", "exit_status", ws.ExitStatus(), "term_signal", ws.Signal(), "core_dumped", ws.CoreDump())
}
}
......
package common
package runtime
import (
"fmt"
......@@ -10,7 +10,7 @@ import (
"github.com/moby/sys/mountinfo"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
// ReadMountInfo reads and parses mountinfo for a container process via /host/proc.
......
package common
package runtime
import (
"os"
......@@ -7,7 +7,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
)
func TestClassifyMounts(t *testing.T) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment