// Package restore provides CRIU-specific configuration and utilities for restore operations.
package restore

import (
	"os"

	criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"
	"google.golang.org/protobuf/proto"

	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)

// CRIURestorePlan holds configuration for CRIU restore operations.
// Most fields come from the saved CheckpointManifest.CRIUDump.CRIU settings.
type CRIURestorePlan struct {
	// File descriptors
	ImageDirFD int32
	WorkDirFD  int32
	NetNsFD    int32

	// Paths
	RootPath string
	LogFile  string

	// Options from CheckpointManifest.CRIUDump.CRIU.
	LogLevel          int32
	Timeout           uint32 // CRIU timeout in seconds (0 = no timeout, required for CUDA)
	ShellJob          bool   // Allow session leaders (containers are often session leaders)
	TcpClose          bool   // Close TCP connections (pod IPs change on restore)
	FileLocks         bool   // Allow file locks
	ExtUnixSk         bool   // Allow external Unix sockets
	LinkRemap         bool   // Handle deleted-but-open files via CRIU link remap
	ManageCgroupsMode string // Cgroup handling mode: "ignore" lets K8s manage cgroups

	// External mount mappings (from CheckpointManifest.CRIUDump.ExtMnt).
38
39
40
41
42
43
	ExtMountMaps []*criurpc.ExtMountMap
}

// OpenImageDir opens a checkpoint directory and clears CLOEXEC for CRIU.
// Returns the opened file and its FD. Caller must close the file when done.
func OpenImageDir(checkpointPath string) (*os.File, int32, error) {
44
	return common.OpenPathForCRIU(checkpointPath)
45
46
47
48
49
}

// OpenNetworkNamespace opens the target network namespace for restore.
// Returns the opened file and its FD. Caller must close the file when done.
func OpenNetworkNamespace(nsPath string) (*os.File, int32, error) {
50
	return common.OpenPathForCRIU(nsPath)
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
}

// OpenWorkDir prepares an optional CRIU work directory and returns it opened.
//
// The directory is created if missing, opened, and has its descriptor flags
// reset to 0 (which clears CLOEXEC). On success the open file and its FD are
// returned; the caller is responsible for closing the file.
//
// An empty workDir, or any failure along the way, yields (nil, -1). Failures
// are logged as warnings on log rather than returned as errors.
func OpenWorkDir(workDir string, log *logrus.Entry) (*os.File, int32) {
	// Sentinel FD returned whenever no custom work directory is available.
	const noFD int32 = -1

	if workDir == "" {
		return nil, noFD
	}

	// Create the directory (and any parents) if it is not there yet.
	if mkErr := os.MkdirAll(workDir, 0755); mkErr != nil {
		log.WithError(mkErr).Warn("Failed to create CRIU work directory, using default")
		return nil, noFD
	}

	dir, openErr := os.Open(workDir)
	if openErr != nil {
		log.WithError(openErr).Warn("Failed to open CRIU work directory, using default")
		return nil, noFD
	}

	// F_SETFD with 0 drops every descriptor flag, FD_CLOEXEC included.
	fd := dir.Fd()
	if _, fcntlErr := unix.FcntlInt(fd, unix.F_SETFD, 0); fcntlErr != nil {
		log.WithError(fcntlErr).Warn("Failed to clear CLOEXEC on work dir")
		_ = dir.Close() // best effort: the fcntl failure has already been reported
		return nil, noFD
	}

	log.WithField("path", workDir).Info("Using custom CRIU work directory")
	return dir, int32(fd)
}

// BuildCRIURestoreOptions creates CRIU options for restore from a runtime plan.
//
// Options from CheckpointManifest.CRIUDump.CRIU (saved at checkpoint time):
//   - ShellJob, TcpClose, FileLocks, ExtUnixSk, LinkRemap, ManageCgroupsMode
//
// Hardcoded restore-specific options:
//   - RstSibling: restore in detached mode
//   - MntnsCompatMode: always false, so CRIU uses mount-v2 (requires kernel 5.15+)
//   - EvasiveDevices, ForceIrmap: device/inode handling
func BuildCRIURestoreOptions(plan CRIURestorePlan) *criurpc.CriuOpts {
	// Map cgroup management mode from plan.
	var cgMode criurpc.CriuCgMode
	switch plan.ManageCgroupsMode {
	case "soft":
		cgMode = criurpc.CriuCgMode_SOFT
	case "full":
		cgMode = criurpc.CriuCgMode_FULL
	case "strict":
		cgMode = criurpc.CriuCgMode_STRICT
	case "ignore", "":
		cgMode = criurpc.CriuCgMode_IGNORE
	default:
		cgMode = criurpc.CriuCgMode_IGNORE
	}
106
107

	criuOpts := &criurpc.CriuOpts{
108
109
110
		ImagesDirFd: proto.Int32(plan.ImageDirFD),
		LogLevel:    proto.Int32(plan.LogLevel),
		LogFile:     proto.String(plan.LogFile),
111
112

		// Root filesystem - use current container's root
113
		Root: proto.String(plan.RootPath),
114

115
		// Restore in detached mode - process runs in background (restore-specific)
116
117
		RstSibling: proto.Bool(true),

118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
		// Mount namespace mode:
		// - MntnsCompatMode=false (default): Uses mount-v2 with MOVE_MOUNT_SET_GROUP (kernel 5.15+)
		//   This is preferred as it doesn't create temp dirs in /tmp
		// - MntnsCompatMode=true: Uses compat mode which creates /tmp/cr-tmpfs.XXX
		//   This can cause "Device or resource busy" errors on cleanup
		// We explicitly set to false to use mount-v2 (requires kernel 5.15+)
		MntnsCompatMode: proto.Bool(false),

		// Options from saved CheckpointManifest.CRIUDump.CRIU.
		ShellJob:  proto.Bool(plan.ShellJob),
		TcpClose:  proto.Bool(plan.TcpClose),
		FileLocks: proto.Bool(plan.FileLocks),
		ExtUnixSk: proto.Bool(plan.ExtUnixSk),
		LinkRemap: proto.Bool(plan.LinkRemap),

		// Cgroup management from saved settings.
134
135
136
		ManageCgroups:     proto.Bool(true),
		ManageCgroupsMode: &cgMode,

137
		// Device and inode handling (restore-specific)
138
139
140
141
		EvasiveDevices: proto.Bool(true),
		ForceIrmap:     proto.Bool(true),

		// External mount mappings
142
		ExtMnt: plan.ExtMountMaps,
143
144
145
	}

	// Add network namespace inheritance if provided
146
	if plan.NetNsFD >= 0 {
147
148
149
		criuOpts.InheritFd = []*criurpc.InheritFd{
			{
				Key: proto.String("extNetNs"),
150
				Fd:  proto.Int32(plan.NetNsFD),
151
152
153
154
155
			},
		}
	}

	// Add work directory if specified
156
157
158
159
160
161
162
	if plan.WorkDirFD >= 0 {
		criuOpts.WorkDirFd = proto.Int32(plan.WorkDirFD)
	}

	// Add timeout if specified (required for CUDA restores)
	if plan.Timeout > 0 {
		criuOpts.Timeout = proto.Uint32(plan.Timeout)
163
164
165
166
	}

	return criuOpts
}