criu.go 8.33 KB
Newer Older
1
2
3
4
5
// criu provides CRIU-specific configuration and utilities for checkpoint operations.
package checkpoint

import (
	"fmt"
6
	"time"
7

8
	criu "github.com/checkpoint-restore/go-criu/v7"
9
	criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
10
11
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/sirupsen/logrus"
12
13
14
	"google.golang.org/protobuf/proto"
)

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
// CRIUSettings holds CRIU-specific configuration options.
//
// Options are categorized by how they are passed to CRIU:
//   - RPC options: passed via the go-criu CriuOpts protobuf.
//   - CRIU conf file options: written to criu.conf (NOT available via RPC).
type CRIUSettings struct {
	// === RPC Options (passed via go-criu CriuOpts) ===

	// GhostLimit is the maximum ghost file size in bytes.
	// Ghost files are deleted-but-open files that CRIU needs to checkpoint.
	// 512MB is recommended for GPU workloads with large memory allocations.
	// A value of 0 leaves the option unset when dump options are built.
	GhostLimit uint32 `yaml:"ghostLimit"`

	// Timeout is the CRIU operation timeout in seconds.
	// 6 hours (21600s) is recommended for large GPU model checkpoints.
	// A value of 0 leaves the option unset when dump options are built.
	Timeout uint32 `yaml:"timeout"`

	// LogLevel is the CRIU logging verbosity (0-4).
	LogLevel int32 `yaml:"logLevel"`

	// WorkDir is the CRIU work directory for temporary files.
	WorkDir string `yaml:"workDir"`

	// AutoDedup enables auto-deduplication of memory pages.
	AutoDedup bool `yaml:"autoDedup"`

	// LazyPages enables lazy page migration (experimental).
	LazyPages bool `yaml:"lazyPages"`

	// LeaveRunning keeps the process running after checkpoint (dump only).
	LeaveRunning bool `yaml:"leaveRunning"`

	// ShellJob allows checkpointing session leaders (containers are often session leaders).
	ShellJob bool `yaml:"shellJob"`

	// TcpClose closes TCP connections instead of preserving them (pod IPs change on restore).
	TcpClose bool `yaml:"tcpClose"`

	// FileLocks allows checkpointing processes with file locks.
	FileLocks bool `yaml:"fileLocks"`

	// OrphanPtsMaster allows checkpointing containers with TTYs.
	OrphanPtsMaster bool `yaml:"orphanPtsMaster"`

	// ExtUnixSk allows external Unix sockets.
	ExtUnixSk bool `yaml:"extUnixSk"`

	// LinkRemap handles deleted-but-open files.
	LinkRemap bool `yaml:"linkRemap"`

	// ExtMasters allows external bind mount masters.
	ExtMasters bool `yaml:"extMasters"`

	// ManageCgroupsMode controls cgroup handling: "ignore" lets K8s manage cgroups.
	// The dump option builder also recognizes "soft", "full" and "strict".
	ManageCgroupsMode string `yaml:"manageCgroupsMode"`

	// === CRIU Conf File Options (NOT available via RPC - written to criu.conf) ===

	// LibDir is the path to CRIU plugin directory (e.g., /usr/local/lib/criu).
	// Required for CUDA checkpoint/restore.
	LibDir string `yaml:"libDir"`

	// AllowUprobes allows user-space probes (required for CUDA checkpoints).
	AllowUprobes bool `yaml:"allowUprobes"`

	// SkipInFlight skips in-flight TCP connections during checkpoint/restore.
	SkipInFlight bool `yaml:"skipInFlight"`
}

83
84
85
86
// GenerateCRIUConfContent generates the criu.conf file content for options
// that cannot be passed via RPC.
//
// One line is emitted per enabled option, each terminated by a newline:
//   - "libdir <LibDir>" when LibDir is non-empty
//   - "allow-uprobes" when AllowUprobes is set
//   - "skip-in-flight" when SkipInFlight is set
//
// The result is the empty string when no option is enabled or when the
// receiver is nil.
func (c *CRIUSettings) GenerateCRIUConfContent() string {
	// A nil receiver means "no settings"; treat it like an empty configuration
	// instead of dereferencing it.
	if c == nil {
		return ""
	}

	var content string
	if c.LibDir != "" {
		content += "libdir " + c.LibDir + "\n"
	}
	if c.AllowUprobes {
		content += "allow-uprobes\n"
	}
	if c.SkipInFlight {
		content += "skip-in-flight\n"
	}
	return content
}

101
102
103
104
105
// ExternalMountManifestEntry is the serializable form of one CRIU ext-mount
// entry as stored in checkpoint manifests.
type ExternalMountManifestEntry struct {
	// Key is the key of the ext-mount mapping.
	Key string `yaml:"key"`
	// Val is the value of the ext-mount mapping.
	Val string `yaml:"val"`
}
106

107
108
109
110
111
112
// CRIUDumpManifest stores the resolved dump-time CRIU mount plan used for restore.
type CRIUDumpManifest struct {
	CRIU     CRIUSettings                 `yaml:"criu"`
	ExtMnt   []ExternalMountManifestEntry `yaml:"extMnt,omitempty"`
	External []string                     `yaml:"external,omitempty"`
	SkipMnt  []string                     `yaml:"skipMnt,omitempty"`
113
114
}

115
116
117
118
119
// NewCRIUDumpManifest serializes resolved dump options for restore.
func NewCRIUDumpManifest(criuOpts *criurpc.CriuOpts, settings CRIUSettings) CRIUDumpManifest {
	manifest := CRIUDumpManifest{CRIU: settings}
	if criuOpts == nil {
		return manifest
120
121
	}

122
123
	for _, mount := range criuOpts.ExtMnt {
		if mount == nil || mount.GetKey() == "" {
124
125
			continue
		}
126
127
128
		manifest.ExtMnt = append(manifest.ExtMnt, ExternalMountManifestEntry{
			Key: mount.GetKey(),
			Val: mount.GetVal(),
129
130
		})
	}
131
132
133
	manifest.External = append([]string(nil), criuOpts.External...)
	manifest.SkipMnt = append([]string(nil), criuOpts.SkipMnt...)
	return manifest
134
135
}

136
137
138
139
140
141
142
143
144
145
146
// BuildCRIUDumpOptions creates CRIU options directly from spec settings and runtime state.
//
// Parameters:
//   - settings: optional CRIU settings. When nil, only the core options (pid,
//     image directory FD, root, log file, mounts and external references) are
//     set and every other option is left unset.
//   - pid: process ID to dump.
//   - imageDirFD: file descriptor passed to CRIU as ImagesDirFd.
//   - rootFS: passed to CRIU as Root and to BuildMountPolicy.
//   - mountInfo, ociSpec: inputs to BuildMountPolicy, which decides which
//     mounts are externalized and which are skipped.
//   - namespaces: inputs to buildExternalNamespaces.
//
// The returned error is currently always nil.
func BuildCRIUDumpOptions(
	settings *CRIUSettings,
	pid int,
	imageDirFD int32,
	rootFS string,
	mountInfo []MountInfo,
	ociSpec *specs.Spec,
	namespaces map[NamespaceType]*NamespaceInfo,
) (*criurpc.CriuOpts, error) {
	mountPolicy := BuildMountPolicy(mountInfo, ociSpec, rootFS)

	extMnt := buildExternalMountMaps(mountPolicy.Externalized)
	skipMnt := mountPolicy.Skipped
	external := buildExternalNamespaces(namespaces)
	logrus.WithFields(logrus.Fields{
		"externalized_count": len(mountPolicy.Externalized),
		"skipped_count":      len(mountPolicy.Skipped),
	}).Debug("Resolved mount policy for CRIU dump")

	criuOpts := &criurpc.CriuOpts{
		Pid:         proto.Int32(int32(pid)),
		ImagesDirFd: proto.Int32(imageDirFD),
		Root:        proto.String(rootFS),
		LogFile:     proto.String(DumpLogFilename),
	}
	criuOpts.ExtMnt = extMnt
	criuOpts.External = external
	criuOpts.SkipMnt = skipMnt

	if settings == nil {
		return criuOpts, nil
	}

	// RPC options from spec.
	criuOpts.LogLevel = proto.Int32(settings.LogLevel)
	criuOpts.LeaveRunning = proto.Bool(settings.LeaveRunning)
	criuOpts.ShellJob = proto.Bool(settings.ShellJob)
	criuOpts.TcpClose = proto.Bool(settings.TcpClose)
	criuOpts.FileLocks = proto.Bool(settings.FileLocks)
	criuOpts.OrphanPtsMaster = proto.Bool(settings.OrphanPtsMaster)
	criuOpts.ExtUnixSk = proto.Bool(settings.ExtUnixSk)
	criuOpts.LinkRemap = proto.Bool(settings.LinkRemap)
	criuOpts.ExtMasters = proto.Bool(settings.ExtMasters)
	criuOpts.AutoDedup = proto.Bool(settings.AutoDedup)
	criuOpts.LazyPages = proto.Bool(settings.LazyPages)

	// Cgroup management mode. IGNORE is the fallback for anything that is not
	// one of the explicitly mapped modes.
	criuOpts.ManageCgroups = proto.Bool(true)
	cgMode := criurpc.CriuCgMode_IGNORE
	switch settings.ManageCgroupsMode {
	case "", "ignore":
		// Keep the IGNORE default.
	case "soft":
		cgMode = criurpc.CriuCgMode_SOFT
	case "full":
		cgMode = criurpc.CriuCgMode_FULL
	case "strict":
		cgMode = criurpc.CriuCgMode_STRICT
	default:
		// Same fallback as before, but make the misconfiguration visible
		// instead of silently swallowing it.
		logrus.WithField("manage_cgroups_mode", settings.ManageCgroupsMode).
			Warn("Unrecognized CRIU cgroup mode, falling back to ignore")
	}
	criuOpts.ManageCgroupsMode = &cgMode

	// Optional numeric options: zero means "not configured", so the field is
	// left unset.
	if settings.GhostLimit > 0 {
		criuOpts.GhostLimit = proto.Uint32(settings.GhostLimit)
	}
	if settings.Timeout > 0 {
		criuOpts.Timeout = proto.Uint32(settings.Timeout)
	}

	return criuOpts, nil
}
206

207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
// buildExternalMountMaps serializes externalized mount paths into CRIU map entries.
//
// Each distinct, non-empty path yields one entry whose key and value are both
// the path. Empty strings and repeated paths are skipped, and the first-seen
// order of paths is preserved. The result is never nil.
func buildExternalMountMaps(paths []string) []*criurpc.ExtMountMap {
	extMnt := make([]*criurpc.ExtMountMap, 0, len(paths))
	seen := make(map[string]struct{}, len(paths))
	for _, path := range paths {
		if path == "" {
			continue
		}
		if _, dup := seen[path]; dup {
			continue
		}
		seen[path] = struct{}{}
		extMnt = append(extMnt, &criurpc.ExtMountMap{
			Key: proto.String(path),
			Val: proto.String(path),
		})
	}
	return extMnt
}

228
229
230
// buildExternalNamespaces builds external namespace/mount references.
//
// When namespaces holds a non-nil entry for NamespaceNet, a single reference
// of the form "<NamespaceNet>[<inode>]:extNetNs" is returned. Otherwise the
// result is an empty, non-nil slice.
func buildExternalNamespaces(namespaces map[NamespaceType]*NamespaceInfo) []string {
	external := make([]string, 0, 1)

	// Mark network namespace as external for socket binding preservation.
	// The nil check guards against a key that is present with a nil value,
	// which would otherwise panic on the Inode access.
	if netNs, ok := namespaces[NamespaceNet]; ok && netNs != nil {
		external = append(external, fmt.Sprintf("%s[%d]:%s", NamespaceNet, netNs.Inode, "extNetNs"))
		logrus.WithField("inode", netNs.Inode).Debug("Marked network namespace as external")
	}

	return external
}

241
242
243
244
245
246
247
248
249
250
251
252
// ExecuteCRIUDump runs the CRIU dump and logs timing plus dump-log location on failure.
//
// Parameters:
//   - criuOpts: options handed to the CRIU client's Dump call.
//   - checkpointDir: used only to build the dump-log path reported in the
//     failure log entry.
//   - log: logger for the outcome; when nil, the standard logrus logger is used.
//
// On success it returns the elapsed dump time. On failure it returns a zero
// duration and the CRIU error wrapped with "CRIU dump failed".
func ExecuteCRIUDump(criuOpts *criurpc.CriuOpts, checkpointDir string, log *logrus.Entry) (time.Duration, error) {
	// Calling methods on a nil *logrus.Entry panics; fall back to the
	// standard logger so a missing logger cannot crash a dump.
	if log == nil {
		log = logrus.NewEntry(logrus.StandardLogger())
	}

	criuDumpStart := time.Now()
	criuClient := criu.MakeCriu()
	if err := criuClient.Dump(criuOpts, nil); err != nil {
		dumpDuration := time.Since(criuDumpStart)
		log.WithFields(logrus.Fields{
			"duration":       dumpDuration,
			"checkpoint_dir": checkpointDir,
			"dump_log_path":  fmt.Sprintf("%s/%s", checkpointDir, DumpLogFilename),
		}).Error("CRIU dump failed")
		return 0, fmt.Errorf("CRIU dump failed: %w", err)
	}

	criuDumpDuration := time.Since(criuDumpStart)
	log.WithField("duration", criuDumpDuration).Info("CRIU dump completed")
	return criuDumpDuration, nil
}