criu.go 6.33 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
// Package checkpoint provides CRIU-specific configuration and utilities for checkpoint operations.
package checkpoint

import (
	"fmt"
	"os"

	criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
	"google.golang.org/protobuf/proto"

	checkpointk8s "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint/k8s"
	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)

// CRIUConfig holds configuration for CRIU dump operations.
// Most options are always-on with safe defaults for K8s environments, so only
// the per-checkpoint values that BuildCRIUOpts reads are carried here.
//
// NOTE(review): BuildCRIUOpts leaves the timeout option unset when Timeout is
// 0; whether CRIU then runs with no timeout or with its own built-in default
// should be confirmed against the CRIU documentation.
type CRIUConfig struct {
	PID        int    // Sent to CRIU as the Pid option (converted to int32)
	ImageDirFD int32  // Sent to CRIU as the ImagesDirFd option
	RootFS     string // Sent to CRIU as the Root option
	GhostLimit uint32 // From env CRIU_GHOST_LIMIT: max ghost file size (0 = CRIU default)
	Timeout    uint32 // From env CRIU_TIMEOUT: checkpoint timeout in seconds (0 = option left unset)
}

// OpenImageDir opens checkpointDir so it can be handed to CRIU as the image
// directory. It is a thin wrapper around common.OpenDirForCRIU and forwards
// that helper's results unchanged: the opened directory, its file descriptor,
// and any error.
//
// The caller owns the returned file and must close it when done.
// The helper is expected to clear CLOEXEC on the descriptor so that CRIU can
// inherit it (not visible from this file; see common.OpenDirForCRIU).
func OpenImageDir(checkpointDir string) (*os.File, int32, error) {
	dir, fd, err := common.OpenDirForCRIU(checkpointDir)
	return dir, fd, err
}

// BuildCRIUOpts translates cfg into the base set of CRIU dump options.
// External mounts and namespaces are not configured here; callers add them
// to the returned options separately.
//
// Fixed settings, independent of cfg:
//   - LogLevel 4, LogFile "dump.log"
//   - ManageCgroups (IGNORE): let K8s manage cgroups
//   - LeaveRunning: always keep process running after checkpoint
//   - ShellJob: containers are often session leaders
//   - TcpClose: pod IPs change on restore/migration
//   - FileLocks: applications use file locks
//   - OrphanPtsMaster: containers with TTYs
//   - ExtUnixSk: containers have external Unix sockets
//   - LinkRemap: handle deleted-but-open files (safe for all workloads)
//   - ExtMasters: external bind mount masters (safe for all workloads)
//
// GhostLimit and Timeout are only set when the corresponding cfg value is
// non-zero; a zero value leaves the option unset.
func BuildCRIUOpts(cfg CRIUConfig) *criurpc.CriuOpts {
	opts := new(criurpc.CriuOpts)

	// Per-checkpoint values: what to dump and where the images go.
	opts.Pid = proto.Int32(int32(cfg.PID))
	opts.ImagesDirFd = proto.Int32(cfg.ImageDirFD)
	opts.Root = proto.String(cfg.RootFS)

	// Logging.
	opts.LogLevel = proto.Int32(4)
	opts.LogFile = proto.String("dump.log")

	// Cgroup management is switched on, but in IGNORE mode.
	ignoreMode := criurpc.CriuCgMode_IGNORE
	opts.ManageCgroups = proto.Bool(true)
	opts.ManageCgroupsMode = &ignoreMode

	// Always-on for K8s environments.
	opts.LeaveRunning = proto.Bool(true)
	opts.ShellJob = proto.Bool(true)
	opts.TcpClose = proto.Bool(true)
	opts.FileLocks = proto.Bool(true)
	opts.OrphanPtsMaster = proto.Bool(true)
	opts.ExtUnixSk = proto.Bool(true)
	opts.LinkRemap = proto.Bool(true)
	opts.ExtMasters = proto.Bool(true)

	// Optional: ghost limit from env (0 = leave unset, use CRIU default).
	if limit := cfg.GhostLimit; limit != 0 {
		opts.GhostLimit = proto.Uint32(limit)
	}

	// Optional: timeout from env (0 = leave unset).
	// NOTE(review): an unset timeout presumably falls back to CRIU's own
	// default rather than disabling the timeout — confirm.
	if secs := cfg.Timeout; secs != 0 {
		opts.Timeout = proto.Uint32(secs)
	}

	return opts
}

// AddExternalMounts adds mount points as external mounts to CRIU options.
// CRIU requires all mounts to be marked as external for successful restore.
//
// Every mount point is registered with itself as both key and value. A mount
// point is added at most once: repeated entries in mounts are skipped, and so
// are mount points whose key is already present in criuOpts.ExtMnt. The
// second rule matches AddExternalPaths, so the helpers can be called in any
// order, or more than once, without registering the same key twice.
func AddExternalMounts(criuOpts *criurpc.CriuOpts, mounts []AllMountInfo) {
	// Seed the set with keys that are already registered on the options.
	seen := make(map[string]struct{}, len(criuOpts.ExtMnt)+len(mounts))
	for _, ext := range criuOpts.ExtMnt {
		seen[ext.GetKey()] = struct{}{}
	}

	// Index instead of ranging by value to avoid copying each AllMountInfo.
	for i := range mounts {
		mountPoint := mounts[i].MountPoint
		if _, dup := seen[mountPoint]; dup {
			continue
		}
		seen[mountPoint] = struct{}{}
		criuOpts.ExtMnt = append(criuOpts.ExtMnt, &criurpc.ExtMountMap{
			Key: proto.String(mountPoint),
			Val: proto.String(mountPoint),
		})
	}
}

// AddExternalPaths registers extra paths (masked/readonly) as external mounts,
// each mapped to itself. These may not appear in mountinfo but CRIU still
// needs them marked as external.
//
// A path is skipped when its key is already present in criuOpts.ExtMnt or
// when it occurred earlier in paths.
func AddExternalPaths(criuOpts *criurpc.CriuOpts, paths []string) {
	// Keys that are already registered, including ones appended below.
	known := make(map[string]bool)
	for _, ext := range criuOpts.ExtMnt {
		known[ext.GetKey()] = true
	}

	for _, p := range paths {
		if !known[p] {
			known[p] = true
			entry := &criurpc.ExtMountMap{
				Key: proto.String(p),
				Val: proto.String(p),
			}
			criuOpts.ExtMnt = append(criuOpts.ExtMnt, entry)
		}
	}
}

// AddExternalNamespace appends one external-namespace entry to
// criuOpts.External. The entry is formatted as "<type>[<inode>]:<key>",
// built from nsType, inode and key respectively.
func AddExternalNamespace(criuOpts *criurpc.CriuOpts, nsType NamespaceType, inode uint64, key string) {
	criuOpts.External = append(
		criuOpts.External,
		fmt.Sprintf("%s[%d]:%s", nsType, inode, key),
	)
}

// AddExternalStrings appends the given raw external strings, unmodified and
// in order, to criuOpts.External.
// Used for additional external mount mappings (e.g., NVIDIA firmware files).
func AddExternalStrings(criuOpts *criurpc.CriuOpts, externals []string) {
	if len(externals) == 0 {
		// Nothing to append; leave the options untouched.
		return
	}
	criuOpts.External = append(criuOpts.External, externals...)
}

// ConfigureExternalMounts registers all required external mounts on criuOpts.
//
// It reads the mounts of pid via GetAllMountsFromMountinfo (using hostProc)
// and adds them first, then adds the masked paths and the readonly paths
// reported by containerInfo. It returns a wrapped error if the mountinfo
// lookup fails, in which case criuOpts is not modified.
func ConfigureExternalMounts(criuOpts *criurpc.CriuOpts, pid int, hostProc string, containerInfo *checkpointk8s.ContainerInfo) error {
	// CRIU needs every mount from mountinfo marked as external.
	mounts, err := GetAllMountsFromMountinfo(pid, hostProc)
	if err != nil {
		return fmt.Errorf("failed to get all mounts from mountinfo: %w", err)
	}
	AddExternalMounts(criuOpts, mounts)

	// Masked paths from the OCI spec.
	maskedPaths := containerInfo.GetMaskedPaths()
	AddExternalPaths(criuOpts, maskedPaths)

	// Readonly paths from the OCI spec.
	readonlyPaths := containerInfo.GetReadonlyPaths()
	AddExternalPaths(criuOpts, readonlyPaths)

	return nil
}

// ConfigureExternalNamespaces adds external namespaces to CRIU options.
//
// If namespaces holds a non-nil entry for NamespaceNet, that namespace is
// registered as external under the key "extNetNs" (for socket binding
// preservation). externalMounts is then appended verbatim to the external
// list (e.g., for NVIDIA firmware files).
//
// It returns the network namespace inode for logging purposes, or 0 when no
// usable network namespace entry was found.
func ConfigureExternalNamespaces(criuOpts *criurpc.CriuOpts, namespaces map[NamespaceType]*NamespaceInfo, externalMounts []string) uint64 {
	var netNsInode uint64

	// Indexing a missing key yields a nil pointer, so a single nil check
	// covers both "key absent" and "key present with nil info"; the latter
	// would otherwise panic on the field access below.
	if netNs := namespaces[NamespaceNet]; netNs != nil {
		AddExternalNamespace(criuOpts, NamespaceNet, netNs.Inode, "extNetNs")
		netNsInode = netNs.Inode
	}

	// Add additional external mounts (e.g., for NVIDIA firmware files)
	AddExternalStrings(criuOpts, externalMounts)

	return netNsInode
}

// BuildCRIUOptsFromCheckpointOpts builds the base CRIU options for a dump
// from checkpoint Options plus the per-checkpoint pid, image directory FD and
// root filesystem path. Only opts.GhostLimit and opts.Timeout are read from
// opts. The result still needs external mounts and namespaces configured.
func BuildCRIUOptsFromCheckpointOpts(opts Options, pid int, imageDirFD int32, rootFS string) *criurpc.CriuOpts {
	return BuildCRIUOpts(CRIUConfig{
		PID:        pid,
		ImageDirFD: imageDirFD,
		RootFS:     rootFS,
		GhostLimit: opts.GhostLimit,
		Timeout:    opts.Timeout,
	})
}