// mounts parses runtime mount state from /proc.

package checkpoint

import (
	"fmt"
	"path"
	"path/filepath"
	"sort"
	"strings"

	specs "github.com/opencontainers/runtime-spec/specs-go"

	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
14

15
16
17
18
19
20
21
22
23
// MountInfo is one entry of a process mount table. The fields follow the
// layout of a /proc/<pid>/mountinfo line and are kept as raw strings.
type MountInfo struct {
	// MountID identifies this mount; ParentID identifies the mount it is
	// attached to.
	MountID, ParentID string
	// MountPoint is the path where the mount is attached. Root is the path
	// inside the mounted filesystem that forms the root of this mount; a value
	// other than "/" indicates a bind-like mount.
	MountPoint, Root string
	// FSType is the filesystem type (for example "tmpfs" or "overlay") and
	// Source is the mount source as reported by mountinfo.
	FSType, Source string
	// Options holds the per-mount options; SuperOptions holds the
	// per-superblock options.
	Options, SuperOptions string
}

26
27
28
29
// MountPolicy is the classified mount plan handed to CRIU dump options.
type MountPolicy struct {
	// Externalized holds the mount points to declare as external mounts
	// (extMnt); Skipped holds the mount points to exclude from the dump
	// (skipMnt).
	Externalized, Skipped []string
}

32
// BuildMountPolicy classifies mounts into CRIU extMnt and skipMnt lists.
33
//
34
35
36
37
38
39
40
41
42
43
44
45
// Rule order and precedence (top to bottom):
//  1. Skip non-OCI proc/sys submounts and non-OCI runtime /run submounts.
//     These mounts are typically node/kernel/runtime specific and are the
//     highest-risk source of cross-node restore failures, so skip wins.
//  2. Externalize mounts owned by runtime/OCI:
//     - "/" (rootfs is recreated by runtime in OCI restore path)
//     - OCI mount destinations
//     - OCI masked/readonly paths
//  3. Externalize non-OCI bind-like mounts (mount root is not "/" or ".").
//     This captures runtime-injected file mounts (for example driver files)
//     so CRIU does not try to recreate them from checkpoint data.
//  4. Anything else is left unflagged and handled by CRIU default behavior.
46
//
47
48
49
50
// Precedence: skip > externalize. If a path is classified as skipped, it is
// removed from the externalized set.
func BuildMountPolicy(mountInfo []MountInfo, ociSpec *specs.Spec, rootFS string) *MountPolicy {
	ociManagedSet := collectOCIManagedDestinations(ociSpec, rootFS)
51

52
53
	externalizedSet := make(map[string]struct{}, len(mountInfo)+len(ociManagedSet))
	skippedSet := make(map[string]struct{}, len(mountInfo))
54

55
56
57
58
	for _, mount := range mountInfo {
		mp := normalizeMountPath(mount.MountPoint)
		if mp == "" {
			continue
59
60
		}

61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
		source := path.Clean(strings.TrimSpace(mount.Source))
		root := path.Clean(strings.TrimSpace(mount.Root))
		isOCIManaged := false
		if _, ok := ociManagedSet[mp]; ok {
			isOCIManaged = true
		}
		if !isOCIManaged && strings.HasPrefix(mp, "/run/") {
			if _, ok := ociManagedSet["/var"+mp]; ok {
				isOCIManaged = true
			}
		}
		if !isOCIManaged && strings.HasPrefix(mp, "/var/run/") {
			if _, ok := ociManagedSet[strings.TrimPrefix(mp, "/var")]; ok {
				isOCIManaged = true
			}
		}
77

78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
		// Runtime-owned /run mounts are usually ephemeral tmpfs/overlay mounts
		// or bind-like mounts sourced from host runtime directories.
		// We skip these unless OCI explicitly manages that destination.
		isRunRuntimeMount := strings.HasPrefix(mp, "/run/") &&
			(mount.FSType == "tmpfs" ||
				mount.FSType == "overlay" ||
				strings.HasPrefix(source, "/run/") ||
				strings.HasPrefix(source, "/var/run/") ||
				strings.HasPrefix(root, "/run/") ||
				strings.HasPrefix(root, "/var/run/"))

		if !isOCIManaged && (strings.HasPrefix(mp, "/proc/") || strings.HasPrefix(mp, "/sys/") || isRunRuntimeMount) {
			skippedSet[mp] = struct{}{}
			delete(externalizedSet, mp)
			continue
		}
94

95
96
97
98
		if mp == "/" || isOCIManaged || (root != "." && root != "/") {
			externalizedSet[mp] = struct{}{}
			continue
		}
99
100
	}

101
102
103
104
105
106
107
	// Ensure OCI-managed destinations are externalized, even when mountinfo does not
	// include a direct entry (e.g., runtime-managed masked/readonly paths).
	for mp := range ociManagedSet {
		if _, skipped := skippedSet[mp]; skipped {
			continue
		}
		externalizedSet[mp] = struct{}{}
108
109
	}

110
111
112
	externalized := make([]string, 0, len(externalizedSet))
	for mp := range externalizedSet {
		externalized = append(externalized, mp)
113
	}
114
115
116
	skipped := make([]string, 0, len(skippedSet))
	for mp := range skippedSet {
		skipped = append(skipped, mp)
117
118
	}

119
120
121
	return &MountPolicy{
		Externalized: externalized,
		Skipped:      skipped,
122
123
124
	}
}

125
126
127
128
129
130
131
132
// collectOCIManagedDestinations builds the set of mount targets owned by the
// OCI spec: every mount destination plus the Linux masked and read-only paths.
// Masked/read-only paths may have no mountinfo entry of their own, yet they
// are runtime-owned and must be externalized as well.
//
// Each path is canonicalized with normalizeOCIDestinationPath against rootFS;
// paths that normalize to "" are dropped. A nil ociSpec yields an empty,
// non-nil set.
func collectOCIManagedDestinations(ociSpec *specs.Spec, rootFS string) map[string]struct{} {
	managed := make(map[string]struct{})
	if ociSpec == nil {
		return managed
	}

	// add canonicalizes one raw OCI path and records it when non-empty.
	add := func(raw string) {
		canonical := normalizeOCIDestinationPath(raw, rootFS)
		if canonical == "" {
			return
		}
		managed[canonical] = struct{}{}
	}

	for _, m := range ociSpec.Mounts {
		add(m.Destination)
	}
	if linux := ociSpec.Linux; linux != nil {
		for _, masked := range linux.MaskedPaths {
			add(masked)
		}
		for _, readonly := range linux.ReadonlyPaths {
			add(readonly)
		}
	}

	return managed
}

152
153
154
155
156
157
// normalizeMountPath performs purely lexical normalization: surrounding
// whitespace is trimmed and the result is returned as a cleaned, absolute,
// slash-separated path. Blank input yields "".
// No filesystem access happens here because mountinfo paths are already
// kernel truth for the container namespace.
func normalizeMountPath(raw string) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return ""
	}
	// Rooting the path before cleaning makes path.Clean collapse duplicate
	// slashes and drop any ".." that would climb above "/".
	return path.Clean("/" + trimmed)
}

167
168
169
170
171
172
// normalizeOCIDestinationPath canonicalizes an OCI destination against
// symlinks inside the container rootfs (for example /var/run -> ../run) and
// falls back to the lexically normalized path whenever resolution is not
// possible.
//
// raw is the destination as written in the OCI spec and rootFS is the host
// path of the container root filesystem. The result is "" for blank input,
// otherwise an absolute, cleaned, slash-separated path. The lexical form is
// returned when rootFS is empty, when rootFS or the destination cannot be
// resolved (for example because it does not exist), or when the resolved
// target lies outside rootFS.
//
// NOTE(review): filepath.EvalSymlinks follows absolute symlink targets
// relative to the host root, not rootFS, so an absolute in-container link
// such as /var/run -> /run resolves outside rootFS and takes the lexical
// fallback.
func normalizeOCIDestinationPath(raw, rootFS string) string {
	p := normalizeMountPath(raw)
	if p == "" || rootFS == "" {
		return p
	}

	// Resolve rootFS itself first. The destination below is fully resolved, so
	// comparing it against an unresolved rootFS that contains a symlink
	// component would make every result look like it escapes the root.
	resolvedRoot, err := filepath.EvalSymlinks(rootFS)
	if err != nil {
		return p
	}

	hostPath := filepath.Join(resolvedRoot, strings.TrimPrefix(p, "/"))
	resolved, err := filepath.EvalSymlinks(hostPath)
	if err != nil {
		return p
	}

	rel, err := filepath.Rel(resolvedRoot, resolved)
	if err != nil {
		return p
	}
	rel = filepath.ToSlash(rel)
	if rel == "." {
		return "/"
	}
	// A target outside the rootfs cannot be expressed as a container path.
	if strings.HasPrefix(rel, "../") || rel == ".." {
		return p
	}

	return normalizeMountPath("/" + rel)
}
195

196
197
198
199
200
// ReadMountInfoFromHostProcPath reads the mount table of process pid from
// <HostProcPath>/<pid>/mountinfo and converts every parsed entry into a
// MountInfo. A parse failure is returned wrapped together with the path that
// was read; on success the returned slice is non-nil.
func ReadMountInfoFromHostProcPath(pid int) ([]MountInfo, error) {
	mountinfoPath := fmt.Sprintf("%s/%d/mountinfo", HostProcPath, pid)

	entries, err := common.ParseMountInfoFile(mountinfoPath)
	if err != nil {
		return nil, fmt.Errorf("failed to parse mountinfo at %s: %w", mountinfoPath, err)
	}

	result := make([]MountInfo, 0, len(entries))
	for _, entry := range entries {
		// Copy field by field: the parser's entry type uses different names
		// for the mount point (Path) and the superblock options (SuperOpts).
		var info MountInfo
		info.MountID = entry.MountID
		info.ParentID = entry.ParentID
		info.MountPoint = entry.Path
		info.Root = entry.Root
		info.FSType = entry.FSType
		info.Source = entry.Source
		info.Options = entry.Options
		info.SuperOptions = entry.SuperOpts
		result = append(result, info)
	}

	return result, nil
}