filesystem.go 9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
// filesystem.go provides container rootfs introspection, filesystem config/metadata types,
// and rootfs diff capture for CRIU checkpoint.
package checkpoint

import (
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/sirupsen/logrus"
)

// FilesystemConfig is the static config for rootfs exclusions (from values.yaml).
//
// It groups the paths that are left out of the rootfs diff archive into three
// lists. Every entry in every list is expected to be relative to the rootfs
// and to begin with "./"; Validate rejects a config where that is not the
// case. GetAllExclusions returns the three lists concatenated in the order
// the fields are declared here.
type FilesystemConfig struct {
	// SystemDirs are system directories that should be excluded from rootfs diff.
	// These directories are typically injected/bind-mounted by NVIDIA GPU Operator
	// at container start time, so they already exist in the restore target.
	// Excluding them prevents conflicts (especially socket files which cannot be overwritten).
	// Default: ["./usr", "./etc", "./opt", "./var", "./run"]
	SystemDirs []string `yaml:"systemDirs"`

	// CacheDirs are cache directories that can safely be excluded to reduce checkpoint size.
	// Model weights and other cached data are typically re-downloaded if needed.
	// Default: ["./.cache/huggingface", "./.cache/torch"]
	CacheDirs []string `yaml:"cacheDirs"`

	// AdditionalExclusions are custom paths to exclude from the rootfs diff.
	// Use this for application-specific exclusions.
	// Paths should be relative with "./" prefix (e.g., "./data/temp").
	AdditionalExclusions []string `yaml:"additionalExclusions"`
}

// GetAllExclusions flattens SystemDirs, CacheDirs and AdditionalExclusions,
// in that order, into a single newly allocated slice. Callers use the result
// to build the tar --exclude arguments for the rootfs diff capture.
//
// A nil receiver yields nil. A non-nil receiver always yields a non-nil
// slice, even when all three lists are empty.
func (c *FilesystemConfig) GetAllExclusions() []string {
	if c == nil {
		return nil
	}

	groups := [][]string{c.SystemDirs, c.CacheDirs, c.AdditionalExclusions}

	// Size the result up front so the appends below never reallocate.
	size := 0
	for _, group := range groups {
		size += len(group)
	}

	combined := make([]string, 0, size)
	for _, group := range groups {
		combined = append(combined, group...)
	}
	return combined
}

// Validate checks every exclusion path in the config and returns a
// *ConfigError describing the first one that does not begin with "./".
// It returns nil when all paths are acceptable or when the receiver is nil.
func (c *FilesystemConfig) Validate() error {
	if c == nil {
		return nil
	}

	for _, excl := range c.GetAllExclusions() {
		if strings.HasPrefix(excl, "./") {
			continue
		}
		// The paths end up as tar exclude patterns, which are matched against
		// "./"-relative member names, so anything else is a config mistake.
		return &ConfigError{
			Field:   "rootfsExclusions",
			Message: "all exclusion paths must start with './' (got: " + excl + ")",
		}
	}
	return nil
}

// FilesystemManifest holds runtime filesystem state captured at checkpoint time.
//
// Fields:
//   - Exclusions: the exclusion config the checkpoint was taken with; its
//     paths are what CaptureRootfsState passes on as tar exclusions.
//   - UpperDir: the overlay upperdir path given to NewFilesystemManifest.
//   - ExternalPaths: the OCI spec's Linux masked paths followed by its
//     read-only paths.
//   - BindMountDests: destinations of the bind mounts found in the OCI spec;
//     CaptureRootfsState excludes them from the rootfs diff as well.
//   - HasRootfsDiff: set to true by CaptureRootfsState after the rootfs diff
//     tar was written successfully.
//   - HasDeletedFiles: set to true by CaptureRootfsState after whiteouts were
//     found and recorded.
type FilesystemManifest struct {
	Exclusions      FilesystemConfig `yaml:"exclusions"`
	UpperDir        string           `yaml:"upperDir,omitempty"`
	ExternalPaths   []string         `yaml:"externalPaths,omitempty"`
	BindMountDests  []string         `yaml:"bindMountDests,omitempty"`
	HasRootfsDiff   bool             `yaml:"hasRootfsDiff"`
	HasDeletedFiles bool             `yaml:"hasDeletedFiles"`
}

// NewFilesystemManifest constructs FilesystemManifest from config, overlay state, and OCI spec.
//
// Exclusions and UpperDir are copied from the arguments unchanged. When
// ociSpec is nil nothing else is populated. Otherwise:
//   - ExternalPaths becomes the spec's Linux masked paths followed by its
//     read-only paths (it stays nil when the spec has no Linux section);
//   - BindMountDests lists the destination of every bind mount in the spec,
//     in spec order.
//
// A mount is treated as a bind mount when its type is "bind" or when its
// options contain "bind" or "rbind". The OCI runtime spec defines the type of
// a bind mount as a dummy value (often "none") and identifies the mount by
// those options, so looking at the type alone would miss such mounts.
func NewFilesystemManifest(exclusions FilesystemConfig, upperDir string, ociSpec *specs.Spec) FilesystemManifest {
	manifest := FilesystemManifest{
		Exclusions: exclusions,
		UpperDir:   upperDir,
	}
	if ociSpec == nil {
		return manifest
	}

	if linux := ociSpec.Linux; linux != nil {
		manifest.ExternalPaths = make([]string, 0, len(linux.MaskedPaths)+len(linux.ReadonlyPaths))
		manifest.ExternalPaths = append(manifest.ExternalPaths, linux.MaskedPaths...)
		manifest.ExternalPaths = append(manifest.ExternalPaths, linux.ReadonlyPaths...)
	}

	for _, m := range ociSpec.Mounts {
		isBind := m.Type == "bind"
		// Fall back to the mount options when the type does not say "bind".
		for i := 0; !isBind && i < len(m.Options); i++ {
			isBind = m.Options[i] == "bind" || m.Options[i] == "rbind"
		}
		if isBind {
			manifest.BindMountDests = append(manifest.BindMountDests, m.Destination)
		}
	}
	return manifest
}

// GetRootFS returns the container's root filesystem path, built as
// <HostProcPath>/<pid>/root. The path is checked with os.Stat first; if that
// fails, an empty string and an error wrapping the stat failure are returned.
func GetRootFS(pid int) (string, error) {
	root := fmt.Sprintf("%s/%d/root", HostProcPath, pid)

	_, statErr := os.Stat(root)
	if statErr == nil {
		return root, nil
	}
	return "", fmt.Errorf("rootfs not accessible at %s: %w", root, statErr)
}

// GetOverlayUpperDir extracts the overlay upperdir (the container's writable
// layer) from the mountinfo of the given pid.
//
// Only mounts whose mount point is "/" and whose filesystem type is "overlay"
// are considered. The value of the first "upperdir=" entry found in such a
// mount's comma-separated super options is returned. A matching mount without
// an upperdir option is skipped and the scan continues. An error is returned
// when mountinfo cannot be read or no upperdir is found.
func GetOverlayUpperDir(pid int) (string, error) {
	mounts, err := ReadMountInfoFromHostProcPath(pid)
	if err != nil {
		return "", fmt.Errorf("failed to parse mountinfo: %w", err)
	}

	const upperKey = "upperdir="

	for _, m := range mounts {
		isRootOverlay := m.MountPoint == "/" && m.FSType == "overlay"
		if !isRootOverlay {
			continue
		}

		for _, field := range strings.Split(m.SuperOptions, ",") {
			if !strings.HasPrefix(field, upperKey) {
				continue
			}
			return strings.TrimPrefix(field, upperKey), nil
		}
	}

	return "", fmt.Errorf("overlay upperdir not found for pid %d", pid)
}

// CaptureRootfsDiff captures the overlay upperdir to a tar file.
// The upperdir contains all filesystem modifications made by the container.
// Excludes bind mount destinations and configured directories to avoid conflicts during restore.
// Returns the path to the tar file or empty string if capture failed.
//
// Parameters:
//   - upperDir: directory to archive; an empty value is rejected with an error.
//   - checkpointDir: directory the archive (RootfsDiffFilename) is written to.
//   - exclusions: configured exclusion paths, passed to tar verbatim; may be nil.
//   - bindMountDests: mount destinations inside the container. Each one is
//     normalized to a cleaned, rooted path before being turned into a
//     "./"-relative exclude pattern, so relative destinations and trailing
//     slashes are handled. Empty destinations and "/" are skipped, because
//     they would otherwise become a pattern that matches the archive root.
//
// The archive is produced by running the external "tar" command with
// --xattrs. When tar fails, the partially written archive is removed and the
// returned error carries tar's combined output.
func CaptureRootfsDiff(upperDir, checkpointDir string, exclusions *FilesystemConfig, bindMountDests []string) (string, error) {
	if upperDir == "" {
		return "", fmt.Errorf("upperdir is empty")
	}

	rootfsDiffPath := filepath.Join(checkpointDir, RootfsDiffFilename)

	// Build tar arguments with xattrs and exclusions
	tarArgs := []string{"--xattrs"}

	// Add configured exclusions (systemDirs, cacheDirs, additionalExclusions from values.yaml)
	if exclusions != nil {
		for _, excl := range exclusions.GetAllExclusions() {
			tarArgs = append(tarArgs, "--exclude="+excl)
		}
	}

	// Add bind mount exclusions passed from caller
	for _, dest := range bindMountDests {
		// Interpret the destination relative to "/" and clean it, so that
		// "etc/hosts", "/etc/hosts" and "/etc/hosts/" all become "/etc/hosts".
		rooted := filepath.Join("/", dest)
		if rooted == "/" {
			continue
		}
		// Convert absolute path to relative for tar (e.g., /etc/hosts -> ./etc/hosts)
		tarArgs = append(tarArgs, "--exclude=."+rooted)
	}
	tarArgs = append(tarArgs, "-C", upperDir, "-cf", rootfsDiffPath, ".")

	output, err := exec.Command("tar", tarArgs...).CombinedOutput()
	if err != nil {
		// Best effort: do not leave a truncated archive behind that could be
		// mistaken for a valid rootfs diff. A removal failure is deliberately
		// ignored; the tar error is the one worth reporting.
		_ = os.Remove(rootfsDiffPath)
		return "", fmt.Errorf("tar failed: %w (output: %s)", err, string(output))
	}

	return rootfsDiffPath, nil
}

// CaptureDeletedFiles looks up the whiteout entries of upperDir via
// FindWhiteoutFiles and, when there are any, stores them as a JSON array in
// checkpointDir under DeletedFilesFilename.
//
// It returns true only when at least one deleted path was found and the file
// was written. An empty upperDir or an empty result yields (false, nil)
// without touching the filesystem. Lookup, marshal and write failures are
// returned as wrapped errors.
func CaptureDeletedFiles(upperDir, checkpointDir string) (bool, error) {
	if upperDir == "" {
		return false, nil
	}

	deleted, err := FindWhiteoutFiles(upperDir)
	switch {
	case err != nil:
		return false, fmt.Errorf("failed to find whiteout files: %w", err)
	case len(deleted) == 0:
		return false, nil
	}

	payload, err := json.Marshal(deleted)
	if err != nil {
		return false, fmt.Errorf("failed to marshal whiteouts: %w", err)
	}

	target := filepath.Join(checkpointDir, DeletedFilesFilename)
	if err := os.WriteFile(target, payload, 0644); err != nil {
		return false, fmt.Errorf("failed to write deleted files: %w", err)
	}

	return true, nil
}

// FindWhiteoutFiles walks upperDir and returns the paths that are marked as
// deleted by ".wh.<filename>" whiteout markers.
//
// Each returned path is relative to upperDir and names the deleted entry
// itself (the marker "dir/.wh.foo" is reported as "dir/foo"). Entries are
// reported in the lexical order produced by filepath.WalkDir. The result is
// nil when no marker is found.
//
// Ignored on purpose:
//   - the walk root itself, whatever its name;
//   - a marker named exactly ".wh.", which names no file and would otherwise
//     resolve to its parent directory;
//   - names starting with ".wh..wh." (for example the opaque-directory marker
//     ".wh..wh..opq"), which are metadata rather than deletions of a file.
//
// Any error met while walking aborts the walk and is returned with a nil list.
//
// NOTE(review): kernel overlayfs documents whiteouts in an upperdir as 0:0
// character devices (and opaque directories as an xattr), not as ".wh."
// files. Those are not detected here — confirm whether they need to be.
func FindWhiteoutFiles(upperDir string) ([]string, error) {
	const (
		whiteoutPrefix     = ".wh."
		whiteoutMetaPrefix = ".wh..wh."
	)

	var whiteouts []string

	// WalkDir avoids the per-entry lstat that Walk performs; only names are needed.
	err := filepath.WalkDir(upperDir, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if path == upperDir {
			return nil
		}

		name := d.Name()
		if !strings.HasPrefix(name, whiteoutPrefix) || strings.HasPrefix(name, whiteoutMetaPrefix) {
			return nil
		}
		deletedName := strings.TrimPrefix(name, whiteoutPrefix)
		if deletedName == "" {
			return nil
		}

		// Convert whiteout marker to actual deleted path
		relPath, err := filepath.Rel(upperDir, path)
		if err != nil {
			return fmt.Errorf("resolving %q relative to %q: %w", path, upperDir, err)
		}
		// Join cleans the result, so a marker directly under upperDir yields
		// the bare file name rather than "./name".
		whiteouts = append(whiteouts, filepath.Join(filepath.Dir(relPath), deletedName))
		return nil
	})
	if err != nil {
		return nil, err
	}

	return whiteouts, nil
}

// CaptureRootfsState captures the overlay upperdir and deleted files after CRIU dump,
// records the outcome in the checkpoint manifest and writes the manifest back.
//
// The function is best effort: a failure of any step (rootfs diff, deleted
// files, manifest write) is logged as a warning and the remaining steps still
// run. HasRootfsDiff and HasDeletedFiles are only set to true for steps that
// succeeded. Nothing is done when upperDir is empty or data is nil.
func CaptureRootfsState(upperDir, checkpointDir string, data *CheckpointManifest, log *logrus.Entry) {
	if data == nil || upperDir == "" {
		return
	}

	// Log what is going to be left out of the archive before running tar.
	exclusionFields := logrus.Fields{
		"configured_exclusions": data.Filesystem.Exclusions.GetAllExclusions(),
		"bind_mount_exclusions": data.Filesystem.BindMountDests,
	}
	log.WithFields(exclusionFields).Debug("Rootfs diff exclusions")

	// Step 1: archive the upperdir using the exclusions stored in the manifest.
	tarPath, diffErr := CaptureRootfsDiff(upperDir, checkpointDir, &data.Filesystem.Exclusions, data.Filesystem.BindMountDests)
	if diffErr == nil {
		data.Filesystem.HasRootfsDiff = true
		log.WithFields(logrus.Fields{
			"upperdir": upperDir,
			"tar_path": tarPath,
		}).Info("Captured rootfs diff")
	} else {
		log.WithError(diffErr).Warn("Failed to capture rootfs diff")
	}

	// Step 2: record deleted files (whiteouts).
	found, delErr := CaptureDeletedFiles(upperDir, checkpointDir)
	switch {
	case delErr != nil:
		log.WithError(delErr).Warn("Failed to capture deleted files")
	case found:
		data.Filesystem.HasDeletedFiles = true
		log.Info("Recorded deleted files (whiteouts)")
	}

	// Step 3: persist the updated flags, whatever the outcome of the steps above.
	if saveErr := WriteCheckpointManifest(checkpointDir, data); saveErr != nil {
		log.WithError(saveErr).Warn("Failed to update checkpoint manifest with rootfs diff info")
	}
}