"vllm/vscode:/vscode.git/clone" did not exist on "ee59a7c61574485cf4ddbc6037ba557941be5c56"
dump.go 4.04 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
package criu

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"time"

	criulib "github.com/checkpoint-restore/go-criu/v8"
	criurpc "github.com/checkpoint-restore/go-criu/v8/rpc"
	"github.com/go-logr/logr"
	"google.golang.org/protobuf/proto"

15
16
	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
)

// File names used by the dump path, relative to the checkpoint directory.
const (
	// dumpLogFilename is the value passed to CRIU as the LogFile option.
	dumpLogFilename = "dump.log"
	// criuConfFilename is the name of the config file BuildDumpOptions writes
	// into the checkpoint directory.
	criuConfFilename = "criu.conf"
)

// BuildDumpOptions assembles the CRIU RPC options for dumping the container
// described by state, layering the optional settings on top.
//
// Options that cannot be passed via RPC are written to a criu.conf file inside
// checkpointDir, and the path of that file is recorded in the returned
// options. ImagesDirFd is deliberately left unset; ExecuteDump opens the image
// directory at dump time.
//
// When settings is nil, only the options derived from state are returned and
// no criu.conf is written.
func BuildDumpOptions(
	state *types.CheckpointContainerSnapshot,
	settings *types.CRIUSettings,
	checkpointDir string,
	log logr.Logger,
) (*criurpc.CriuOpts, error) {
	// Masked paths are only read when the OCI spec and its Linux section exist.
	var masked []string
	if spec := state.OCISpec; spec != nil && spec.Linux != nil {
		masked = spec.Linux.MaskedPaths
	}

	extMounts, skipMounts := common.BuildMountPolicy(state.Mounts, state.RootFS, masked)
	log.V(1).Info("Resolved mount policy for CRIU dump",
		"externalized_count", len(extMounts),
		"skipped_count", len(skipMounts),
	)

	// The network namespace is always externalized.
	netNS := fmt.Sprintf("net[%d]:extNetNs", state.NetNSInode)
	opts := &criurpc.CriuOpts{
		Pid:      proto.Int32(int32(state.PID)),
		Root:     proto.String(state.RootFS),
		LogFile:  proto.String(dumpLogFilename),
		External: []string{netNS},
		ExtMnt:   toExtMountMaps(extMounts),
		SkipMnt:  skipMounts,
	}

	if cgroup := state.HostCgroupPath; cgroup != "" {
		opts.FreezeCgroup = proto.String(cgroup)
	}

	// Without settings there is nothing further to apply.
	if settings == nil {
		return opts, nil
	}

	if err := applyCommonSettings(opts, settings); err != nil {
		return nil, err
	}

	// Dump-only options.
	opts.LeaveRunning = proto.Bool(settings.LeaveRunning)
	opts.OrphanPtsMaster = proto.Bool(settings.OrphanPtsMaster)
	opts.ExtMasters = proto.Bool(settings.ExtMasters)
	opts.AutoDedup = proto.Bool(settings.AutoDedup)
	opts.LazyPages = proto.Bool(settings.LazyPages)

	// A zero ghost limit leaves the option unset.
	if limit := settings.GhostLimit; limit > 0 {
		opts.GhostLimit = proto.Uint32(limit)
	}

	// Only write criu.conf when there is at least one directive to put in it.
	conf := buildCRIUConf(settings)
	if conf == "" {
		return opts, nil
	}

	confPath := filepath.Join(checkpointDir, criuConfFilename)
	if err := os.WriteFile(confPath, []byte(conf), 0644); err != nil {
		return nil, fmt.Errorf("failed to write criu.conf: %w", err)
	}
	opts.ConfigFile = proto.String(confPath)

	return opts, nil
}

// ExecuteDump runs a CRIU dump using criuOpts, with checkpointDir as the image
// directory.
//
// It opens checkpointDir via openPathForCRIU, stores the resulting descriptor
// in criuOpts.ImagesDirFd (mutating the caller's options), and closes the
// returned handle before returning. When settings is non-nil and its
// BinaryPath is not blank, that binary is verified with os.Stat and used
// instead of the client's default.
//
// On success it returns the elapsed time measured from just before the CRIU
// client is created. On any failure it returns a zero duration and an error
// wrapping the cause.
func ExecuteDump(
	criuOpts *criurpc.CriuOpts,
	checkpointDir string,
	settings *types.CRIUSettings,
	log logr.Logger,
) (time.Duration, error) {
	imageDir, imageDirFD, err := openPathForCRIU(checkpointDir)
	if err != nil {
		return 0, fmt.Errorf("failed to open image directory: %w", err)
	}
	defer imageDir.Close()
	criuOpts.ImagesDirFd = proto.Int32(imageDirFD)

	criuDumpStart := time.Now()
	criuClient := criulib.MakeCriu()
	if settings != nil {
		// Use the trimmed path for the lookup too, not just for the blank
		// check, so stray whitespace in the configured value does not turn
		// into a misleading "binary not found" error.
		if binaryPath := strings.TrimSpace(settings.BinaryPath); binaryPath != "" {
			if _, err := os.Stat(binaryPath); err != nil {
				return 0, fmt.Errorf("criu binary not found at %s: %w", binaryPath, err)
			}
			criuClient.SetCriuPath(binaryPath)
		}
	}

	if err := criuClient.Dump(criuOpts, nil); err != nil {
		dumpDuration := time.Since(criuDumpStart)
		// Log where the dump log should be so the failure can be investigated.
		log.Error(err, "CRIU dump failed",
			"duration", dumpDuration,
			"checkpoint_dir", checkpointDir,
			"dump_log_path", filepath.Join(checkpointDir, dumpLogFilename),
		)
		return 0, fmt.Errorf("CRIU dump failed: %w", err)
	}

	criuDumpDuration := time.Since(criuDumpStart)
	log.Info("CRIU dump completed", "duration", criuDumpDuration)
	return criuDumpDuration, nil
}

// buildCRIUConf renders the criu.conf contents for c, one newline-terminated
// directive per line. It returns the empty string when c is nil or when none
// of LibDir, AllowUprobes or SkipInFlight is set.
func buildCRIUConf(c *types.CRIUSettings) string {
	if c == nil {
		return ""
	}

	var conf strings.Builder
	if dir := c.LibDir; dir != "" {
		conf.WriteString("libdir ")
		conf.WriteString(dir)
		conf.WriteString("\n")
	}
	if c.AllowUprobes {
		conf.WriteString("allow-uprobes\n")
	}
	if c.SkipInFlight {
		conf.WriteString("skip-in-flight\n")
	}
	return conf.String()
}