// Package main provides the snapshot-agent DaemonSet entrypoint.
// The agent runs the node-local snapshot controller and delegates CRIU/CUDA
// execution to the snapshot executor workflows.
package main

import (
	"context"
	"errors"
	"os"
	"os/signal"
	"syscall"

	"github.com/containerd/containerd"
	"github.com/go-logr/logr"

	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/controller"
	"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
)

func main() {
21
22
23
	rootLog := logging.ConfigureLogger("stdout")
	agentLog := rootLog.WithName("agent")

24
25
	cfg, err := LoadConfigOrDefault(ConfigMapPath)
	if err != nil {
26
		fatal(agentLog, err, "Failed to load configuration")
27
	}
28
29
	if err := cfg.Validate(); err != nil {
		fatal(agentLog, err, "Invalid configuration")
30
31
	}

32
	ctrd, err := containerd.New(common.ContainerdSocket)
33
	if err != nil {
34
		fatal(agentLog, err, "Failed to connect to containerd")
35
	}
36
	defer ctrd.Close()
37
38
39
40
41
42
43

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

44
	agentLog.Info("Starting snapshot agent",
45
		"node", cfg.NodeName,
46
		"restricted_namespace", cfg.RestrictedNamespace,
47
	)
48

49
	nodeController, err := controller.NewNodeController(cfg, ctrd, rootLog.WithName("controller"))
50
	if err != nil {
51
		fatal(agentLog, err, "Failed to create snapshot node controller")
52
	}
53

54
55
	// Run the node-local controller in the background.
	controllerDone := make(chan error, 1)
56
	go func() {
57
58
		agentLog.Info("Snapshot node controller started")
		controllerDone <- nodeController.Run(ctx)
59
60
	}()

61
	// Wait for signal or controller exit.
62
63
64
65
66
	select {
	case <-sigChan:
		agentLog.Info("Shutting down")
		cancel()
		select {
67
		case err := <-controllerDone:
68
			if err != nil {
69
				agentLog.Error(err, "Snapshot node controller exited with error during shutdown")
70
			}
71
		default:
72
		}
73
	case err := <-controllerDone:
74
		if err != nil {
75
			fatal(agentLog, err, "Snapshot node controller exited with error")
76
		}
77
	}
78

79
80
	agentLog.Info("Agent stopped")
}
81

82
83
84
85
86
// fatal logs msg together with keysAndValues and then terminates the process
// with exit status 1.
//
// A non-nil err is logged through log.Error; when err is nil the message is
// logged through log.Info instead. Because it calls os.Exit, deferred
// functions in the caller do not run.
func fatal(log logr.Logger, err error, msg string, keysAndValues ...interface{}) {
	if err == nil {
		log.Info(msg, keysAndValues...)
	} else {
		log.Error(err, msg, keysAndValues...)
	}

	os.Exit(1)
}