main.go 3.07 KB
Newer Older
1
2
3
4
5
6
7
// Package main provides the restore-entrypoint binary for self-restoring placeholder containers.
// This binary replaces the shell script restore-entrypoint.sh with a Go implementation
// that uses the go-criu library for CRIU operations.
package main

import (
	"context"
8
	"fmt"
9
	"os"
10
	"os/exec"
11
	"os/signal"
12
13
	"path/filepath"
	"strings"
14
15
16
17
18
19
20
	"syscall"

	"github.com/sirupsen/logrus"

	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/restore"
)

21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
// logGPUDiagnostics prints a GPU-visibility snapshot to stdout for debugging.
//
// The snapshot is bracketed by banner lines tagged with label and contains:
//   - the output of `nvidia-smi -L` and of a per-GPU memory query,
//   - the paths matching /dev/nvidia*,
//   - the NVIDIA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES environment variables,
//   - the namespace links of PID 1 (/proc/1/ns/*).
//
// Output goes through fmt rather than a logger, so the function can be called
// before any logger has been configured. Every step is best-effort: a failure
// is printed inline and the remaining steps still run.
func logGPUDiagnostics(label string) {
	fmt.Printf("=== GPU DIAGNOSTICS [%s] ===\n", label)

	// runSMI runs nvidia-smi with args and prints whatever it wrote to
	// stdout/stderr. The captured output is printed on failure as well:
	// when nvidia-smi exits non-zero, that text is usually the only place the
	// real cause appears, while err is just "exit status N".
	runSMI := func(okHeader, errHeader string, args ...string) {
		out, err := exec.Command("nvidia-smi", args...).CombinedOutput()
		if err != nil {
			fmt.Printf("%s: error: %v\n", errHeader, err)
		} else {
			fmt.Printf("%s:\n", okHeader)
		}
		fmt.Printf("%s", out)
		// Keep the next diagnostic on its own line even if the command's
		// output did not end with a newline.
		if n := len(out); n > 0 && out[n-1] != '\n' {
			fmt.Println()
		}
	}

	// GPU list
	runSMI("nvidia-smi -L", "nvidia-smi -L", "-L")

	// GPU memory usage
	runSMI(
		"nvidia-smi memory",
		"nvidia-smi memory query",
		"--query-gpu=index,uuid,memory.used,memory.total,memory.free",
		"--format=csv,noheader",
	)

	// /dev/nvidia* devices
	if matches, err := filepath.Glob("/dev/nvidia*"); err != nil {
		fmt.Printf("/dev/nvidia* devices: error: %v\n", err)
	} else {
		fmt.Printf("/dev/nvidia* devices: %s\n", strings.Join(matches, ", "))
	}

	// Device-visibility environment variables
	fmt.Printf("NVIDIA_VISIBLE_DEVICES=%s\n", os.Getenv("NVIDIA_VISIBLE_DEVICES"))
	fmt.Printf("CUDA_VISIBLE_DEVICES=%s\n", os.Getenv("CUDA_VISIBLE_DEVICES"))

	// Linux namespaces for PID 1; a failed readlink is shown in place of the link.
	for _, ns := range []string{"mnt", "pid", "ipc", "net", "uts", "cgroup"} {
		link, err := os.Readlink(fmt.Sprintf("/proc/1/ns/%s", ns))
		if err != nil {
			link = err.Error()
		}
		fmt.Printf("ns/%s: %s\n", ns, link)
	}

	fmt.Printf("=== END GPU DIAGNOSTICS [%s] ===\n", label)
}

59
func main() {
60
61
62
63
64
	// Log GPU diagnostics BEFORE anything else (gated on DEBUG for production quietness)
	if os.Getenv("DEBUG") == "1" {
		logGPUDiagnostics("PRE-RESTORE")
	}

65
66
67
68
69
70
71
72
	// Set up logging
	log := logrus.New()
	log.SetOutput(os.Stdout)
	log.SetFormatter(&logrus.TextFormatter{
		FullTimestamp:   true,
		TimestampFormat: "2006-01-02 15:04:05",
	})

73
74
75
76
77
78
	// Load configuration from hardcoded defaults + operator-injected env vars.
	// os.Args[1:] are the cold start command args (passed by the operator via pod spec).
	cfg, err := restore.NewRestoreRequest(os.Args[1:])
	if err != nil {
		log.WithError(err).Fatal("Failed to load restore configuration")
	}
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

	// Set log level based on DEBUG flag
	if cfg.Debug {
		log.SetLevel(logrus.DebugLevel)
	} else {
		log.SetLevel(logrus.InfoLevel)
	}

	entry := log.WithField("component", "restore-entrypoint")

	// Set up context with signal handling for graceful shutdown
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Handle shutdown signals
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGTERM, syscall.SIGINT)

	go func() {
		sig := <-sigChan
		entry.WithField("signal", sig).Info("Received shutdown signal")
		cancel()
	}()

	// Run the restore entrypoint
	if err := restore.Run(ctx, cfg, entry); err != nil {
		entry.WithError(err).Fatal("Restore entrypoint failed")
	}
}