"vscode:/vscode.git/clone" did not exist on "c754652fcd1a5ac0e727343486657f5ef71b3252"
main.go 4.24 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
package main

import (
	"fmt"
	"get-container/cmd/opsflow/backend"
	"get-container/cmd/opsflow/web"
	"get-container/utils"
	"log"

	"github.com/spf13/pflag"
11
	"github.com/spf13/viper"
12
13
14
15
16
)

var (
	flagPort   = pflag.Int16P("port", "p", 10880, "listen port for service")
	flagServer = pflag.BoolP("server", "s", false, "run as server mode")
17
	flagCmd    = pflag.StringP("cmd", "c", "all", "command to execute, sys/dcu/login/rccl/all")
18
	flagHelp   = pflag.BoolP("help", "h", false, "show help message")
19
	flagCfg    = pflag.String("config", "./opsflow.yaml", "path to config file")
20
21
22
)

func main() {
23
24
25
26
	cfg := viper.New()
	pflag.String("rccl-test-path", "/opt/rccl-tests/build", "Path to rccl-tests")
	pflag.StringSlice("rccl-all-reduce-perf-args", []string{"-b", "8", "-e", "1G", "-f", "2", "-g", "8", "-d", "half"}, "Arguments for rccl all reduce perf")

27
28
29
30
31
32
33
34
	pflag.Parse()
	if *flagHelp {
		fmt.Println(`this is opsflow command line tool.
Usage:
	opsflow [options]

Options:`)
		pflag.PrintDefaults()
35
36
37
		fmt.Println(`Env Valiables:
           OPSFLOW_RCCL_TEST_PATH: set rccl test path
OPSFLOW_RCCL_ALL_REDUCE_PERF_ARGS: set rccl all reduce perf args`)
38
39
40
		return
	}

41
42
43
44
45
46
47
48
49
50
51
52
	cfg.SetDefault("rccl_all_reduce_perf_args", []string{"-b", "8", "-e", "1G", "-f", "2", "-g", "8", "-d", "half"})
	cfg.SetDefault("rccl_test_path", "/opt/rccl-tests/build")
	cfg.SetEnvPrefix("OPSFLOW")
	cfg.AutomaticEnv()
	cfg.BindPFlag("rccl_test_path", pflag.Lookup("rccl-test-path"))
	cfg.BindPFlag("rccl_all_reduce_perf_args", pflag.Lookup("rccl-all-reduce-perf-args"))
	cfg.SetConfigType("yaml")
	if flagCfg != nil {
		cfg.SetConfigFile(*flagCfg)
	}
	cfg.ReadInConfig()

53
54
55
56
	backend.Init()
	defer backend.Shutdown()
	if *flagServer {
		log.Println("start opsflow server mode")
57
		web.Init(cfg)
58
59
60
61
		err := web.WebServer(fmt.Sprintf(":%d", *flagPort))
		if err != nil {
			log.Fatalf("failed to start web server: %v", err)
		}
62
		return
63
64
65
66
67
68
69
70
	}
	switch *flagCmd {
	case "sys":
		PrintSysLoad()
	case "dcu":
		PrintDCUInfo()
	case "login":
		PrintLoginInfo()
71
72
	case "rccl":
		PrintRcclInfo(cfg.GetString("rccl_test_path"), cfg.GetStringSlice("rccl_all_reduce_perf_args")...)
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
	case "all":
		PrintSysLoad()
		PrintDCUInfo()
		PrintLoginInfo()
	default:
		log.Fatalf("unknown command: %s", *flagCmd)
	}
}

// PrintSysLoad fetches the current system load from the backend and writes a
// report to standard output: CPU usage, the 1/5/15 minute load averages, and
// total size plus usage percentage for memory and swap. The report ends with
// an empty line.
//
// If the backend returns an error the process is terminated via log.Fatalf.
func PrintSysLoad() {
	sysLoad, err := backend.GetSysLoad()
	if err != nil {
		log.Fatalf("failed to get sys load: %v", err)
	}

	// Totals are wrapped in utils.MemorySize so they can be rendered in a
	// human readable unit.
	totalMem := utils.MemorySize{Unit: utils.Byte, Num: sysLoad.MemTotal}
	totalSwap := utils.MemorySize{Unit: utils.Byte, Num: sysLoad.SwapTotal}

	fmt.Println("============== sysload =================")
	fmt.Printf("CPU Usage: %.2f%%\n", sysLoad.CPUPercent)
	fmt.Printf(
		"Load Average (1m, 5m, 15m): %.2f, %.2f, %.2f\n",
		sysLoad.LoadAverage1,
		sysLoad.LoadAverage5,
		sysLoad.LoadAverage15,
	)

	fmt.Printf("Total Memory: %s\n", totalMem.HumanReadStr(1))
	fmt.Printf("Memory Usage: %.2f%%\n", sysLoad.MemUsagePercent)

	fmt.Printf("Total Swap: %s\n", totalSwap.HumanReadStr(1))
	fmt.Printf("Swap Usage: %.2f%%\n", sysLoad.SwapUsagePercent)

	fmt.Println()
}

// PrintLoginInfo writes one line per entry returned by backend.GetOnlineUser
// to standard output, showing the user name, terminal, login origin and login
// time. The listing ends with an empty line.
//
// If the backend returns an error the process is terminated via log.Fatalf.
func PrintLoginInfo() {
	// Layout used to render each login time.
	const timeLayout = "2006-01-02 15:04:05"

	sessions, err := backend.GetOnlineUser()
	if err != nil {
		log.Fatalf("failed to get login info: %v", err)
	}

	fmt.Println("============== login info =================")
	for _, s := range sessions {
		since := s.LoginTime.Format(timeLayout)
		fmt.Printf(
			"User: %s, Terminal: %s, Login From: %s, Login Time: %s\n",
			s.Name, s.TTY, s.LoginFrom, since,
		)
	}
	fmt.Println()
}

// PrintDCUInfo writes one line per device returned by backend.GetDCULoad to
// standard output: index, fan speed, temperature, the two power readings
// (PwrCap and PwrAvg), total and used VRAM, and utilisation. The listing ends
// with an empty line.
//
// If the backend returns an error the process is terminated via log.Fatalf.
func PrintDCUInfo() {
	dcus, err := backend.GetDCULoad()
	if err != nil {
		log.Fatalf("failed to get dcu info: %v", err)
	}
	fmt.Println("============== dcu info =================")
	for _, dcu := range dcus {
		// VRAM figures are wrapped in utils.MemorySize so they can be
		// rendered in a human readable unit.
		memTotal := utils.MemorySize{Unit: utils.Byte, Num: dcu.MemTotal}
		memUsed := utils.MemorySize{Unit: utils.Byte, Num: dcu.MemUsed}
		// PwrCap and PwrAvg are distinct readings and carry distinct labels.
		// NOTE(review): "Power Capture" for PwrCap presumably means the
		// power cap (limit) — confirm against the backend before renaming.
		fmt.Printf("DCU index: %d Fan speed: %s Temperature: %.2f°C Power Capture: %.2fw Power Average: %.2fw VRAM total: %s VRAM used: %s DCUUtils: %.2f%%\n", dcu.Index, dcu.Fan, dcu.Temp, dcu.PwrCap, dcu.PwrAvg, memTotal.HumanReadStr(1), memUsed.HumanReadStr(1), dcu.DCUUTil)
	}
	fmt.Println("")
}

// PrintRcclInfo runs backend.AllReducePerf with the given rccl-tests path and
// arguments and writes the returned output, framed by a header line and a
// trailing empty line, to standard output.
//
// rcclTestPath is passed through unchanged as the first argument of
// backend.AllReducePerf; args are forwarded as its variadic arguments.
//
// If the backend returns an error the process is terminated via log.Fatalf.
func PrintRcclInfo(rcclTestPath string, args ...string) {
	// The second result of AllReducePerf is not needed for this report.
	// NOTE(review): confirm it carries nothing that should be shown to the user.
	output, _, err := backend.AllReducePerf(rcclTestPath, args...)
	if err != nil {
		log.Fatalf("failed to get rccl info: %v", err)
	}
	fmt.Println("============== rccl all reduce perf =================")
	fmt.Println(output)
	fmt.Println("")
}