main.go 4.61 KB
Newer Older
1
2
3
4
5
6
7
8
9
package main

import (
	"fmt"
	"get-container/cmd/opsflow/backend"
	"get-container/cmd/opsflow/web"
	"get-container/utils"
	"log"

10
11
	"get-container/cmd/opsflow/docs"

12
	"github.com/spf13/pflag"
13
	"github.com/spf13/viper"
14
15
16
17
18
)

var (
	flagPort   = pflag.Int16P("port", "p", 10880, "listen port for service")
	flagServer = pflag.BoolP("server", "s", false, "run as server mode")
19
	flagCmd    = pflag.StringP("cmd", "c", "all", "command to execute, sys/dcu/login/rccl/all")
20
	flagHelp   = pflag.BoolP("help", "h", false, "show help message")
21
	flagCfg    = pflag.String("config", "./opsflow.yaml", "path to config file")
22
	_          = pflag.BoolP("debug", "d", false, "enable debug mode. If enabled, Swagger will be available")
23
24
)

25
26
27
28
// @title OpsFlow API
// @version 1.0
// @description 这是opsflow节点命令在服务模式下的接口文档
// @BasePath /api/cmd
29
func main() {
30
31
32

	docs.SwaggerInfo.Title = "OpsFlow API"

33
34
	cfg := viper.New()
	pflag.String("rccl-test-path", "/opt/rccl-tests/build", "Path to rccl-tests")
35
	pflag.String("rccl-all-reduce-perf-args", "-b 8 -e 1G -f 2 -g 8 -d half", "Arguments for rccl all reduce perf")
36

37
38
39
40
41
42
43
44
	pflag.Parse()
	if *flagHelp {
		fmt.Println(`this is opsflow command line tool.
Usage:
	opsflow [options]

Options:`)
		pflag.PrintDefaults()
45
46
47
		fmt.Println(`Env Valiables:
           OPSFLOW_RCCL_TEST_PATH: set rccl test path
OPSFLOW_RCCL_ALL_REDUCE_PERF_ARGS: set rccl all reduce perf args`)
48
49
50
		return
	}

51
52
	cfg.SetDefault("debug_mode", false)
	cfg.SetDefault("rccl_all_reduce_perf_args", "-b 8 -e 1G -f 2 -g 8 -d half")
53
54
55
	cfg.SetDefault("rccl_test_path", "/opt/rccl-tests/build")
	cfg.SetEnvPrefix("OPSFLOW")
	cfg.AutomaticEnv()
liming6's avatar
liming6 committed
56
	cfg.SetDefault("auth_key", "OA5BDGLJ2DYGAWNCLJYSNZFAESPQ7BRL")
57
	cfg.BindPFlag("debug_mode", pflag.Lookup("debug"))
58
59
60
61
62
63
64
65
	cfg.BindPFlag("rccl_test_path", pflag.Lookup("rccl-test-path"))
	cfg.BindPFlag("rccl_all_reduce_perf_args", pflag.Lookup("rccl-all-reduce-perf-args"))
	cfg.SetConfigType("yaml")
	if flagCfg != nil {
		cfg.SetConfigFile(*flagCfg)
	}
	cfg.ReadInConfig()

66
67
68
69
	backend.Init()
	defer backend.Shutdown()
	if *flagServer {
		log.Println("start opsflow server mode")
70
		web.Init(cfg)
71
72
73
74
		err := web.WebServer(fmt.Sprintf(":%d", *flagPort))
		if err != nil {
			log.Fatalf("failed to start web server: %v", err)
		}
75
		return
76
77
78
79
80
81
82
83
	}
	switch *flagCmd {
	case "sys":
		PrintSysLoad()
	case "dcu":
		PrintDCUInfo()
	case "login":
		PrintLoginInfo()
84
	case "rccl":
85
		PrintRcclInfo(cfg.GetString("rccl_test_path"), cfg.GetString("rccl_all_reduce_perf_args"))
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
	case "all":
		PrintSysLoad()
		PrintDCUInfo()
		PrintLoginInfo()
	default:
		log.Fatalf("unknown command: %s", *flagCmd)
	}
}

// PrintSysLoad fetches the system load from backend.GetSysLoad and writes a
// short report to stdout: CPU usage, the 1/5/15-minute load averages, and the
// total size and usage percentage of memory and swap. The process is
// terminated via log.Fatalf if the load cannot be obtained.
func PrintSysLoad() {
	stats, err := backend.GetSysLoad()
	if err != nil {
		log.Fatalf("failed to get sys load: %v", err)
	}

	// Wrap the raw totals so they can be rendered in human-readable form.
	totalMem := utils.MemorySize{Unit: utils.Byte, Num: stats.MemTotal}
	totalSwap := utils.MemorySize{Unit: utils.Byte, Num: stats.SwapTotal}

	fmt.Println("============== sysload =================")
	fmt.Printf("CPU Usage: %.2f%%\n", stats.CPUPercent)
	fmt.Printf(
		"Load Average (1m, 5m, 15m): %.2f, %.2f, %.2f\n",
		stats.LoadAverage1, stats.LoadAverage5, stats.LoadAverage15,
	)
	fmt.Printf("Total Memory: %s\n", totalMem.HumanReadStr(1))
	fmt.Printf("Memory Usage: %.2f%%\n", stats.MemUsagePercent)
	fmt.Printf("Total Swap: %s\n", totalSwap.HumanReadStr(1))
	fmt.Printf("Swap Usage: %.2f%%\n", stats.SwapUsagePercent)
	fmt.Println()
}

// PrintLoginInfo writes one line to stdout for every session returned by
// backend.GetOnlineUser, showing the user name, terminal, login origin and
// login time. The process is terminated via log.Fatalf if the sessions cannot
// be obtained.
func PrintLoginInfo() {
	// Layout used to render each session's login time.
	const timeLayout = "2006-01-02 15:04:05"

	sessions, err := backend.GetOnlineUser()
	if err != nil {
		log.Fatalf("failed to get login info: %v", err)
	}

	fmt.Println("============== login info =================")
	for _, s := range sessions {
		since := s.LoginTime.Format(timeLayout)
		fmt.Printf(
			"User: %s, Terminal: %s, Login From: %s, Login Time: %s\n",
			s.Name, s.TTY, s.LoginFrom, since,
		)
	}
	fmt.Println()
}

// PrintDCUInfo writes one line to stdout for every device returned by
// backend.GetDCULoad: index, fan speed, temperature, the PwrCap and PwrAvg
// power readings, total and used VRAM, and utilization. The process is
// terminated via log.Fatalf if the device data cannot be obtained.
func PrintDCUInfo() {
	dcus, err := backend.GetDCULoad()
	if err != nil {
		log.Fatalf("failed to get dcu info: %v", err)
	}
	fmt.Println("============== dcu info =================")
	for _, dcu := range dcus {
		memTotal := utils.MemorySize{Unit: utils.Byte, Num: dcu.MemTotal}
		memUsed := utils.MemorySize{Unit: utils.Byte, Num: dcu.MemUsed}
		// PwrCap and PwrAvg are two different readings; the second one used to
		// be printed under the same "Power Capture" label as the first.
		fmt.Printf(
			"DCU index: %d Fan speed: %s Temperature: %.2f°C Power Capture: %.2fw Power Average: %.2fw VRAM total: %s VRAM used: %s DCUUtils: %.2f%%\n",
			dcu.Index, dcu.Fan, dcu.Temp, dcu.PwrCap, dcu.PwrAvg,
			memTotal.HumanReadStr(1), memUsed.HumanReadStr(1), dcu.DCUUTil,
		)
	}
	fmt.Println("")
}

138
139
// PrintRcclInfo passes rccl_test_path and args to backend.AllReducePerf and
// prints the returned output to stdout under a banner line. The process is
// terminated via log.Fatalf if AllReducePerf returns an error.
func PrintRcclInfo(rccl_test_path string, args string) {
	result, err := backend.AllReducePerf(rccl_test_path, args)
	if err != nil {
		log.Fatalf("failed to get rccl info: %v", err)
	}

	fmt.Println("============== rccl all reduce perf =================")
	fmt.Println(result)
	fmt.Println()
}