// Command opsflow prints system load, DCU, login and rccl all-reduce
// information on the command line, or serves it over HTTP in server mode.
package main

import (
	"errors"
	"fmt"
	"io/fs"
	"log"

	"github.com/spf13/pflag"
	"github.com/spf13/viper"

	"get-container/cmd/opsflow/backend"
	"get-container/cmd/opsflow/web"
	"get-container/utils"
)

// Configuration keys and defaults shared by the flag definitions and the
// viper defaults, so the two cannot drift apart.
const (
	keyRcclTestPath      = "rccl_test_path"
	keyAllReducePerfArgs = "rccl_all_reduce_perf_args"

	flagNameRcclTestPath      = "rccl-test-path"
	flagNameAllReducePerfArgs = "rccl-all-reduce-perf-args"

	defaultRcclTestPath = "/opt/rccl-tests/build"
)

// defaultAllReducePerfArgs returns a fresh copy of the default argument list
// for the rccl all-reduce perf run, so callers never share a backing array.
func defaultAllReducePerfArgs() []string {
	return []string{"-b", "8", "-e", "1G", "-f", "2", "-g", "8", "-d", "half"}
}

var (
	// flagPort is an unsigned 16-bit value so the full port range (0-65535)
	// can be given; a signed 16-bit flag rejects anything above 32767.
	flagPort   = pflag.Uint16P("port", "p", 10880, "listen port for service")
	flagServer = pflag.BoolP("server", "s", false, "run as server mode")
	flagCmd    = pflag.StringP("cmd", "c", "all", "command to execute, sys/dcu/login/rccl/all")
	flagHelp   = pflag.BoolP("help", "h", false, "show help message")
	flagCfg    = pflag.String("config", "./opsflow.yaml", "path to config file")
)

// main runs the tool and exits non-zero with a log message on failure.
//
// All work happens in run so that deferred cleanup there completes before
// log.Fatal terminates the process (log.Fatal calls os.Exit, which skips
// deferred calls in the calling function).
func main() {
	if err := run(); err != nil {
		log.Fatal(err)
	}
}

// run parses the flags, loads the configuration, and then either starts the
// web server (--server) or executes the command selected with --cmd.
func run() error {
	pflag.String(flagNameRcclTestPath, defaultRcclTestPath, "Path to rccl-tests")
	pflag.StringSlice(flagNameAllReducePerfArgs, defaultAllReducePerfArgs(), "Arguments for rccl all reduce perf")
	pflag.Parse()

	if *flagHelp {
		printHelp()
		return nil
	}

	cfg, err := loadConfig()
	if err != nil {
		return err
	}

	backend.Init()
	defer backend.Shutdown()

	if *flagServer {
		log.Println("start opsflow server mode")
		web.Init(cfg)
		if err := web.WebServer(fmt.Sprintf(":%d", *flagPort)); err != nil {
			return fmt.Errorf("failed to start web server: %w", err)
		}
		return nil
	}

	return runCommand(*flagCmd, cfg)
}

// printHelp writes the usage text, the flag defaults and the supported
// environment variables to standard output.
func printHelp() {
	fmt.Println(`this is opsflow command line tool.
Usage:
  opsflow [options]
Options:`)
	pflag.PrintDefaults()
	fmt.Println(`Env Variables:
  OPSFLOW_RCCL_TEST_PATH: set rccl test path
  OPSFLOW_RCCL_ALL_REDUCE_PERF_ARGS: set rccl all reduce perf args`)
}

// loadConfig builds the viper instance from defaults, OPSFLOW_-prefixed
// environment variables, the rccl command-line flags and the YAML file named
// by --config.
//
// A config file that cannot be read does not abort the program; see
// reportConfigError for when a warning is logged.
func loadConfig() (*viper.Viper, error) {
	cfg := viper.New()
	cfg.SetDefault(keyAllReducePerfArgs, defaultAllReducePerfArgs())
	cfg.SetDefault(keyRcclTestPath, defaultRcclTestPath)
	cfg.SetEnvPrefix("OPSFLOW")
	cfg.AutomaticEnv()

	if err := cfg.BindPFlag(keyRcclTestPath, pflag.Lookup(flagNameRcclTestPath)); err != nil {
		return nil, fmt.Errorf("binding flag --%s: %w", flagNameRcclTestPath, err)
	}
	if err := cfg.BindPFlag(keyAllReducePerfArgs, pflag.Lookup(flagNameAllReducePerfArgs)); err != nil {
		return nil, fmt.Errorf("binding flag --%s: %w", flagNameAllReducePerfArgs, err)
	}

	cfg.SetConfigType("yaml")
	cfg.SetConfigFile(*flagCfg)
	if err := cfg.ReadInConfig(); err != nil {
		reportConfigError(err)
	}
	return cfg, nil
}

// reportConfigError logs a warning for a config file that could not be
// loaded. It stays silent only when the file is merely absent and the user
// did not ask for it explicitly with --config, since running without the
// default ./opsflow.yaml is a normal setup.
func reportConfigError(err error) {
	var notFound viper.ConfigFileNotFoundError
	missing := errors.Is(err, fs.ErrNotExist) || errors.As(err, &notFound)

	explicit := false
	if f := pflag.Lookup("config"); f != nil {
		explicit = f.Changed
	}

	if missing && !explicit {
		return
	}
	log.Printf("warning: could not load config file %q: %v", *flagCfg, err)
}

// runCommand executes the report selected by cmd. "all" prints the system
// load, DCU and login reports in that order and stops at the first failure;
// the rccl report only runs when requested by name.
func runCommand(cmd string, cfg *viper.Viper) error {
	switch cmd {
	case "sys":
		return printSysLoad()
	case "dcu":
		return printDCUInfo()
	case "login":
		return printLoginInfo()
	case "rccl":
		return printRcclInfo(cfg.GetString(keyRcclTestPath), cfg.GetStringSlice(keyAllReducePerfArgs)...)
	case "all":
		for _, report := range []func() error{printSysLoad, printDCUInfo, printLoginInfo} {
			if err := report(); err != nil {
				return err
			}
		}
		return nil
	default:
		return fmt.Errorf("unknown command: %s", cmd)
	}
}

// PrintSysLoad prints the CPU, load-average, memory and swap figures returned
// by backend.GetSysLoad. It terminates the process if they cannot be fetched.
func PrintSysLoad() {
	if err := printSysLoad(); err != nil {
		log.Fatal(err)
	}
}

// printSysLoad is the error-returning implementation behind PrintSysLoad.
func printSysLoad() error {
	load, err := backend.GetSysLoad()
	if err != nil {
		return fmt.Errorf("failed to get sys load: %w", err)
	}

	fmt.Println("============== sysload =================")
	fmt.Printf("CPU Usage: %.2f%%\n", load.CPUPercent)
	fmt.Printf("Load Average (1m, 5m, 15m): %.2f, %.2f, %.2f\n", load.LoadAverage1, load.LoadAverage5, load.LoadAverage15)
	memSize := utils.MemorySize{Unit: utils.Byte, Num: load.MemTotal}
	fmt.Printf("Total Memory: %s\n", memSize.HumanReadStr(1))
	fmt.Printf("Memory Usage: %.2f%%\n", load.MemUsagePercent)
	swapSize := utils.MemorySize{Unit: utils.Byte, Num: load.SwapTotal}
	fmt.Printf("Total Swap: %s\n", swapSize.HumanReadStr(1))
	fmt.Printf("Swap Usage: %.2f%%\n", load.SwapUsagePercent)
	fmt.Println("")
	return nil
}

// PrintLoginInfo prints one line per entry returned by backend.GetOnlineUser.
// It terminates the process if the list cannot be fetched.
func PrintLoginInfo() {
	if err := printLoginInfo(); err != nil {
		log.Fatal(err)
	}
}

// printLoginInfo is the error-returning implementation behind PrintLoginInfo.
func printLoginInfo() error {
	logins, err := backend.GetOnlineUser()
	if err != nil {
		return fmt.Errorf("failed to get login info: %w", err)
	}

	fmt.Println("============== login info =================")
	for _, login := range logins {
		fmt.Printf("User: %s, Terminal: %s, Login From: %s, Login Time: %s\n",
			login.Name, login.TTY, login.LoginFrom, login.LoginTime.Format("2006-01-02 15:04:05"))
	}
	fmt.Println("")
	return nil
}

// PrintDCUInfo prints one line per device returned by backend.GetDCULoad.
// It terminates the process if the device list cannot be fetched.
func PrintDCUInfo() {
	if err := printDCUInfo(); err != nil {
		log.Fatal(err)
	}
}

// printDCUInfo is the error-returning implementation behind PrintDCUInfo.
func printDCUInfo() error {
	dcus, err := backend.GetDCULoad()
	if err != nil {
		return fmt.Errorf("failed to get dcu info: %w", err)
	}

	fmt.Println("============== dcu info =================")
	for _, dcu := range dcus {
		memTotal := utils.MemorySize{Unit: utils.Byte, Num: dcu.MemTotal}
		memUsed := utils.MemorySize{Unit: utils.Byte, Num: dcu.MemUsed}
		// PwrCap and PwrAvg are distinct readings and need distinct labels.
		fmt.Printf("DCU index: %d Fan speed: %s Temperature: %.2f°C Power Capture: %.2fw Power Average: %.2fw VRAM total: %s VRAM used: %s DCUUtils: %.2f%%\n",
			dcu.Index, dcu.Fan, dcu.Temp, dcu.PwrCap, dcu.PwrAvg,
			memTotal.HumanReadStr(1), memUsed.HumanReadStr(1), dcu.DCUUTil)
	}
	fmt.Println("")
	return nil
}

// PrintRcclInfo passes rcclTestPath and args to backend.AllReducePerf and
// prints the output it returns. It terminates the process if the call fails.
func PrintRcclInfo(rcclTestPath string, args ...string) {
	if err := printRcclInfo(rcclTestPath, args...); err != nil {
		log.Fatal(err)
	}
}

// printRcclInfo is the error-returning implementation behind PrintRcclInfo.
func printRcclInfo(rcclTestPath string, args ...string) error {
	// Only the first result of AllReducePerf is printed; the second one is
	// not used by this command.
	output, _, err := backend.AllReducePerf(rcclTestPath, args...)
	if err != nil {
		return fmt.Errorf("failed to get rccl info: %w", err)
	}

	fmt.Println("============== rccl all reduce perf =================")
	fmt.Println(output)
	fmt.Println("")
	return nil
}