package main import ( "fmt" "get-container/cmd/opsflow/backend" "get-container/cmd/opsflow/web" "get-container/utils" "log" "get-container/cmd/opsflow/docs" "github.com/spf13/pflag" "github.com/spf13/viper" ) var ( flagPort = pflag.Int16P("port", "p", 10880, "listen port for service") flagServer = pflag.BoolP("server", "s", false, "run as server mode") flagCmd = pflag.StringP("cmd", "c", "all", "command to execute, sys/dcu/login/rccl/all") flagHelp = pflag.BoolP("help", "h", false, "show help message") flagCfg = pflag.String("config", "./opsflow.yaml", "path to config file") _ = pflag.BoolP("debug", "d", false, "enable debug mode. If enabled, Swagger will be available") ) // @title OpsFlow API // @version 1.0 // @description 这是opsflow节点命令在服务模式下的接口文档 // @BasePath /api/cmd func main() { docs.SwaggerInfo.Title = "OpsFlow API" cfg := viper.New() pflag.String("rccl-test-path", "/opt/rccl-tests/build", "Path to rccl-tests") pflag.String("rccl-all-reduce-perf-args", "-b 8 -e 1G -f 2 -g 8 -d half", "Arguments for rccl all reduce perf") pflag.Parse() if *flagHelp { fmt.Println(`this is opsflow command line tool. Usage: opsflow [options] Options:`) pflag.PrintDefaults() fmt.Println(`Env Valiables: OPSFLOW_RCCL_TEST_PATH: set rccl test path OPSFLOW_RCCL_ALL_REDUCE_PERF_ARGS: set rccl all reduce perf args`) return } cfg.SetDefault("debug_mode", false) cfg.SetDefault("rccl_all_reduce_perf_args", "-b 8 -e 1G -f 2 -g 8 -d half") cfg.SetDefault("rccl_test_path", "/opt/rccl-tests/build") cfg.SetEnvPrefix("OPSFLOW") cfg.AutomaticEnv() cfg.SetDefault("auth_key", "OA5BDGLJ2DYGAWNCLJYSNZFAESPQ7BRL") cfg.BindPFlag("debug_mode", pflag.Lookup("debug")) cfg.BindPFlag("rccl_test_path", pflag.Lookup("rccl-test-path")) cfg.BindPFlag("rccl_all_reduce_perf_args", pflag.Lookup("rccl-all-reduce-perf-args")) cfg.SetConfigType("yaml") if flagCfg != nil { cfg.SetConfigFile(*flagCfg) } cfg.ReadInConfig() backend.Init() defer backend.Shutdown() if *flagServer { log.Println("start opsflow server mode") web.Init(cfg) err := web.WebServer(fmt.Sprintf(":%d", *flagPort)) if err != nil { log.Fatalf("failed to start web server: %v", err) } return } switch *flagCmd { case "sys": PrintSysLoad() case "dcu": PrintDCUInfo() case "login": PrintLoginInfo() case "rccl": PrintRcclInfo(cfg.GetString("rccl_test_path"), cfg.GetString("rccl_all_reduce_perf_args")) case "all": PrintSysLoad() PrintDCUInfo() PrintLoginInfo() default: log.Fatalf("unknown command: %s", *flagCmd) } } func PrintSysLoad() { load, err := backend.GetSysLoad() if err != nil { log.Fatalf("failed to get sys load: %v", err) } fmt.Println("============== sysload =================") fmt.Printf("CPU Usage: %.2f%%\n", load.CPUPercent) fmt.Printf("Load Average (1m, 5m, 15m): %.2f, %.2f, %.2f\n", load.LoadAverage1, load.LoadAverage5, load.LoadAverage15) memSize := utils.MemorySize{Unit: utils.Byte, Num: load.MemTotal} fmt.Printf("Total Memory: %s\n", memSize.HumanReadStr(1)) fmt.Printf("Memory Usage: %.2f%%\n", load.MemUsagePercent) swapSize := utils.MemorySize{Unit: utils.Byte, Num: load.SwapTotal} fmt.Printf("Total Swap: %s\n", swapSize.HumanReadStr(1)) fmt.Printf("Swap Usage: %.2f%%\n", load.SwapUsagePercent) fmt.Println("") } func PrintLoginInfo() { logins, err := backend.GetOnlineUser() if err != nil { log.Fatalf("failed to get login info: %v", err) } fmt.Println("============== login info =================") for _, login := range logins { fmt.Printf("User: %s, Terminal: %s, Login From: %s, Login Time: %s\n", login.Name, login.TTY, login.LoginFrom, login.LoginTime.Format("2006-01-02 15:04:05")) } fmt.Println("") } func PrintDCUInfo() { dcus, err := backend.GetDCULoad() if err != nil { log.Fatalf("failed to get dcu info: %v", err) } fmt.Println("============== dcu info =================") for _, dcu := range dcus { memTotal := utils.MemorySize{Unit: utils.Byte, Num: dcu.MemTotal} memUsed := utils.MemorySize{Unit: utils.Byte, Num: dcu.MemUsed} fmt.Printf("DCU index: %d Fan speed: %s Temperature: %.2f°C Power Capture: %.2fw Power Capture: %.2fw VRAM total: %s VRAM used: %s DCUUtils: %.2f%%\n", dcu.Index, dcu.Fan, dcu.Temp, dcu.PwrCap, dcu.PwrAvg, memTotal.HumanReadStr(1), memUsed.HumanReadStr(1), dcu.DCUUTil) } fmt.Println("") } func PrintRcclInfo(rccl_test_path string, args string) { output, err := backend.AllReducePerf(rccl_test_path, args) if err != nil { log.Fatalf("failed to get rccl info: %v", err) } fmt.Println("============== rccl all reduce perf =================") fmt.Println(output) fmt.Println("") }