Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
liming6
dcu-process-montor
Commits
1e4cd019
Commit
1e4cd019
authored
Dec 11, 2025
by
liming6
Browse files
feature 添加swagger接口文档,方便调试
parent
34501708
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
1596 additions
and
85 deletions
+1596
-85
cmd/hytop/hytop
cmd/hytop/hytop
+0
-0
cmd/opsflow/backend/backend_test.go
cmd/opsflow/backend/backend_test.go
+11
-4
cmd/opsflow/backend/main.go
cmd/opsflow/backend/main.go
+10
-3
cmd/opsflow/backend/rccl.go
cmd/opsflow/backend/rccl.go
+41
-23
cmd/opsflow/docs/docs.go
cmd/opsflow/docs/docs.go
+511
-0
cmd/opsflow/docs/swagger.json
cmd/opsflow/docs/swagger.json
+486
-0
cmd/opsflow/docs/swagger.yaml
cmd/opsflow/docs/swagger.yaml
+324
-0
cmd/opsflow/main.go
cmd/opsflow/main.go
+17
-5
cmd/opsflow/opsflow.yaml
cmd/opsflow/opsflow.yaml
+2
-1
cmd/opsflow/web/cmd.go
cmd/opsflow/web/cmd.go
+0
-1
cmd/opsflow/web/main.go
cmd/opsflow/web/main.go
+159
-39
go.mod
go.mod
+35
-9
No files found.
cmd/hytop/hytop
deleted
100755 → 0
View file @
34501708
File deleted
cmd/opsflow/backend/backend_test.go
View file @
1e4cd019
...
...
@@ -147,14 +147,21 @@ func TestFile(t *testing.T) {
if
err
!=
nil
{
t
.
Error
(
err
)
}
t
.
Logf
(
"%v"
,
st
.
Mode
()
.
Perm
()
&
0111
>
0
)
t
.
Logf
(
"%v"
,
st
.
Mode
()
.
Perm
()
&
0111
>
0
)
}
func
TestRcclTestCheck
(
t
*
testing
.
T
)
{
a
,
b
,
err
:=
AllReducePerf
(
"/home/panyq/wangx/rccl-tests/build-dan"
)
b
,
err
:=
AllReducePerf
(
"/home/panyq/wangx/rccl-tests/build-dan"
,
""
)
if
err
!=
nil
{
t
.
Error
(
err
)
}
t
.
Logf
(
"output: %s"
,
a
)
t
.
Logf
(
"result: %+v"
,
b
)
}
\ No newline at end of file
}
func
TestGetRcclDtk
(
t
*
testing
.
T
)
{
path
,
err
:=
GetRcclDtkPath
(
"/home/panyq/wangx/rccl-tests/build-dan"
,
RCCL_ALL_REDUCE
)
if
err
!=
nil
{
t
.
Error
(
err
)
}
t
.
Logf
(
"dtk path: %s"
,
path
)
}
cmd/opsflow/backend/main.go
View file @
1e4cd019
...
...
@@ -57,10 +57,17 @@ func parseWhoOutput(s string) ([]LoginUserInfo, error) {
return
result
,
nil
}
func
GetSysLoad
()
(
*
utils
.
SysInfo
,
error
)
{
return
utils
.
GetSysInfo
()
func
GetSysLoad
()
(
*
SysInfo
,
error
)
{
s
,
err
:=
utils
.
GetSysInfo
()
if
err
!=
nil
{
return
nil
,
err
}
ss
:=
SysInfo
(
*
s
)
return
&
ss
,
err
}
type
SysInfo
utils
.
SysInfo
type
DCULoad
struct
{
Name
string
`json:"name"`
Index
int
`json:"index"`
...
...
@@ -110,6 +117,6 @@ func GetDCULoad() ([]DCULoad, error) {
type
AllInfo
struct
{
DCUInfo
[]
DCULoad
`json:"dcuInfo"`
SysInfo
utils
.
SysInfo
`json:"sysInfo"`
SysInfo
SysInfo
`json:"sysInfo"`
OnlineUserInfo
[]
LoginUserInfo
`json:"loginUserInfo"`
}
cmd/opsflow/backend/rccl.go
View file @
1e4cd019
...
...
@@ -25,10 +25,12 @@ const (
)
type
RcclTestAllReducePrefResult
struct
{
Args
[]
string
`json:"args"`
// 执行参数
DTKPath
string
`json:"dtk_path"`
// dtk 库路径
Args
string
`json:"args"`
// 执行参数
TestVersion
string
`json:"test_version"`
// rccl-tests 版本信息
UseDevice
[]
string
`json:"use_device"`
// 使用的设备列表
Results
[]
*
RcclTestItem
`json:"results"`
RawOutput
string
`json:"raw_output,omitempty"`
}
type
RcclTestItem
struct
{
...
...
@@ -120,6 +122,7 @@ var (
ReMetricsLine
=
regexp
.
MustCompile
(
`(?mi)^\s*(\d+)\s+(\d+)\s+(\w+)\s+(\w+)\s+((?:-|)\d+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+(\d+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+(\d+)$`
)
ReDeviceLineFlag
=
regexp
.
MustCompile
(
`(?mi)^#\s*using\s+devices\s*$`
)
ReDeviceLine
=
regexp
.
MustCompile
(
`(?mi)^#\s*rank\s+(\d+)\s+pid\s+(\d+)\s+on\s+(\w+)\s+device\s+(\d+)\s+\[([0-9a-zA-Z:.]*)\]\s(\w+)$`
)
ReDtkPath
=
regexp
.
MustCompile
(
`^(.*dtk[0-9A-Za-z.-]*).*`
)
)
// RcclTestCheck 检查 rccl-tests 目录及指定二进制文件是否存在,返回二进制文件的完整路径
...
...
@@ -134,7 +137,7 @@ func RcclTestCheck(rccl_test_path string, binary RCCL_BINARY) (string, error) {
}
// 检查可执行文件是否存在
exePath
:=
strings
.
TrimSuffix
(
rccl_test_path
,
"/"
)
+
"/"
+
string
(
binary
)
exePath
:=
strings
.
TrimSuffix
(
rccl_test_path
,
"/"
)
+
"/"
+
string
(
binary
)
exeStat
,
err
:=
os
.
Stat
(
exePath
)
if
err
!=
nil
{
return
""
,
err
...
...
@@ -149,7 +152,7 @@ func RcclTestCheck(rccl_test_path string, binary RCCL_BINARY) (string, error) {
}
func
GetRcclDtkPath
(
rccl_test_path
string
,
binary
RCCL_BINARY
)
(
string
,
error
)
{
path
,
err
:=
RcclTestCheck
(
rccl_test_path
,
binary
)
path
,
err
:=
RcclTestCheck
(
rccl_test_path
,
binary
)
if
err
!=
nil
{
return
""
,
err
}
...
...
@@ -163,8 +166,11 @@ func GetRcclDtkPath(rccl_test_path string, binary RCCL_BINARY) (string, error) {
if
strings
.
Contains
(
v
,
"librccl.so"
)
{
parts
:=
strings
.
Fields
(
v
)
for
_
,
part
:=
range
parts
{
if
strings
.
Contains
(
part
,
"dtk"
)
{
return
part
,
nil
if
strings
.
Contains
(
part
,
"dtk"
)
&&
ReDtkPath
.
MatchString
(
part
)
{
m
:=
ReDtkPath
.
FindStringSubmatch
(
part
)
if
len
(
m
)
==
2
{
return
m
[
1
],
nil
}
}
}
}
...
...
@@ -172,24 +178,35 @@ func GetRcclDtkPath(rccl_test_path string, binary RCCL_BINARY) (string, error) {
return
""
,
errors
.
New
(
"librccl.so not found in ldd output"
)
}
func
AllReducePerf
(
rccl_test_path
string
,
args
...
string
)
(
string
,
*
RcclTestAllReducePrefResult
,
error
)
{
func
AllReducePerf
(
rccl_test_path
string
,
args
string
)
(
*
RcclTestAllReducePrefResult
,
error
)
{
path
,
err
:=
RcclTestCheck
(
rccl_test_path
,
RCCL_ALL_REDUCE
)
if
err
!=
nil
{
return
""
,
nil
,
err
return
nil
,
err
}
args
=
strings
.
Trim
(
args
,
" "
)
var
output
[]
byte
if
args
==
""
{
output
,
err
=
exec
.
Command
(
path
)
.
CombinedOutput
()
}
else
{
output
,
err
=
exec
.
Command
(
path
,
strings
.
Fields
(
args
)
...
)
.
CombinedOutput
()
}
output
,
err
:=
exec
.
Command
(
path
,
args
...
)
.
CombinedOutput
()
if
err
!=
nil
{
return
""
,
nil
,
err
return
nil
,
err
}
str
:=
string
(
output
)
res
:=
ParseRcclOutput
(
str
)
res
.
Args
=
args
return
str
,
res
,
nil
dp
,
err
:=
GetRcclDtkPath
(
rccl_test_path
,
RCCL_ALL_REDUCE
)
if
err
==
nil
{
res
.
DTKPath
=
dp
}
return
res
,
nil
}
func
ParseRcclOutput
(
output
string
)
*
RcclTestAllReducePrefResult
{
re
t
ult
:=
RcclTestAllReducePrefResult
{}
re
t
ult
.
UseDevice
=
make
([]
string
,
0
,
8
)
re
s
ult
:=
RcclTestAllReducePrefResult
{}
re
s
ult
.
UseDevice
=
make
([]
string
,
0
,
8
)
str
:=
strings
.
Trim
(
output
,
"
\n
"
)
lines
:=
strings
.
Split
(
str
,
"
\n
"
)
...
...
@@ -216,9 +233,9 @@ func ParseRcclOutput(output string) *RcclTestAllReducePrefResult {
continue
}
if
ReMetricsLine
.
MatchString
(
line
)
{
match
:=
ReMetricsLine
.
Find
All
StringSubmatch
(
line
,
-
1
)
if
len
(
match
)
>
0
&&
len
(
match
[
0
])
==
14
{
item
:=
NewRcclTestItem
(
match
[
0
][
1
:
])
match
:=
ReMetricsLine
.
FindStringSubmatch
(
line
)
if
len
(
match
)
==
14
{
item
:=
NewRcclTestItem
(
match
[
1
:
])
if
item
!=
nil
{
testItems
=
append
(
testItems
,
item
)
}
...
...
@@ -227,12 +244,12 @@ func ParseRcclOutput(output string) *RcclTestAllReducePrefResult {
cache
=
append
(
cache
,
line
)
}
}
re
t
ult
.
Results
=
testItems
re
s
ult
.
Results
=
testItems
for
_
,
v
:=
range
cache
{
if
ReRcclVersion
.
MatchString
(
v
)
{
match
:=
ReRcclVersion
.
Find
All
StringSubmatch
(
v
,
-
1
)
if
len
(
match
)
>
0
&&
len
(
match
[
0
])
==
2
{
re
t
ult
.
TestVersion
=
match
[
0
][
1
]
match
:=
ReRcclVersion
.
FindStringSubmatch
(
v
)
if
len
(
match
)
==
2
{
re
s
ult
.
TestVersion
=
match
[
1
]
}
break
}
...
...
@@ -249,14 +266,15 @@ func ParseRcclOutput(output string) *RcclTestAllReducePrefResult {
continue
}
if
findFlag
&&
ReDeviceLine
.
MatchString
(
line
)
{
match
:=
ReDeviceLine
.
Find
All
StringSubmatch
(
line
,
-
1
)
if
len
(
match
)
>
0
&&
len
(
match
[
0
])
==
7
{
re
t
ult
.
UseDevice
=
append
(
re
t
ult
.
UseDevice
,
match
[
0
][
6
])
match
:=
ReDeviceLine
.
FindStringSubmatch
(
line
)
if
len
(
match
)
==
7
{
re
s
ult
.
UseDevice
=
append
(
re
s
ult
.
UseDevice
,
match
[
6
])
}
}
else
if
findFlag
{
findFlag
=
false
break
}
}
return
&
retult
result
.
RawOutput
=
output
return
&
result
}
cmd/opsflow/docs/docs.go
0 → 100644
View file @
1e4cd019
// Package docs Code generated by swaggo/swag. DO NOT EDIT
package
docs
import
"github.com/swaggo/swag"
const
docTemplate
=
`{
"schemes": {{ marshal .Schemes }},
"swagger": "2.0",
"info": {
"description": "{{escape .Description}}",
"title": "{{.Title}}",
"contact": {},
"version": "{{.Version}}"
},
"host": "{{.Host}}",
"basePath": "{{.BasePath}}",
"paths": {
"/all": {
"get": {
"description": "获取所有信息(系统负载、DCU 负载、在线用户)",
"consumes": [
"application/json"
],
"produces": [
"application/json"
],
"summary": "获取所有信息(系统负载、DCU 负载、在线用户)",
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/web.RestfulResult-backend_AllInfo"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/web.RestfulNoDataResult"
}
}
}
}
},
"/dcuload": {
"get": {
"description": "获取 DCU 负载信息",
"consumes": [
"application/json"
],
"produces": [
"application/json"
],
"summary": "获取 DCU 负载信息",
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/web.RestfulListResult-backend_DCULoad"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/web.RestfulNoDataResult"
}
}
}
}
},
"/loginUser": {
"get": {
"description": "获取在线用户信息",
"consumes": [
"application/json"
],
"produces": [
"application/json"
],
"summary": "获取在线用户信息",
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/web.RestfulListResult-backend_LoginUserInfo"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/web.RestfulNoDataResult"
}
}
}
}
},
"/rccl/post": {
"post": {
"description": "给出rccl all_reduce_perf参数,执行单机测试",
"consumes": [
"application/json"
],
"produces": [
"application/json"
],
"summary": "给出rccl all_reduce_perf参数,执行单机测试",
"parameters": [
{
"description": "rccl all reduce perf args",
"name": "args",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/web.RcclArgs"
}
}
],
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/web.RestfulResult-backend_RcclTestAllReducePrefResult"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/web.RestfulNoDataResult"
}
}
}
}
},
"/rcclinfo": {
"get": {
"description": "获取 rccl all_reduce_perf 性能信息",
"consumes": [
"application/json"
],
"produces": [
"application/json"
],
"summary": "获取 rccl all_reduce_perf 性能信息",
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/web.RestfulResult-backend_RcclTestAllReducePrefResult"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/web.RestfulNoDataResult"
}
}
}
}
},
"/sysload": {
"get": {
"description": "获取系统负载信息",
"consumes": [
"application/json"
],
"produces": [
"application/json"
],
"summary": "获取系统负载信息",
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/web.RestfulResult-backend_SysInfo"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/web.RestfulNoDataResult"
}
}
}
}
}
},
"definitions": {
"backend.AllInfo": {
"type": "object",
"properties": {
"dcuInfo": {
"type": "array",
"items": {
"$ref": "#/definitions/backend.DCULoad"
}
},
"loginUserInfo": {
"type": "array",
"items": {
"$ref": "#/definitions/backend.LoginUserInfo"
}
},
"sysInfo": {
"$ref": "#/definitions/backend.SysInfo"
}
}
},
"backend.DCULoad": {
"type": "object",
"properties": {
"dcuUtilPercent": {
"type": "number"
},
"fan": {
"type": "string"
},
"index": {
"type": "integer"
},
"memTotal": {
"type": "integer"
},
"memUsed": {
"type": "integer"
},
"memUsedPercent": {
"type": "number"
},
"name": {
"type": "string"
},
"pwrAvg": {
"description": "单位是瓦",
"type": "number"
},
"pwrCap": {
"description": "单位是瓦",
"type": "number"
},
"temp": {
"description": "单位是摄氏度",
"type": "number"
}
}
},
"backend.LoginUserInfo": {
"type": "object",
"properties": {
"loginFrom": {
"description": "登录方式",
"type": "string"
},
"loginTime": {
"description": "登录时间",
"type": "string"
},
"name": {
"description": "用户名",
"type": "string"
},
"pid": {
"description": "登录的接管进程",
"type": "array",
"items": {
"type": "integer"
}
},
"tty": {
"description": "占用的终端",
"type": "string"
}
}
},
"backend.Metrics": {
"type": "object",
"properties": {
"alg_bw": {
"type": "number"
},
"bus_bw": {
"type": "number"
},
"time": {
"type": "number"
},
"wrong": {
"type": "integer"
}
}
},
"backend.RcclTestAllReducePrefResult": {
"type": "object",
"properties": {
"args": {
"description": "执行参数",
"type": "string"
},
"dtk_path": {
"description": "dtk 库路径",
"type": "string"
},
"raw_output": {
"type": "string"
},
"results": {
"type": "array",
"items": {
"$ref": "#/definitions/backend.RcclTestItem"
}
},
"test_version": {
"description": "rccl-tests 版本信息",
"type": "string"
},
"use_device": {
"description": "使用的设备列表",
"type": "array",
"items": {
"type": "string"
}
}
}
},
"backend.RcclTestItem": {
"type": "object",
"properties": {
"count": {
"type": "integer"
},
"in_place": {
"$ref": "#/definitions/backend.Metrics"
},
"out_of_place": {
"$ref": "#/definitions/backend.Metrics"
},
"redop": {
"type": "string"
},
"root": {
"type": "integer"
},
"size": {
"type": "integer"
},
"type": {
"type": "string"
}
}
},
"backend.SysInfo": {
"type": "object",
"properties": {
"cpuPercent": {
"description": "CPU使用率",
"type": "number"
},
"loadAverage1": {
"description": "1分钟内平均负载",
"type": "number"
},
"loadAverage15": {
"description": "15分钟平均负载",
"type": "number"
},
"loadAverage5": {
"description": "5分钟平均负载",
"type": "number"
},
"memTotal": {
"description": "总内存",
"type": "integer"
},
"memUsage": {
"description": "已使用内存",
"type": "integer"
},
"memUsagePercent": {
"description": "已使用内存百分比",
"type": "number"
},
"swapTotal": {
"description": "总swap",
"type": "integer"
},
"swapUsage": {
"description": "已使用swap",
"type": "integer"
},
"swapUsagePercent": {
"description": "已使用swap百分比",
"type": "number"
}
}
},
"web.RcclArgs": {
"type": "object",
"properties": {
"args": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"web.RestfulListResult-backend_DCULoad": {
"type": "object",
"properties": {
"code": {
"type": "integer"
},
"data": {
"type": "array",
"items": {
"$ref": "#/definitions/backend.DCULoad"
}
},
"msg": {
"type": "string"
}
}
},
"web.RestfulListResult-backend_LoginUserInfo": {
"type": "object",
"properties": {
"code": {
"type": "integer"
},
"data": {
"type": "array",
"items": {
"$ref": "#/definitions/backend.LoginUserInfo"
}
},
"msg": {
"type": "string"
}
}
},
"web.RestfulNoDataResult": {
"type": "object",
"properties": {
"code": {
"type": "integer"
},
"msg": {
"type": "string"
}
}
},
"web.RestfulResult-backend_AllInfo": {
"type": "object",
"properties": {
"code": {
"type": "integer"
},
"data": {
"$ref": "#/definitions/backend.AllInfo"
},
"msg": {
"type": "string"
}
}
},
"web.RestfulResult-backend_RcclTestAllReducePrefResult": {
"type": "object",
"properties": {
"code": {
"type": "integer"
},
"data": {
"$ref": "#/definitions/backend.RcclTestAllReducePrefResult"
},
"msg": {
"type": "string"
}
}
},
"web.RestfulResult-backend_SysInfo": {
"type": "object",
"properties": {
"code": {
"type": "integer"
},
"data": {
"$ref": "#/definitions/backend.SysInfo"
},
"msg": {
"type": "string"
}
}
}
}
}`
// SwaggerInfo holds exported Swagger Info so clients can modify it
var
SwaggerInfo
=
&
swag
.
Spec
{
Version
:
"1.0"
,
Host
:
""
,
BasePath
:
"/api/cmd"
,
Schemes
:
[]
string
{},
Title
:
"OpsFlow API"
,
Description
:
"这是opsflow节点命令在服务模式下的接口文档"
,
InfoInstanceName
:
"swagger"
,
SwaggerTemplate
:
docTemplate
,
LeftDelim
:
"{{"
,
RightDelim
:
"}}"
,
}
func
init
()
{
swag
.
Register
(
SwaggerInfo
.
InstanceName
(),
SwaggerInfo
)
}
cmd/opsflow/docs/swagger.json
0 → 100644
View file @
1e4cd019
{
"swagger"
:
"2.0"
,
"info"
:
{
"description"
:
"这是opsflow节点命令在服务模式下的接口文档"
,
"title"
:
"OpsFlow API"
,
"contact"
:
{},
"version"
:
"1.0"
},
"basePath"
:
"/api/cmd"
,
"paths"
:
{
"/all"
:
{
"get"
:
{
"description"
:
"获取所有信息(系统负载、DCU 负载、在线用户)"
,
"consumes"
:
[
"application/json"
],
"produces"
:
[
"application/json"
],
"summary"
:
"获取所有信息(系统负载、DCU 负载、在线用户)"
,
"responses"
:
{
"200"
:
{
"description"
:
"OK"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulResult-backend_AllInfo"
}
},
"500"
:
{
"description"
:
"Internal Server Error"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulNoDataResult"
}
}
}
}
},
"/dcuload"
:
{
"get"
:
{
"description"
:
"获取 DCU 负载信息"
,
"consumes"
:
[
"application/json"
],
"produces"
:
[
"application/json"
],
"summary"
:
"获取 DCU 负载信息"
,
"responses"
:
{
"200"
:
{
"description"
:
"OK"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulListResult-backend_DCULoad"
}
},
"500"
:
{
"description"
:
"Internal Server Error"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulNoDataResult"
}
}
}
}
},
"/loginUser"
:
{
"get"
:
{
"description"
:
"获取在线用户信息"
,
"consumes"
:
[
"application/json"
],
"produces"
:
[
"application/json"
],
"summary"
:
"获取在线用户信息"
,
"responses"
:
{
"200"
:
{
"description"
:
"OK"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulListResult-backend_LoginUserInfo"
}
},
"500"
:
{
"description"
:
"Internal Server Error"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulNoDataResult"
}
}
}
}
},
"/rccl/post"
:
{
"post"
:
{
"description"
:
"给出rccl all_reduce_perf参数,执行单机测试"
,
"consumes"
:
[
"application/json"
],
"produces"
:
[
"application/json"
],
"summary"
:
"给出rccl all_reduce_perf参数,执行单机测试"
,
"parameters"
:
[
{
"description"
:
"rccl all reduce perf args"
,
"name"
:
"args"
,
"in"
:
"body"
,
"required"
:
true
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RcclArgs"
}
}
],
"responses"
:
{
"200"
:
{
"description"
:
"OK"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulResult-backend_RcclTestAllReducePrefResult"
}
},
"500"
:
{
"description"
:
"Internal Server Error"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulNoDataResult"
}
}
}
}
},
"/rcclinfo"
:
{
"get"
:
{
"description"
:
"获取 rccl all_reduce_perf 性能信息"
,
"consumes"
:
[
"application/json"
],
"produces"
:
[
"application/json"
],
"summary"
:
"获取 rccl all_reduce_perf 性能信息"
,
"responses"
:
{
"200"
:
{
"description"
:
"OK"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulResult-backend_RcclTestAllReducePrefResult"
}
},
"500"
:
{
"description"
:
"Internal Server Error"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulNoDataResult"
}
}
}
}
},
"/sysload"
:
{
"get"
:
{
"description"
:
"获取系统负载信息"
,
"consumes"
:
[
"application/json"
],
"produces"
:
[
"application/json"
],
"summary"
:
"获取系统负载信息"
,
"responses"
:
{
"200"
:
{
"description"
:
"OK"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulResult-backend_SysInfo"
}
},
"500"
:
{
"description"
:
"Internal Server Error"
,
"schema"
:
{
"$ref"
:
"#/definitions/web.RestfulNoDataResult"
}
}
}
}
}
},
"definitions"
:
{
"backend.AllInfo"
:
{
"type"
:
"object"
,
"properties"
:
{
"dcuInfo"
:
{
"type"
:
"array"
,
"items"
:
{
"$ref"
:
"#/definitions/backend.DCULoad"
}
},
"loginUserInfo"
:
{
"type"
:
"array"
,
"items"
:
{
"$ref"
:
"#/definitions/backend.LoginUserInfo"
}
},
"sysInfo"
:
{
"$ref"
:
"#/definitions/backend.SysInfo"
}
}
},
"backend.DCULoad"
:
{
"type"
:
"object"
,
"properties"
:
{
"dcuUtilPercent"
:
{
"type"
:
"number"
},
"fan"
:
{
"type"
:
"string"
},
"index"
:
{
"type"
:
"integer"
},
"memTotal"
:
{
"type"
:
"integer"
},
"memUsed"
:
{
"type"
:
"integer"
},
"memUsedPercent"
:
{
"type"
:
"number"
},
"name"
:
{
"type"
:
"string"
},
"pwrAvg"
:
{
"description"
:
"单位是瓦"
,
"type"
:
"number"
},
"pwrCap"
:
{
"description"
:
"单位是瓦"
,
"type"
:
"number"
},
"temp"
:
{
"description"
:
"单位是摄氏度"
,
"type"
:
"number"
}
}
},
"backend.LoginUserInfo"
:
{
"type"
:
"object"
,
"properties"
:
{
"loginFrom"
:
{
"description"
:
"登录方式"
,
"type"
:
"string"
},
"loginTime"
:
{
"description"
:
"登录时间"
,
"type"
:
"string"
},
"name"
:
{
"description"
:
"用户名"
,
"type"
:
"string"
},
"pid"
:
{
"description"
:
"登录的接管进程"
,
"type"
:
"array"
,
"items"
:
{
"type"
:
"integer"
}
},
"tty"
:
{
"description"
:
"占用的终端"
,
"type"
:
"string"
}
}
},
"backend.Metrics"
:
{
"type"
:
"object"
,
"properties"
:
{
"alg_bw"
:
{
"type"
:
"number"
},
"bus_bw"
:
{
"type"
:
"number"
},
"time"
:
{
"type"
:
"number"
},
"wrong"
:
{
"type"
:
"integer"
}
}
},
"backend.RcclTestAllReducePrefResult"
:
{
"type"
:
"object"
,
"properties"
:
{
"args"
:
{
"description"
:
"执行参数"
,
"type"
:
"string"
},
"dtk_path"
:
{
"description"
:
"dtk 库路径"
,
"type"
:
"string"
},
"raw_output"
:
{
"type"
:
"string"
},
"results"
:
{
"type"
:
"array"
,
"items"
:
{
"$ref"
:
"#/definitions/backend.RcclTestItem"
}
},
"test_version"
:
{
"description"
:
"rccl-tests 版本信息"
,
"type"
:
"string"
},
"use_device"
:
{
"description"
:
"使用的设备列表"
,
"type"
:
"array"
,
"items"
:
{
"type"
:
"string"
}
}
}
},
"backend.RcclTestItem"
:
{
"type"
:
"object"
,
"properties"
:
{
"count"
:
{
"type"
:
"integer"
},
"in_place"
:
{
"$ref"
:
"#/definitions/backend.Metrics"
},
"out_of_place"
:
{
"$ref"
:
"#/definitions/backend.Metrics"
},
"redop"
:
{
"type"
:
"string"
},
"root"
:
{
"type"
:
"integer"
},
"size"
:
{
"type"
:
"integer"
},
"type"
:
{
"type"
:
"string"
}
}
},
"backend.SysInfo"
:
{
"type"
:
"object"
,
"properties"
:
{
"cpuPercent"
:
{
"description"
:
"CPU使用率"
,
"type"
:
"number"
},
"loadAverage1"
:
{
"description"
:
"1分钟内平均负载"
,
"type"
:
"number"
},
"loadAverage15"
:
{
"description"
:
"15分钟平均负载"
,
"type"
:
"number"
},
"loadAverage5"
:
{
"description"
:
"5分钟平均负载"
,
"type"
:
"number"
},
"memTotal"
:
{
"description"
:
"总内存"
,
"type"
:
"integer"
},
"memUsage"
:
{
"description"
:
"已使用内存"
,
"type"
:
"integer"
},
"memUsagePercent"
:
{
"description"
:
"已使用内存百分比"
,
"type"
:
"number"
},
"swapTotal"
:
{
"description"
:
"总swap"
,
"type"
:
"integer"
},
"swapUsage"
:
{
"description"
:
"已使用swap"
,
"type"
:
"integer"
},
"swapUsagePercent"
:
{
"description"
:
"已使用swap百分比"
,
"type"
:
"number"
}
}
},
"web.RcclArgs"
:
{
"type"
:
"object"
,
"properties"
:
{
"args"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"string"
}
}
}
},
"web.RestfulListResult-backend_DCULoad"
:
{
"type"
:
"object"
,
"properties"
:
{
"code"
:
{
"type"
:
"integer"
},
"data"
:
{
"type"
:
"array"
,
"items"
:
{
"$ref"
:
"#/definitions/backend.DCULoad"
}
},
"msg"
:
{
"type"
:
"string"
}
}
},
"web.RestfulListResult-backend_LoginUserInfo"
:
{
"type"
:
"object"
,
"properties"
:
{
"code"
:
{
"type"
:
"integer"
},
"data"
:
{
"type"
:
"array"
,
"items"
:
{
"$ref"
:
"#/definitions/backend.LoginUserInfo"
}
},
"msg"
:
{
"type"
:
"string"
}
}
},
"web.RestfulNoDataResult"
:
{
"type"
:
"object"
,
"properties"
:
{
"code"
:
{
"type"
:
"integer"
},
"msg"
:
{
"type"
:
"string"
}
}
},
"web.RestfulResult-backend_AllInfo"
:
{
"type"
:
"object"
,
"properties"
:
{
"code"
:
{
"type"
:
"integer"
},
"data"
:
{
"$ref"
:
"#/definitions/backend.AllInfo"
},
"msg"
:
{
"type"
:
"string"
}
}
},
"web.RestfulResult-backend_RcclTestAllReducePrefResult"
:
{
"type"
:
"object"
,
"properties"
:
{
"code"
:
{
"type"
:
"integer"
},
"data"
:
{
"$ref"
:
"#/definitions/backend.RcclTestAllReducePrefResult"
},
"msg"
:
{
"type"
:
"string"
}
}
},
"web.RestfulResult-backend_SysInfo"
:
{
"type"
:
"object"
,
"properties"
:
{
"code"
:
{
"type"
:
"integer"
},
"data"
:
{
"$ref"
:
"#/definitions/backend.SysInfo"
},
"msg"
:
{
"type"
:
"string"
}
}
}
}
}
\ No newline at end of file
cmd/opsflow/docs/swagger.yaml
0 → 100644
View file @
1e4cd019
basePath
:
/api/cmd
definitions
:
backend.AllInfo
:
properties
:
dcuInfo
:
items
:
$ref
:
'
#/definitions/backend.DCULoad'
type
:
array
loginUserInfo
:
items
:
$ref
:
'
#/definitions/backend.LoginUserInfo'
type
:
array
sysInfo
:
$ref
:
'
#/definitions/backend.SysInfo'
type
:
object
backend.DCULoad
:
properties
:
dcuUtilPercent
:
type
:
number
fan
:
type
:
string
index
:
type
:
integer
memTotal
:
type
:
integer
memUsed
:
type
:
integer
memUsedPercent
:
type
:
number
name
:
type
:
string
pwrAvg
:
description
:
单位是瓦
type
:
number
pwrCap
:
description
:
单位是瓦
type
:
number
temp
:
description
:
单位是摄氏度
type
:
number
type
:
object
backend.LoginUserInfo
:
properties
:
loginFrom
:
description
:
登录方式
type
:
string
loginTime
:
description
:
登录时间
type
:
string
name
:
description
:
用户名
type
:
string
pid
:
description
:
登录的接管进程
items
:
type
:
integer
type
:
array
tty
:
description
:
占用的终端
type
:
string
type
:
object
backend.Metrics
:
properties
:
alg_bw
:
type
:
number
bus_bw
:
type
:
number
time
:
type
:
number
wrong
:
type
:
integer
type
:
object
backend.RcclTestAllReducePrefResult
:
properties
:
args
:
description
:
执行参数
type
:
string
dtk_path
:
description
:
dtk 库路径
type
:
string
raw_output
:
type
:
string
results
:
items
:
$ref
:
'
#/definitions/backend.RcclTestItem'
type
:
array
test_version
:
description
:
rccl-tests 版本信息
type
:
string
use_device
:
description
:
使用的设备列表
items
:
type
:
string
type
:
array
type
:
object
backend.RcclTestItem
:
properties
:
count
:
type
:
integer
in_place
:
$ref
:
'
#/definitions/backend.Metrics'
out_of_place
:
$ref
:
'
#/definitions/backend.Metrics'
redop
:
type
:
string
root
:
type
:
integer
size
:
type
:
integer
type
:
type
:
string
type
:
object
backend.SysInfo
:
properties
:
cpuPercent
:
description
:
CPU使用率
type
:
number
loadAverage1
:
description
:
1分钟内平均负载
type
:
number
loadAverage5
:
description
:
5分钟平均负载
type
:
number
loadAverage15
:
description
:
15分钟平均负载
type
:
number
memTotal
:
description
:
总内存
type
:
integer
memUsage
:
description
:
已使用内存
type
:
integer
memUsagePercent
:
description
:
已使用内存百分比
type
:
number
swapTotal
:
description
:
总swap
type
:
integer
swapUsage
:
description
:
已使用swap
type
:
integer
swapUsagePercent
:
description
:
已使用swap百分比
type
:
number
type
:
object
web.RcclArgs
:
properties
:
args
:
items
:
type
:
string
type
:
array
type
:
object
web.RestfulListResult-backend_DCULoad
:
properties
:
code
:
type
:
integer
data
:
items
:
$ref
:
'
#/definitions/backend.DCULoad'
type
:
array
msg
:
type
:
string
type
:
object
web.RestfulListResult-backend_LoginUserInfo
:
properties
:
code
:
type
:
integer
data
:
items
:
$ref
:
'
#/definitions/backend.LoginUserInfo'
type
:
array
msg
:
type
:
string
type
:
object
web.RestfulNoDataResult
:
properties
:
code
:
type
:
integer
msg
:
type
:
string
type
:
object
web.RestfulResult-backend_AllInfo
:
properties
:
code
:
type
:
integer
data
:
$ref
:
'
#/definitions/backend.AllInfo'
msg
:
type
:
string
type
:
object
web.RestfulResult-backend_RcclTestAllReducePrefResult
:
properties
:
code
:
type
:
integer
data
:
$ref
:
'
#/definitions/backend.RcclTestAllReducePrefResult'
msg
:
type
:
string
type
:
object
web.RestfulResult-backend_SysInfo
:
properties
:
code
:
type
:
integer
data
:
$ref
:
'
#/definitions/backend.SysInfo'
msg
:
type
:
string
type
:
object
info
:
contact
:
{}
description
:
这是opsflow节点命令在服务模式下的接口文档
title
:
OpsFlow API
version
:
"
1.0"
paths
:
/all
:
get
:
consumes
:
-
application/json
description
:
获取所有信息(系统负载、DCU 负载、在线用户)
produces
:
-
application/json
responses
:
"
200"
:
description
:
OK
schema
:
$ref
:
'
#/definitions/web.RestfulResult-backend_AllInfo'
"
500"
:
description
:
Internal Server Error
schema
:
$ref
:
'
#/definitions/web.RestfulNoDataResult'
summary
:
获取所有信息(系统负载、DCU 负载、在线用户)
/dcuload
:
get
:
consumes
:
-
application/json
description
:
获取 DCU 负载信息
produces
:
-
application/json
responses
:
"
200"
:
description
:
OK
schema
:
$ref
:
'
#/definitions/web.RestfulListResult-backend_DCULoad'
"
500"
:
description
:
Internal Server Error
schema
:
$ref
:
'
#/definitions/web.RestfulNoDataResult'
summary
:
获取 DCU 负载信息
/loginUser
:
get
:
consumes
:
-
application/json
description
:
获取在线用户信息
produces
:
-
application/json
responses
:
"
200"
:
description
:
OK
schema
:
$ref
:
'
#/definitions/web.RestfulListResult-backend_LoginUserInfo'
"
500"
:
description
:
Internal Server Error
schema
:
$ref
:
'
#/definitions/web.RestfulNoDataResult'
summary
:
获取在线用户信息
/rccl/post
:
post
:
consumes
:
-
application/json
description
:
给出rccl all_reduce_perf参数,执行单机测试
parameters
:
-
description
:
rccl all reduce perf args
in
:
body
name
:
args
required
:
true
schema
:
$ref
:
'
#/definitions/web.RcclArgs'
produces
:
-
application/json
responses
:
"
200"
:
description
:
OK
schema
:
$ref
:
'
#/definitions/web.RestfulResult-backend_RcclTestAllReducePrefResult'
"
500"
:
description
:
Internal Server Error
schema
:
$ref
:
'
#/definitions/web.RestfulNoDataResult'
summary
:
给出rccl all_reduce_perf参数,执行单机测试
/rcclinfo
:
get
:
consumes
:
-
application/json
description
:
获取 rccl all_reduce_perf 性能信息
produces
:
-
application/json
responses
:
"
200"
:
description
:
OK
schema
:
$ref
:
'
#/definitions/web.RestfulResult-backend_RcclTestAllReducePrefResult'
"
500"
:
description
:
Internal Server Error
schema
:
$ref
:
'
#/definitions/web.RestfulNoDataResult'
summary
:
获取 rccl all_reduce_perf 性能信息
/sysload
:
get
:
consumes
:
-
application/json
description
:
获取系统负载信息
produces
:
-
application/json
responses
:
"
200"
:
description
:
OK
schema
:
$ref
:
'
#/definitions/web.RestfulResult-backend_SysInfo'
"
500"
:
description
:
Internal Server Error
schema
:
$ref
:
'
#/definitions/web.RestfulNoDataResult'
summary
:
获取系统负载信息
swagger
:
"
2.0"
cmd/opsflow/main.go
View file @
1e4cd019
...
...
@@ -7,6 +7,8 @@ import (
"get-container/utils"
"log"
"get-container/cmd/opsflow/docs"
"github.com/spf13/pflag"
"github.com/spf13/viper"
)
...
...
@@ -17,12 +19,20 @@ var (
flagCmd
=
pflag
.
StringP
(
"cmd"
,
"c"
,
"all"
,
"command to execute, sys/dcu/login/rccl/all"
)
flagHelp
=
pflag
.
BoolP
(
"help"
,
"h"
,
false
,
"show help message"
)
flagCfg
=
pflag
.
String
(
"config"
,
"./opsflow.yaml"
,
"path to config file"
)
_
=
pflag
.
BoolP
(
"debug"
,
"d"
,
false
,
"enable debug mode. If enabled, Swagger will be available"
)
)
// @title OpsFlow API
// @version 1.0
// @description 这是opsflow节点命令在服务模式下的接口文档
// @BasePath /api/cmd
func
main
()
{
docs
.
SwaggerInfo
.
Title
=
"OpsFlow API"
cfg
:=
viper
.
New
()
pflag
.
String
(
"rccl-test-path"
,
"/opt/rccl-tests/build"
,
"Path to rccl-tests"
)
pflag
.
String
Slice
(
"rccl-all-reduce-perf-args"
,
[]
string
{
"-b"
,
"8"
,
"-e"
,
"1G"
,
"-f"
,
"2"
,
"-g"
,
"8"
,
"-d"
,
"
half"
}
,
"Arguments for rccl all reduce perf"
)
pflag
.
String
(
"rccl-all-reduce-perf-args"
,
"-b 8 -e 1G -f 2 -g 8 -d
half"
,
"Arguments for rccl all reduce perf"
)
pflag
.
Parse
()
if
*
flagHelp
{
...
...
@@ -38,10 +48,12 @@ OPSFLOW_RCCL_ALL_REDUCE_PERF_ARGS: set rccl all reduce perf args`)
return
}
cfg
.
SetDefault
(
"rccl_all_reduce_perf_args"
,
[]
string
{
"-b"
,
"8"
,
"-e"
,
"1G"
,
"-f"
,
"2"
,
"-g"
,
"8"
,
"-d"
,
"half"
})
cfg
.
SetDefault
(
"debug_mode"
,
false
)
cfg
.
SetDefault
(
"rccl_all_reduce_perf_args"
,
"-b 8 -e 1G -f 2 -g 8 -d half"
)
cfg
.
SetDefault
(
"rccl_test_path"
,
"/opt/rccl-tests/build"
)
cfg
.
SetEnvPrefix
(
"OPSFLOW"
)
cfg
.
AutomaticEnv
()
cfg
.
BindPFlag
(
"debug_mode"
,
pflag
.
Lookup
(
"debug"
))
cfg
.
BindPFlag
(
"rccl_test_path"
,
pflag
.
Lookup
(
"rccl-test-path"
))
cfg
.
BindPFlag
(
"rccl_all_reduce_perf_args"
,
pflag
.
Lookup
(
"rccl-all-reduce-perf-args"
))
cfg
.
SetConfigType
(
"yaml"
)
...
...
@@ -69,7 +81,7 @@ OPSFLOW_RCCL_ALL_REDUCE_PERF_ARGS: set rccl all reduce perf args`)
case
"login"
:
PrintLoginInfo
()
case
"rccl"
:
PrintRcclInfo
(
cfg
.
GetString
(
"rccl_test_path"
),
cfg
.
GetString
Slice
(
"rccl_all_reduce_perf_args"
)
...
)
PrintRcclInfo
(
cfg
.
GetString
(
"rccl_test_path"
),
cfg
.
GetString
(
"rccl_all_reduce_perf_args"
))
case
"all"
:
PrintSysLoad
()
PrintDCUInfo
()
...
...
@@ -122,8 +134,8 @@ func PrintDCUInfo() {
fmt
.
Println
(
""
)
}
func
PrintRcclInfo
(
rccl_test_path
string
,
args
...
string
)
{
output
,
_
,
err
:=
backend
.
AllReducePerf
(
rccl_test_path
,
args
...
)
func
PrintRcclInfo
(
rccl_test_path
string
,
args
string
)
{
output
,
err
:=
backend
.
AllReducePerf
(
rccl_test_path
,
args
)
if
err
!=
nil
{
log
.
Fatalf
(
"failed to get rccl info: %v"
,
err
)
}
...
...
cmd/opsflow/opsflow.yaml
View file @
1e4cd019
rccl_test_path
:
/home/panyq/wangx/rccl-tests/build-dan
rccl_all_reduce_perf_args
:
[
"
-b"
,
"
8"
,
"
-e"
,
"
128M"
,
"
-f"
,
"
2"
,
"
-g"
,
"
8"
,
"
-d"
,
"
half"
]
rccl_all_reduce_perf_args
:
"
-b
8
-e
128M
-f
2
-g
8
-d
half"
debug_mode
:
false
cmd/opsflow/web/cmd.go
deleted
100644 → 0
View file @
34501708
package
web
cmd/opsflow/web/main.go
View file @
1e4cd019
...
...
@@ -2,11 +2,17 @@ package web
import
(
"get-container/cmd/opsflow/backend"
"strings"
"github.com/gin-gonic/gin"
"github.com/spf13/viper"
swaggerFiles
"github.com/swaggo/files"
ginSwagger
"github.com/swaggo/gin-swagger"
)
// swagger embed files
var
(
// globalCfg is the package-level viper configuration read by the HTTP handlers; Init assigns it.
globalCfg
*
viper
.
Viper
=
nil
)
...
...
@@ -15,21 +21,47 @@ func Init(cfg *viper.Viper) {
globalCfg
=
cfg
}
type
RestfulResult
struct
{
// RestfulResult is the generic JSON response envelope: Code and Msg describe
// the outcome and Data carries a payload of type T.
type
RestfulResult
[
T
any
]
struct
{
Code
int
`json:"code"`
Msg
string
`json:"msg"`
Data
T
`json:"data,omitempty"`
}
// RestfulNoDataResult is the non-generic JSON response envelope. ReturnGin and
// ReturnGinList use it for error responses, filling only Code and Msg.
type
RestfulNoDataResult
struct
{
Code
int
`json:"code"`
Msg
string
`json:"msg"`
Data
any
`json:"data,omitempty"`
}
// RestfulListResult is the JSON response envelope whose Data payload is a slice of T.
// Data is tagged omitempty, so a nil or empty slice is left out of the encoded JSON.
type
RestfulListResult
[
T
any
]
struct
{
Code
int
`json:"code"`
Msg
string
`json:"msg"`
Data
[]
T
`json:"data,omitempty"`
}
func
ReturnGin
(
ctx
*
gin
.
Context
,
data
any
,
err
error
)
{
if
err
!=
nil
{
ctx
.
JSON
(
500
,
RestfulResult
{
ctx
.
JSON
(
500
,
Restful
NoData
Result
{
Code
:
500
,
Msg
:
err
.
Error
(),
})
return
}
ctx
.
JSON
(
200
,
RestfulResult
{
ctx
.
JSON
(
200
,
RestfulResult
[
any
]{
Code
:
200
,
Msg
:
"ok"
,
Data
:
data
,
})
}
func
ReturnGinList
[
T
any
](
ctx
*
gin
.
Context
,
data
[]
T
,
err
error
)
{
if
err
!=
nil
{
ctx
.
JSON
(
500
,
RestfulNoDataResult
{
Code
:
500
,
Msg
:
err
.
Error
(),
})
return
}
ctx
.
JSON
(
200
,
RestfulListResult
[
T
]{
Code
:
200
,
Msg
:
"ok"
,
Data
:
data
,
...
...
@@ -38,41 +70,129 @@ func ReturnGin(ctx *gin.Context, data any, err error) {
func
WebServer
(
addr
string
)
error
{
engine
:=
gin
.
Default
()
if
globalCfg
.
GetBool
(
"debug_mode"
)
{
engine
.
GET
(
"/swagger/*any"
,
ginSwagger
.
WrapHandler
(
swaggerFiles
.
Handler
))
}
cmdGroup
:=
engine
.
Group
(
"/api/cmd"
)
cmdGroup
.
GET
(
"/all"
,
func
(
ctx
*
gin
.
Context
)
{
olu
,
err
:=
backend
.
GetOnlineUser
()
if
err
!=
nil
{
ReturnGin
(
ctx
,
nil
,
err
)
}
sys
,
err
:=
backend
.
GetSysLoad
()
if
err
!=
nil
{
ReturnGin
(
ctx
,
nil
,
err
)
}
dcu
,
err
:=
backend
.
GetDCULoad
()
if
err
!=
nil
{
ReturnGin
(
ctx
,
nil
,
err
)
}
ReturnGin
(
ctx
,
backend
.
AllInfo
{
DCUInfo
:
dcu
,
SysInfo
:
*
sys
,
OnlineUserInfo
:
olu
,
},
err
)
})
cmdGroup
.
GET
(
"/loginUser"
,
func
(
ctx
*
gin
.
Context
)
{
olu
,
err
:=
backend
.
GetOnlineUser
()
ReturnGin
(
ctx
,
olu
,
err
)
})
cmdGroup
.
GET
(
"/sysload"
,
func
(
ctx
*
gin
.
Context
)
{
sys
,
err
:=
backend
.
GetSysLoad
()
ReturnGin
(
ctx
,
sys
,
err
)
})
cmdGroup
.
GET
(
"/dcuload"
,
func
(
ctx
*
gin
.
Context
)
{
dcu
,
err
:=
backend
.
GetDCULoad
()
ReturnGin
(
ctx
,
dcu
,
err
)
})
cmdGroup
.
GET
(
"/rcclinfo"
,
func
(
ctx
*
gin
.
Context
)
{
_
,
r
,
err
:=
backend
.
AllReducePerf
(
globalCfg
.
GetString
(
"rccl_test_path"
),
globalCfg
.
GetStringSlice
(
"rccl_all_reduce_perf_args"
)
...
)
ReturnGin
(
ctx
,
r
,
err
)
})
cmdGroup
.
GET
(
"/all"
,
_controller
.
GetAllInfo
)
cmdGroup
.
GET
(
"/loginUser"
,
_controller
.
GetOnlineUser
)
cmdGroup
.
GET
(
"/sysload"
,
_controller
.
GetSysLoad
)
cmdGroup
.
GET
(
"/dcuload"
,
_controller
.
GetDCULoad
)
cmdGroup
.
GET
(
"/rcclinfo"
,
_controller
.
GetRcclHandler
)
cmdGroup
.
POST
(
"/rccl/post"
,
_controller
.
PostRcclHandler
)
return
engine
.
Run
(
addr
)
}
type
controller
struct
{}
var
_controller
=
controller
{}
// GetRcclHandler godoc
// @Summary 获取 rccl all_reduce_perf 性能信息
// @Description 获取 rccl all_reduce_perf 性能信息
// @Accept json
// @Produce json
// @Success 200 {object} RestfulResult[backend.RcclTestAllReducePrefResult]
// @Failure 500 {object} RestfulNoDataResult
// @Router /rcclinfo [get]
func
(
c
controller
)
GetRcclHandler
(
ctx
*
gin
.
Context
)
{
r
,
err
:=
backend
.
AllReducePerf
(
globalCfg
.
GetString
(
"rccl_test_path"
),
globalCfg
.
GetString
(
"rccl_all_reduce_perf_args"
))
ReturnGin
(
ctx
,
r
,
err
)
}
// GetDCULoad godoc
// @Summary 获取 DCU 负载信息
// @Description 获取 DCU 负载信息
// @Accept json
// @Produce json
// @Success 200 {object} RestfulListResult[backend.DCULoad]
// @Failure 500 {object} RestfulNoDataResult
// @Router /dcuload [get]
func
(
c
controller
)
GetDCULoad
(
ctx
*
gin
.
Context
)
{
dcu
,
err
:=
backend
.
GetDCULoad
()
ReturnGinList
(
ctx
,
dcu
,
err
)
}
// GetOnlineUser godoc
// @Summary 获取在线用户信息
// @Description 获取在线用户信息
// @Accept json
// @Produce json
// @Success 200 {object} RestfulListResult[backend.LoginUserInfo]
// @Failure 500 {object} RestfulNoDataResult
// @Router /loginUser [get]
func
(
c
controller
)
GetOnlineUser
(
ctx
*
gin
.
Context
)
{
olu
,
err
:=
backend
.
GetOnlineUser
()
ReturnGinList
(
ctx
,
olu
,
err
)
}
// GetSysLoad godoc
// @Summary 获取系统负载信息
// @Description 获取系统负载信息
// @Accept json
// @Produce json
// @Success 200 {object} RestfulResult[backend.SysInfo]
// @Failure 500 {object} RestfulNoDataResult
// @Router /sysload [get]
func
(
c
controller
)
GetSysLoad
(
ctx
*
gin
.
Context
)
{
sys
,
err
:=
backend
.
GetSysLoad
()
ReturnGin
(
ctx
,
sys
,
err
)
}
// GetAllInfo godoc
// @Summary 获取所有信息(系统负载、DCU 负载、在线用户)
// @Description 获取所有信息(系统负载、DCU 负载、在线用户)
// @Accept json
// @Produce json
// @Success 200 {object} RestfulResult[backend.AllInfo]
// @Failure 500 {object} RestfulNoDataResult
// @Router /all [get]
func
(
c
controller
)
GetAllInfo
(
ctx
*
gin
.
Context
)
{
olu
,
err
:=
backend
.
GetOnlineUser
()
if
err
!=
nil
{
ReturnGin
(
ctx
,
nil
,
err
)
}
sys
,
err
:=
backend
.
GetSysLoad
()
if
err
!=
nil
{
ReturnGin
(
ctx
,
nil
,
err
)
}
dcu
,
err
:=
backend
.
GetDCULoad
()
if
err
!=
nil
{
ReturnGin
(
ctx
,
nil
,
err
)
}
ReturnGin
(
ctx
,
backend
.
AllInfo
{
DCUInfo
:
dcu
,
SysInfo
:
*
sys
,
OnlineUserInfo
:
olu
,
},
err
)
}
// RcclArgs is the JSON request body bound by PostRcclHandler.
// Args holds the all_reduce_perf arguments as separate strings; the handler
// joins them with single spaces, and uses the configured
// "rccl_all_reduce_perf_args" value instead when the list is empty.
type
RcclArgs
struct
{
Args
[]
string
`json:"args"`
}
// PostRcclHandler godoc
// @Summary 给出rccl all_reduce_perf参数,执行单机测试
// @Description 给出rccl all_reduce_perf参数,执行单机测试
// @Accept json
// @Produce json
// @Param args body RcclArgs true "rccl all reduce perf args"
// @Success 200 {object} RestfulResult[backend.RcclTestAllReducePrefResult]
// @Failure 500 {object} RestfulNoDataResult
// @Router /rccl/post [post]
func
(
c
controller
)
PostRcclHandler
(
ctx
*
gin
.
Context
)
{
args
:=
RcclArgs
{}
err
:=
ctx
.
BindJSON
(
&
args
)
if
err
!=
nil
{
ReturnGin
(
ctx
,
nil
,
err
)
return
}
if
len
(
args
.
Args
)
==
0
{
r
,
err
:=
backend
.
AllReducePerf
(
globalCfg
.
GetString
(
"rccl_test_path"
),
globalCfg
.
GetString
(
"rccl_all_reduce_perf_args"
))
ReturnGin
(
ctx
,
r
,
err
)
return
}
arg
:=
strings
.
Join
(
args
.
Args
,
" "
)
r
,
err
:=
backend
.
AllReducePerf
(
globalCfg
.
GetString
(
"rccl_test_path"
),
arg
)
ReturnGin
(
ctx
,
r
,
err
)
}
go.mod
View file @
1e4cd019
...
...
@@ -15,22 +15,39 @@ require (
)
require (
github.com/KyleBanks/depth
v1.2.1 // indirect
github.com/PuerkitoBio/purell
v1.2.1 // indirect
github.com/PuerkitoBio/urlesc
v0.0.0-20170810143723-de5bf2ad4578 // indirect
github.com/bytedance/sonic
v1.14.0 // indirect
github.com/bytedance/sonic/loader
v0.3.0 // indirect
github.com/cloudwego/base64x
v0.1.6 // indirect
github.com/cpuguy83/go-md2man/v2
v2.0.7 // indirect
github.com/fsnotify/fsnotify
v1.9.0 // indirect
github.com/gabriel-vasile/mimetype
v1.4.8 // indirect
github.com/gin-contrib/sse
v1.1.0 // indirect
github.com/go-openapi/jsonpointer
v0.22.4 // indirect
github.com/go-openapi/jsonreference
v0.21.4 // indirect
github.com/go-openapi/spec
v0.22.2 // indirect
github.com/go-openapi/swag
v0.25.4 // indirect
github.com/go-openapi/swag/conv
v0.25.4 // indirect
github.com/go-openapi/swag/jsonname
v0.25.4 // indirect
github.com/go-openapi/swag/jsonutils
v0.25.4 // indirect
github.com/go-openapi/swag/loading
v0.25.4 // indirect
github.com/go-openapi/swag/stringutils
v0.25.4 // indirect
github.com/go-openapi/swag/typeutils
v0.25.4 // indirect
github.com/go-openapi/swag/yamlutils
v0.25.4 // indirect
github.com/go-playground/locales
v0.14.1 // indirect
github.com/go-playground/universal-translator
v0.18.1 // indirect
github.com/go-playground/validator/v10
v10.27.0 // indirect
github.com/go-viper/mapstructure/v2
v2.4.0 // indirect
github.com/goccy/go-json
v0.10.2 // indirect
github.com/goccy/go-yaml
v1.18.0 // indirect
github.com/josharian/intern
v1.0.0 // indirect
github.com/json-iterator/go
v1.1.12 // indirect
github.com/klauspost/cpuid/v2
v2.3.0 // indirect
github.com/leodido/go-urn
v1.4.0 // indirect
github.com/modern-go/concurrent
v0.0.0-20180228061459-e0a39a4cb421 // indirect
github.com/mailru/easyjson
v0.9.1 // indirect
github.com/modern-go/concurrent
v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2
v1.0.2 // indirect
github.com/muesli/clusters
v0.0.0-20200529215643-2700303c1762 // indirect
github.com/muesli/kmeans
v0.3.1 // indirect
...
...
@@ -38,24 +55,31 @@ require (
github.com/quic-go/qpack
v0.5.1 // indirect
github.com/quic-go/quic-go
v0.54.0 // indirect
github.com/ramya-rao-a/go-outline
v0.0.0-20210608161538-9736a4bde949 // indirect
github.com/russross/blackfriday/v2
v2.1.0 // indirect
github.com/sagikazarmark/locafero
v0.11.0 // indirect
github.com/shoenig/go-m1cpu
v0.1.6 // indirect
github.com/shurcooL/sanitized_anchor_name
v1.0.0 // indirect
github.com/sourcegraph/conc
v0.3.1-0.20240121214520-5f936abd7ae8 // indirect
github.com/spf13/afero
v1.15.0 // indirect
github.com/spf13/cast
v1.10.0 // indirect
github.com/subosito/gotenv
v1.6.0 // indirect
github.com/swaggo/swag
v1.16.6 // indirect
github.com/twitchyliquid64/golang-asm
v0.15.1 // indirect
github.com/ugorji/go/codec
v1.3.0 // indirect
github.com/xrash/smetrics
v0.0.0-20201216005158-039620a65673 // indirect
github.com/urfave/cli/v2
v2.27.7 // indirect
github.com/xrash/smetrics
v0.0.0-20250705151800-55b8f293f342 // indirect
go.uber.org/mock
v0.5.0 // indirect
go.yaml.in/yaml/v2
v2.4.3 // indirect
go.yaml.in/yaml/v3
v3.0.4 // indirect
golang.org/x/arch
v0.20.0 // indirect
golang.org/x/crypto
v0.4
0
.0 // indirect
golang.org/x/mod
v0.
26
.0 // indirect
golang.org/x/net
v0.4
2
.0 // indirect
golang.org/x/sync
v0.1
6
.0 // indirect
golang.org/x/tools
v0.
35
.0 // indirect
golang.org/x/crypto
v0.4
6
.0 // indirect
golang.org/x/mod
v0.
31
.0 // indirect
golang.org/x/net
v0.4
8
.0 // indirect
golang.org/x/sync
v0.1
9
.0 // indirect
golang.org/x/tools
v0.
40
.0 // indirect
google.golang.org/protobuf
v1.36.9 // indirect
gopkg.in/yaml.v2
v2.4.0 // indirect
sigs.k8s.io/yaml
v1.6.0 // indirect
)
require (
...
...
@@ -100,6 +124,8 @@ require (
github.com/power-devops/perfstat
v0.0.0-20240221224432-82ca36839d55 // indirect
github.com/rivo/uniseg
v0.4.7 // indirect
github.com/spf13/pflag
v1.0.10
github.com/swaggo/files
v1.0.1
github.com/swaggo/gin-swagger
v1.6.1
github.com/tklauser/go-sysconf
v0.3.15 // indirect
github.com/tklauser/numcpus
v0.10.0 // indirect
github.com/xo/terminfo
v0.0.0-20220910002029-abceb7e1c41e // indirect
...
...
@@ -109,6 +135,6 @@ require (
go.opentelemetry.io/otel
v1.35.0 // indirect
go.opentelemetry.io/otel/metric
v1.35.0 // indirect
go.opentelemetry.io/otel/trace
v1.35.0 // indirect
golang.org/x/sys
v0.3
6
.0 // indirect
golang.org/x/text
v0.2
8
.0 // indirect
golang.org/x/sys
v0.3
9
.0 // indirect
golang.org/x/text
v0.
3
2.0 // indirect
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment