Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
liming6
dcu-process-montor
Commits
34501708
Commit
34501708
authored
Dec 11, 2025
by
liming6
Browse files
feature 添加配置管理功能和rccl单测功能
parent
f70b0280
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
510 additions
and
60 deletions
+510
-60
.vscode/launch.json
.vscode/launch.json
+0
-14
cmd/hytop/hytop
cmd/hytop/hytop
+0
-0
cmd/hytop/lib/lib.go
cmd/hytop/lib/lib.go
+39
-39
cmd/opsflow/backend/backend_test.go
cmd/opsflow/backend/backend_test.go
+99
-0
cmd/opsflow/backend/rccl-test-output.log
cmd/opsflow/backend/rccl-test-output.log
+46
-0
cmd/opsflow/backend/rccl.go
cmd/opsflow/backend/rccl.go
+260
-0
cmd/opsflow/main.go
cmd/opsflow/main.go
+38
-3
cmd/opsflow/opsflow.yaml
cmd/opsflow/opsflow.yaml
+2
-0
cmd/opsflow/web/main.go
cmd/opsflow/web/main.go
+13
-0
go.mod
go.mod
+13
-4
No files found.
.vscode/launch.json
deleted
100644 → 0
View file @
f70b0280
{
"version"
:
"0.2.0"
,
"configurations"
:
[
{
"name"
:
"Attach to Delve Server"
,
"type"
:
"go"
,
"request"
:
"attach"
,
"mode"
:
"remote"
,
"host"
:
"127.0.0.1"
,
//
dlv服务器的IP地址(如果是本地就是
127.0
.
0.1
)
"port"
:
43000
,
//
dlv服务器监听的端口
"remotePath"
:
"/root/cache/dcu-process-montor/cmd/hytop"
,
//
**重要:远程机器上Go源代码的绝对路径**
}
]
}
\ No newline at end of file
cmd/hytop/hytop
0 → 100755
View file @
34501708
File added
cmd/hytop/lib/lib.go
View file @
34501708
...
@@ -141,16 +141,16 @@ type RSMIProcessInfoV2 struct {
...
@@ -141,16 +141,16 @@ type RSMIProcessInfoV2 struct {
GPUUsage
map
[
int
]
float32
// GPU usage rate as a percentage
GPUUsage
map
[
int
]
float32
// GPU usage rate as a percentage
}
}
func
(
pi2
*
RSMIProcessInfoV2
)
FromC
(
c
C
.
rsmi_process_info_v2_t
)
{
//
func (pi2 *RSMIProcessInfoV2) FromC(c C.rsmi_process_info_v2_t) {
pi2
.
Pid
=
uint32
(
c
.
processId
)
//
pi2.Pid = uint32(c.processId)
pi2
.
VramUsageSize
=
uint64
(
c
.
vramUsageSize
)
//
pi2.VramUsageSize = uint64(c.vramUsageSize)
pi2
.
VramUsageRate
=
float32
(
c
.
vramUsageRate
)
//
pi2.VramUsageRate = float32(c.vramUsageRate)
pi2
.
UsedGPUs
=
int
(
c
.
usedGpus
)
//
pi2.UsedGPUs = int(c.usedGpus)
pi2
.
GPUUsage
=
make
(
map
[
int
]
float32
)
//
pi2.GPUUsage = make(map[int]float32)
for
k
,
v
:=
range
c
.
gpuIndex
{
//
for k, v := range c.gpuIndex {
pi2
.
GPUUsage
[
int
(
v
)]
=
float32
(
c
.
gpuUsageRate
[
k
])
//
pi2.GPUUsage[int(v)] = float32(c.gpuUsageRate[k])
}
//
}
}
//
}
// RSMI_init 初始化rsmi
// RSMI_init 初始化rsmi
func
RSMI_init
()
error
{
func
RSMI_init
()
error
{
...
@@ -285,35 +285,35 @@ func RSMI_compute_process_info_get() ([]RSMIProcessInfo, error) {
...
@@ -285,35 +285,35 @@ func RSMI_compute_process_info_get() ([]RSMIProcessInfo, error) {
}
}
// RSMI_compute_process_info_by_pid_get_v2 获取进程的详细信息,注意:不是所有版本的so文件都支持该方法,可能导致进程崩溃
// RSMI_compute_process_info_by_pid_get_v2 获取进程的详细信息,注意:不是所有版本的so文件都支持该方法,可能导致进程崩溃
func
RSMI_compute_process_info_by_pid_get_v2
(
pid
uint32
)
(
info
*
RSMIProcessInfoV2
,
res
error
)
{
//
func RSMI_compute_process_info_by_pid_get_v2(pid uint32) (info *RSMIProcessInfoV2, res error) {
ps2
:=
(
*
C
.
rsmi_process_info_v2_t
)(
C
.
malloc
(
C
.
sizeof_rsmi_process_info_v2_t
))
//
ps2 := (*C.rsmi_process_info_v2_t)(C.malloc(C.sizeof_rsmi_process_info_v2_t))
if
unsafe
.
Pointer
(
ps2
)
!=
C
.
NULL
{
//
if unsafe.Pointer(ps2) != C.NULL {
defer
func
()
{
//
defer func() {
C
.
free
(
unsafe
.
Pointer
(
ps2
))
//
C.free(unsafe.Pointer(ps2))
}()
//
}()
}
else
{
//
} else {
info
=
nil
//
info = nil
res
=
ErrMallocError
//
res = ErrMallocError
return
//
return
}
//
}
defer
func
()
{
//
defer func() {
if
r
:=
recover
();
r
!=
nil
{
//
if r := recover(); r != nil {
info
=
nil
//
info = nil
res
=
ErrUnknowError
//
res = ErrUnknowError
}
//
}
}()
//
}()
r
:=
C
.
rsmi_compute_process_info_by_pid_get_v2
(
C
.
uint
(
pid
),
ps2
)
//
r := C.rsmi_compute_process_info_by_pid_get_v2(C.uint(pid), ps2)
if
res
!=
nil
{
//
if res != nil {
info
=
nil
//
info = nil
res
=
ToRSMIResult
(
r
)
//
res = ToRSMIResult(r)
return
//
return
}
//
}
result
:=
RSMIProcessInfoV2
{}
//
result := RSMIProcessInfoV2{}
result
.
FromC
(
*
ps2
)
//
result.FromC(*ps2)
info
=
&
result
//
info = &result
res
=
nil
//
res = nil
return
//
return
}
//
}
func
RSMI_dev_fan_rpms_get
(
devIndex
uint32
)
(
int64
,
error
)
{
func
RSMI_dev_fan_rpms_get
(
devIndex
uint32
)
(
int64
,
error
)
{
var
rpm
C
.
long
=
C
.
long
(
0
)
var
rpm
C
.
long
=
C
.
long
(
0
)
...
...
cmd/opsflow/backend/backend_test.go
View file @
34501708
...
@@ -2,6 +2,8 @@ package backend
...
@@ -2,6 +2,8 @@ package backend
import
(
import
(
"encoding/json"
"encoding/json"
"os"
"strings"
"testing"
"testing"
)
)
...
@@ -59,3 +61,100 @@ func TestGetDCULoad(t *testing.T) {
...
@@ -59,3 +61,100 @@ func TestGetDCULoad(t *testing.T) {
t
.
Logf
(
"%d"
,
len
(
result
))
t
.
Logf
(
"%d"
,
len
(
result
))
}
}
}
}
func
TestReMetricsLine
(
t
*
testing
.
T
)
{
content
,
err
:=
os
.
ReadFile
(
"./rccl-test-output.log"
)
if
err
!=
nil
{
t
.
Error
(
err
)
}
output
:=
string
(
content
)
output
=
strings
.
Trim
(
output
,
"
\n
"
)
for
_
,
v
:=
range
strings
.
Split
(
output
,
"
\n
"
)
{
if
ReMetricsLine
.
MatchString
(
v
)
{
t
.
Log
(
"match"
)
s
:=
ReMetricsLine
.
FindAllStringSubmatch
(
output
,
-
1
)
for
_
,
v
:=
range
s
{
for
i
,
u
:=
range
v
{
t
.
Logf
(
" %d: %s"
,
i
,
u
)
}
}
}
}
}
func
TestReRcclVersion
(
t
*
testing
.
T
)
{
content
,
err
:=
os
.
ReadFile
(
"./rccl-test-output.log"
)
if
err
!=
nil
{
t
.
Error
(
err
)
}
output
:=
string
(
content
)
output
=
strings
.
Trim
(
output
,
"
\n
"
)
if
ReRcclVersion
.
MatchString
(
output
)
{
t
.
Log
(
"match"
)
s
:=
ReRcclVersion
.
FindAllStringSubmatch
(
output
,
-
1
)
for
_
,
v
:=
range
s
{
for
i
,
u
:=
range
v
{
t
.
Logf
(
" %d: %s"
,
i
,
u
)
}
}
}
}
func
TestReDeviceLine
(
t
*
testing
.
T
)
{
content
,
err
:=
os
.
ReadFile
(
"./rccl-test-output.log"
)
if
err
!=
nil
{
t
.
Error
(
err
)
}
output
:=
string
(
content
)
output
=
strings
.
Trim
(
output
,
"
\n
"
)
lines
:=
strings
.
Split
(
output
,
"
\n
"
)
for
_
,
v
:=
range
lines
{
if
ReDeviceLine
.
MatchString
(
v
)
{
s
:=
ReDeviceLine
.
FindAllStringSubmatch
(
v
,
-
1
)
for
_
,
v
:=
range
s
{
for
i
,
u
:=
range
v
{
t
.
Logf
(
" %d: %s"
,
i
,
u
)
}
}
}
}
}
func
TestParseRcclOutput
(
t
*
testing
.
T
)
{
content
,
err
:=
os
.
ReadFile
(
"./rccl-test-output.log"
)
if
err
!=
nil
{
t
.
Error
(
err
)
}
output
:=
string
(
content
)
result
:=
ParseRcclOutput
(
output
)
if
result
==
nil
{
t
.
Error
(
"error parse output"
)
}
t
.
Logf
(
"result: %+v"
,
result
)
if
len
(
result
.
Results
)
>
0
{
for
_
,
v
:=
range
result
.
Results
{
if
v
==
nil
{
continue
}
t
.
Logf
(
" item: %+v"
,
v
)
}
}
}
// TestFile logs whether /usr/bin/bash has any execute permission bit set.
// It is skipped when the file cannot be inspected on the current machine.
func TestFile(t *testing.T) {
	st, err := os.Stat("/usr/bin/bash")
	if err != nil {
		// st is nil here; continuing would panic on st.Mode().
		t.Skipf("cannot stat /usr/bin/bash: %v", err)
	}
	t.Logf("%v", st.Mode().Perm()&0111 > 0)
}
func
TestRcclTestCheck
(
t
*
testing
.
T
)
{
a
,
b
,
err
:=
AllReducePerf
(
"/home/panyq/wangx/rccl-tests/build-dan"
)
if
err
!=
nil
{
t
.
Error
(
err
)
}
t
.
Logf
(
"output: %s"
,
a
)
t
.
Logf
(
"result: %+v"
,
b
)
}
\ No newline at end of file
cmd/opsflow/backend/rccl-test-output.log
0 → 100644
View file @
34501708
# nThread 1 nGpus 8 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
rccl-tests: Version develop:ae3e635
# Using devices
# Rank 0 Pid 2512460 on BW11 device 0 [0000:49:00.0] BW1000_H
# Rank 1 Pid 2512460 on BW11 device 1 [0000:54:00.0] BW1000_H
# Rank 2 Pid 2512460 on BW11 device 2 [0000:5e:00.0] BW1000_H
# Rank 3 Pid 2512460 on BW11 device 3 [0000:67:00.0] BW1000_H
# Rank 4 Pid 2512460 on BW11 device 4 [0000:9c:00.0] BW1000_H
# Rank 5 Pid 2512460 on BW11 device 5 [0000:bc:00.0] BW1000_H
# Rank 6 Pid 2512460 on BW11 device 6 [0000:cd:00.0] BW1000_H
# Rank 7 Pid 2512460 on BW11 device 7 [0000:dd:00.0] BW1000_H
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
Launch params (512, 1, 1) are larger than launch bounds (256) for kernel _ZN12_GLOBAL__N_113prepareInput2I6__halfNS_9ReduceSumEEEvPT_lT0_iiml please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
8 4 half sum -1 34.26 0.00 0.00 0 33.92 0.00 0.00 0
16 8 half sum -1 34.53 0.00 0.00 0 34.15 0.00 0.00 0
32 16 half sum -1 33.16 0.00 0.00 0 33.44 0.00 0.00 0
64 32 half sum -1 33.67 0.00 0.00 0 33.05 0.00 0.00 0
128 64 half sum -1 33.24 0.00 0.01 0 33.26 0.00 0.01 0
256 128 half sum -1 33.47 0.01 0.01 0 33.24 0.01 0.01 0
512 256 half sum -1 32.94 0.02 0.03 0 33.24 0.02 0.03 0
1024 512 half sum -1 33.89 0.03 0.05 0 34.67 0.03 0.05 0
2048 1024 half sum -1 33.44 0.06 0.11 0 32.78 0.06 0.11 0
4096 2048 half sum -1 34.63 0.12 0.21 0 34.65 0.12 0.21 0
8192 4096 half sum -1 40.74 0.20 0.35 0 40.98 0.20 0.35 0
16384 8192 half sum -1 51.19 0.32 0.56 0 51.43 0.32 0.56 0
32768 16384 half sum -1 51.48 0.64 1.11 0 51.58 0.64 1.11 0
65536 32768 half sum -1 51.63 1.27 2.22 0 51.52 1.27 2.23 0
131072 65536 half sum -1 62.81 2.09 3.65 0 62.74 2.09 3.66 0
262144 131072 half sum -1 89.14 2.94 5.15 0 89.05 2.94 5.15 0
524288 262144 half sum -1 75.89 6.91 12.09 0 75.71 6.92 12.12 0
1048576 524288 half sum -1 96.24 10.89 19.07 0 96.35 10.88 19.05 0
2097152 1048576 half sum -1 115.6 18.14 31.74 0 115.3 18.18 31.82 0
4194304 2097152 half sum -1 171.1 24.51 42.89 0 171.6 24.44 42.77 0
8388608 4194304 half sum -1 259.7 32.30 56.52 0 259.5 32.32 56.56 0
16777216 8388608 half sum -1 456.7 36.74 64.29 0 456.4 36.76 64.33 0
33554432 16777216 half sum -1 827.4 40.56 70.97 0 826.9 40.58 71.01 0
67108864 33554432 half sum -1 1592.4 42.14 73.75 0 1590.7 42.19 73.83 0
134217728 67108864 half sum -1 3097.2 43.33 75.84 0 3101.3 43.28 75.74 0
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth : 18.4264
#
cmd/opsflow/backend/rccl.go
View file @
34501708
package
backend
package
backend
import
(
"container/list"
"errors"
"os"
"os/exec"
"regexp"
"strconv"
"strings"
)
// RCCL_BINARY is the file name of one rccl-tests benchmark executable.
type RCCL_BINARY string

// File names of the rccl-tests executables, relative to the rccl-tests build
// directory passed to RcclTestCheck.
const (
	RCCL_ALL_GATHER     RCCL_BINARY = "all_gather_perf"
	RCCL_ALL_REDUCE     RCCL_BINARY = "all_reduce_perf"
	RCCL_ALL_TO_ALL     RCCL_BINARY = "alltoall_perf"
	RCCL_BROADCAST      RCCL_BINARY = "broadcast_perf"
	RCCL_GATHER         RCCL_BINARY = "gather_perf"
	RCCL_REDUCE         RCCL_BINARY = "reduce_perf"
	RCCL_REDUCE_SCATTER RCCL_BINARY = "reduce_scatter_perf"
	RCCL_SCATTER        RCCL_BINARY = "scatter_perf"

	// NOTE(review): upstream rccl-tests appears to name this binary
	// "sendrecv_perf" — confirm against the actual build directory.
	RCCL_SEND_RECV RCCL_BINARY = "send_recv_perf"

	// RCCL_BROADCASE is the original, misspelled name of RCCL_BROADCAST.
	//
	// Deprecated: use RCCL_BROADCAST instead.
	RCCL_BROADCASE = RCCL_BROADCAST
)
// RcclTestAllReducePrefResult is the structured form of one rccl-tests run,
// as built by ParseRcclOutput and completed by AllReducePerf.
type RcclTestAllReducePrefResult struct {
	Args        []string        `json:"args"`         // arguments the benchmark was executed with
	TestVersion string          `json:"test_version"` // rccl-tests version information
	UseDevice   []string        `json:"use_device"`   // list of devices used by the run
	Results     []*RcclTestItem `json:"results"`      // one entry per successfully parsed metrics row
}
// RcclTestItem is one parsed data row of an rccl-tests result table.
//
// The fields follow the column order of the slice accepted by
// NewRcclTestItem: size, count, type, redop, root, then the out-of-place and
// in-place metric groups.
type RcclTestItem struct {
	Size       uint64  `json:"size"`
	Count      uint64  `json:"count"`
	Type       string  `json:"type"`
	Redop      string  `json:"redop"`
	Root       int     `json:"root"`
	OutOfPlace Metrics `json:"out_of_place"`
	InPlace    Metrics `json:"in_place"`
}

// Metrics is one group of measurements of a result row: the time, algbw,
// busbw and #wrong columns, named as in the rccl-tests table header.
type Metrics struct {
	Time  float64 `json:"time"`
	AlgBW float64 `json:"alg_bw"`
	BusBW float64 `json:"bus_bw"`
	Wrong uint32  `json:"wrong"`
}

// NewRcclTestItem builds an RcclTestItem from the 13 textual columns of one
// result row, in this order:
//
//	size, count, type, redop, root,
//	out-of-place time, algbw, busbw, #wrong,
//	in-place time, algbw, busbw, #wrong
//
// It returns nil when str does not have exactly 13 elements or when any
// numeric column cannot be parsed.
func NewRcclTestItem(str []string) *RcclTestItem {
	if len(str) != 13 {
		return nil
	}

	size, err := strconv.ParseUint(str[0], 10, 64)
	if err != nil {
		return nil
	}
	count, err := strconv.ParseUint(str[1], 10, 64)
	if err != nil {
		return nil
	}
	// bitSize 0 means "must fit in int", which is the type of Root. The
	// previous bitSize of 10 rejected every root outside -512..511.
	root, err := strconv.ParseInt(str[4], 10, 0)
	if err != nil {
		return nil
	}
	outOfPlace, ok := parseMetrics(str[5:9])
	if !ok {
		return nil
	}
	inPlace, ok := parseMetrics(str[9:13])
	if !ok {
		return nil
	}

	return &RcclTestItem{
		Size:       size,
		Count:      count,
		Type:       str[2],
		Redop:      str[3],
		Root:       int(root),
		OutOfPlace: outOfPlace,
		InPlace:    inPlace,
	}
}

// parseMetrics converts four columns ordered time, algbw, busbw, #wrong into
// a Metrics value; ok is false when any column fails to parse.
func parseMetrics(fields []string) (m Metrics, ok bool) {
	if len(fields) != 4 {
		return Metrics{}, false
	}
	var err error
	if m.Time, err = strconv.ParseFloat(fields[0], 64); err != nil {
		return Metrics{}, false
	}
	if m.AlgBW, err = strconv.ParseFloat(fields[1], 64); err != nil {
		return Metrics{}, false
	}
	if m.BusBW, err = strconv.ParseFloat(fields[2], 64); err != nil {
		return Metrics{}, false
	}
	wrong, err := strconv.ParseUint(fields[3], 10, 32)
	if err != nil {
		return Metrics{}, false
	}
	m.Wrong = uint32(wrong)
	return m, true
}
// Regular expressions used by ParseRcclOutput to classify and dissect the
// lines of an rccl-tests run. All of them are applied to single lines.
var (
	// ReUselessLine matches a line consisting only of "#" and whitespace.
	ReUselessLine = regexp.MustCompile(`^#\s*$`)

	// ReSharpLine matches any line starting with "#".
	ReSharpLine = regexp.MustCompile(`^#.*$`)

	// ReRcclVersion captures the version string of the
	// "rccl-tests: Version <version>" line.
	ReRcclVersion = regexp.MustCompile(`(?mi)^rccl-tests:\s+Version\s+(.+)$`)

	// ReMetricsLine captures the 13 columns of a result row: size, count,
	// type, redop, root and two groups of time, algbw, busbw, #wrong.
	ReMetricsLine = regexp.MustCompile(`(?mi)^\s*(\d+)\s+(\d+)\s+(\w+)\s+(\w+)\s+((?:-|)\d+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+(\d+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+(\d+)$`)

	// ReDeviceLineFlag matches the "# Using devices" heading that precedes
	// the per-rank device lines.
	ReDeviceLineFlag = regexp.MustCompile(`(?mi)^#\s*using\s+devices\s*$`)

	// ReDeviceLine captures rank, pid, host, device index, bus id and device
	// name of a "# Rank N Pid P on HOST device D [BUS] NAME" line.
	//
	// The host is any run of non-space characters and the name is the rest
	// of the line, so hosts such as "node-01.local" and device names that
	// contain spaces or hyphens are accepted too. The number of capture
	// groups (6) is unchanged.
	ReDeviceLine = regexp.MustCompile(`(?mi)^#\s*rank\s+(\d+)\s+pid\s+(\d+)\s+on\s+(\S+)\s+device\s+(\d+)\s+\[([0-9a-zA-Z:.]*)\]\s+(\S.*?)\s*$`)
)
// RcclTestCheck 检查 rccl-tests 目录及指定二进制文件是否存在,返回二进制文件的完整路径
func
RcclTestCheck
(
rccl_test_path
string
,
binary
RCCL_BINARY
)
(
string
,
error
)
{
// 检查 rccl-tests 目录是否存在
stat
,
err
:=
os
.
Stat
(
rccl_test_path
)
if
err
!=
nil
{
return
""
,
err
}
if
!
stat
.
IsDir
()
{
return
""
,
os
.
ErrNotExist
}
// 检查可执行文件是否存在
exePath
:=
strings
.
TrimSuffix
(
rccl_test_path
,
"/"
)
+
"/"
+
string
(
binary
)
exeStat
,
err
:=
os
.
Stat
(
exePath
)
if
err
!=
nil
{
return
""
,
err
}
if
exeStat
.
IsDir
()
{
return
""
,
errors
.
New
(
"it's a dir, not file"
)
}
if
exeStat
.
Mode
()
.
Perm
()
&
0111
==
0
{
return
""
,
errors
.
New
(
"file is not executable"
)
}
return
exePath
,
nil
}
func
GetRcclDtkPath
(
rccl_test_path
string
,
binary
RCCL_BINARY
)
(
string
,
error
)
{
path
,
err
:=
RcclTestCheck
(
rccl_test_path
,
binary
)
if
err
!=
nil
{
return
""
,
err
}
output
,
err
:=
exec
.
Command
(
"ldd"
,
path
)
.
CombinedOutput
()
if
err
!=
nil
{
return
""
,
err
}
str
:=
string
(
output
)
lines
:=
strings
.
Split
(
str
,
"
\n
"
)
for
_
,
v
:=
range
lines
{
if
strings
.
Contains
(
v
,
"librccl.so"
)
{
parts
:=
strings
.
Fields
(
v
)
for
_
,
part
:=
range
parts
{
if
strings
.
Contains
(
part
,
"dtk"
)
{
return
part
,
nil
}
}
}
}
return
""
,
errors
.
New
(
"librccl.so not found in ldd output"
)
}
func
AllReducePerf
(
rccl_test_path
string
,
args
...
string
)
(
string
,
*
RcclTestAllReducePrefResult
,
error
)
{
path
,
err
:=
RcclTestCheck
(
rccl_test_path
,
RCCL_ALL_REDUCE
)
if
err
!=
nil
{
return
""
,
nil
,
err
}
output
,
err
:=
exec
.
Command
(
path
,
args
...
)
.
CombinedOutput
()
if
err
!=
nil
{
return
""
,
nil
,
err
}
str
:=
string
(
output
)
res
:=
ParseRcclOutput
(
str
)
res
.
Args
=
args
return
str
,
res
,
nil
}
func
ParseRcclOutput
(
output
string
)
*
RcclTestAllReducePrefResult
{
retult
:=
RcclTestAllReducePrefResult
{}
retult
.
UseDevice
=
make
([]
string
,
0
,
8
)
str
:=
strings
.
Trim
(
output
,
"
\n
"
)
lines
:=
strings
.
Split
(
str
,
"
\n
"
)
sharpLines
:=
list
.
New
()
nosharpLines
:=
list
.
New
()
testItems
:=
make
([]
*
RcclTestItem
,
0
,
16
)
for
_
,
v
:=
range
lines
{
if
ReUselessLine
.
MatchString
(
v
)
{
continue
}
if
ReSharpLine
.
MatchString
(
v
)
{
sharpLines
.
PushBack
(
v
)
}
else
{
nosharpLines
.
PushBack
(
v
)
}
}
cache
:=
make
([]
string
,
0
,
16
)
for
e
:=
nosharpLines
.
Front
();
e
!=
nil
;
e
=
e
.
Next
()
{
line
,
ok
:=
e
.
Value
.
(
string
)
if
!
ok
{
continue
}
if
ReMetricsLine
.
MatchString
(
line
)
{
match
:=
ReMetricsLine
.
FindAllStringSubmatch
(
line
,
-
1
)
if
len
(
match
)
>
0
&&
len
(
match
[
0
])
==
14
{
item
:=
NewRcclTestItem
(
match
[
0
][
1
:
])
if
item
!=
nil
{
testItems
=
append
(
testItems
,
item
)
}
}
}
else
{
cache
=
append
(
cache
,
line
)
}
}
retult
.
Results
=
testItems
for
_
,
v
:=
range
cache
{
if
ReRcclVersion
.
MatchString
(
v
)
{
match
:=
ReRcclVersion
.
FindAllStringSubmatch
(
v
,
-
1
)
if
len
(
match
)
>
0
&&
len
(
match
[
0
])
==
2
{
retult
.
TestVersion
=
match
[
0
][
1
]
}
break
}
}
findFlag
:=
false
for
e
:=
sharpLines
.
Front
();
e
!=
nil
;
e
=
e
.
Next
()
{
line
,
ok
:=
e
.
Value
.
(
string
)
if
!
ok
{
continue
}
if
ReDeviceLineFlag
.
MatchString
(
line
)
{
findFlag
=
true
continue
}
if
findFlag
&&
ReDeviceLine
.
MatchString
(
line
)
{
match
:=
ReDeviceLine
.
FindAllStringSubmatch
(
line
,
-
1
)
if
len
(
match
)
>
0
&&
len
(
match
[
0
])
==
7
{
retult
.
UseDevice
=
append
(
retult
.
UseDevice
,
match
[
0
][
6
])
}
}
else
if
findFlag
{
findFlag
=
false
break
}
}
return
&
retult
}
cmd/opsflow/main.go
View file @
34501708
...
@@ -8,16 +8,22 @@ import (
...
@@ -8,16 +8,22 @@ import (
"log"
"log"
"github.com/spf13/pflag"
"github.com/spf13/pflag"
"github.com/spf13/viper"
)
)
var
(
var
(
flagPort
=
pflag
.
Int16P
(
"port"
,
"p"
,
10880
,
"listen port for service"
)
flagPort
=
pflag
.
Int16P
(
"port"
,
"p"
,
10880
,
"listen port for service"
)
flagServer
=
pflag
.
BoolP
(
"server"
,
"s"
,
false
,
"run as server mode"
)
flagServer
=
pflag
.
BoolP
(
"server"
,
"s"
,
false
,
"run as server mode"
)
flagCmd
=
pflag
.
StringP
(
"cmd"
,
"c"
,
"all"
,
"command to execute, sys/dcu/login/all"
)
flagCmd
=
pflag
.
StringP
(
"cmd"
,
"c"
,
"all"
,
"command to execute, sys/dcu/login/
rccl/
all"
)
flagHelp
=
pflag
.
BoolP
(
"help"
,
"h"
,
false
,
"show help message"
)
flagHelp
=
pflag
.
BoolP
(
"help"
,
"h"
,
false
,
"show help message"
)
flagCfg
=
pflag
.
String
(
"config"
,
"./opsflow.yaml"
,
"path to config file"
)
)
)
func
main
()
{
func
main
()
{
cfg
:=
viper
.
New
()
pflag
.
String
(
"rccl-test-path"
,
"/opt/rccl-tests/build"
,
"Path to rccl-tests"
)
pflag
.
StringSlice
(
"rccl-all-reduce-perf-args"
,
[]
string
{
"-b"
,
"8"
,
"-e"
,
"1G"
,
"-f"
,
"2"
,
"-g"
,
"8"
,
"-d"
,
"half"
},
"Arguments for rccl all reduce perf"
)
pflag
.
Parse
()
pflag
.
Parse
()
if
*
flagHelp
{
if
*
flagHelp
{
fmt
.
Println
(
`this is opsflow command line tool.
fmt
.
Println
(
`this is opsflow command line tool.
...
@@ -26,17 +32,34 @@ Usage:
...
@@ -26,17 +32,34 @@ Usage:
Options:`
)
Options:`
)
pflag
.
PrintDefaults
()
pflag
.
PrintDefaults
()
fmt
.
Println
(
`Env Valiables:
OPSFLOW_RCCL_TEST_PATH: set rccl test path
OPSFLOW_RCCL_ALL_REDUCE_PERF_ARGS: set rccl all reduce perf args`
)
return
return
}
}
cfg
.
SetDefault
(
"rccl_all_reduce_perf_args"
,
[]
string
{
"-b"
,
"8"
,
"-e"
,
"1G"
,
"-f"
,
"2"
,
"-g"
,
"8"
,
"-d"
,
"half"
})
cfg
.
SetDefault
(
"rccl_test_path"
,
"/opt/rccl-tests/build"
)
cfg
.
SetEnvPrefix
(
"OPSFLOW"
)
cfg
.
AutomaticEnv
()
cfg
.
BindPFlag
(
"rccl_test_path"
,
pflag
.
Lookup
(
"rccl-test-path"
))
cfg
.
BindPFlag
(
"rccl_all_reduce_perf_args"
,
pflag
.
Lookup
(
"rccl-all-reduce-perf-args"
))
cfg
.
SetConfigType
(
"yaml"
)
if
flagCfg
!=
nil
{
cfg
.
SetConfigFile
(
*
flagCfg
)
}
cfg
.
ReadInConfig
()
backend
.
Init
()
backend
.
Init
()
defer
backend
.
Shutdown
()
defer
backend
.
Shutdown
()
if
*
flagServer
{
if
*
flagServer
{
log
.
Println
(
"start opsflow server mode"
)
log
.
Println
(
"start opsflow server mode"
)
web
.
Init
(
cfg
)
err
:=
web
.
WebServer
(
fmt
.
Sprintf
(
":%d"
,
*
flagPort
))
err
:=
web
.
WebServer
(
fmt
.
Sprintf
(
":%d"
,
*
flagPort
))
if
err
!=
nil
{
if
err
!=
nil
{
log
.
Fatalf
(
"failed to start web server: %v"
,
err
)
log
.
Fatalf
(
"failed to start web server: %v"
,
err
)
}
}
return
}
}
switch
*
flagCmd
{
switch
*
flagCmd
{
case
"sys"
:
case
"sys"
:
...
@@ -45,6 +68,8 @@ Options:`)
...
@@ -45,6 +68,8 @@ Options:`)
PrintDCUInfo
()
PrintDCUInfo
()
case
"login"
:
case
"login"
:
PrintLoginInfo
()
PrintLoginInfo
()
case
"rccl"
:
PrintRcclInfo
(
cfg
.
GetString
(
"rccl_test_path"
),
cfg
.
GetStringSlice
(
"rccl_all_reduce_perf_args"
)
...
)
case
"all"
:
case
"all"
:
PrintSysLoad
()
PrintSysLoad
()
PrintDCUInfo
()
PrintDCUInfo
()
...
@@ -92,7 +117,17 @@ func PrintDCUInfo() {
...
@@ -92,7 +117,17 @@ func PrintDCUInfo() {
for
_
,
dcu
:=
range
dcus
{
for
_
,
dcu
:=
range
dcus
{
memTotal
:=
utils
.
MemorySize
{
Unit
:
utils
.
Byte
,
Num
:
dcu
.
MemTotal
}
memTotal
:=
utils
.
MemorySize
{
Unit
:
utils
.
Byte
,
Num
:
dcu
.
MemTotal
}
memUsed
:=
utils
.
MemorySize
{
Unit
:
utils
.
Byte
,
Num
:
dcu
.
MemUsed
}
memUsed
:=
utils
.
MemorySize
{
Unit
:
utils
.
Byte
,
Num
:
dcu
.
MemUsed
}
fmt
.
Printf
(
"DCU index: %d
\n
Fan speed: %s
\n
Temperature: %.2f%%
\n
Power Capture: %.2fw
\n
Power Capture: %.2fw
\n
VRAM total: %s
\n
VRAM used: %s
\n
DCUUtils: %.2f%%
\n
"
,
dcu
.
Index
,
dcu
.
Fan
,
dcu
.
Temp
,
dcu
.
PwrCap
,
dcu
.
PwrAvg
,
memTotal
.
HumanReadStr
(
1
),
memUsed
.
HumanReadStr
(
1
),
dcu
.
DCUUTil
)
fmt
.
Printf
(
"DCU index: %d Fan speed: %s Temperature: %.2f°C Power Capture: %.2fw Power Capture: %.2fw VRAM total: %s VRAM used: %s DCUUtils: %.2f%%
\n
"
,
dcu
.
Index
,
dcu
.
Fan
,
dcu
.
Temp
,
dcu
.
PwrCap
,
dcu
.
PwrAvg
,
memTotal
.
HumanReadStr
(
1
),
memUsed
.
HumanReadStr
(
1
),
dcu
.
DCUUTil
)
}
fmt
.
Println
(
""
)
}
func
PrintRcclInfo
(
rccl_test_path
string
,
args
...
string
)
{
output
,
_
,
err
:=
backend
.
AllReducePerf
(
rccl_test_path
,
args
...
)
if
err
!=
nil
{
log
.
Fatalf
(
"failed to get rccl info: %v"
,
err
)
}
}
fmt
.
Println
(
"============== rccl all reduce perf ================="
)
fmt
.
Println
(
output
)
fmt
.
Println
(
""
)
fmt
.
Println
(
""
)
}
}
cmd/opsflow/opsflow.yaml
0 → 100644
View file @
34501708
rccl_test_path
:
/home/panyq/wangx/rccl-tests/build-dan
rccl_all_reduce_perf_args
:
[
"
-b"
,
"
8"
,
"
-e"
,
"
128M"
,
"
-f"
,
"
2"
,
"
-g"
,
"
8"
,
"
-d"
,
"
half"
]
cmd/opsflow/web/main.go
View file @
34501708
...
@@ -4,8 +4,17 @@ import (
...
@@ -4,8 +4,17 @@ import (
"get-container/cmd/opsflow/backend"
"get-container/cmd/opsflow/backend"
"github.com/gin-gonic/gin"
"github.com/gin-gonic/gin"
"github.com/spf13/viper"
)
)
var (
	// globalCfg is the configuration handed to Init. It is nil until Init
	// has been called; the /rcclinfo handler reads "rccl_test_path" and
	// "rccl_all_reduce_perf_args" from it.
	globalCfg *viper.Viper = nil
)

// Init stores cfg as the package-wide configuration.
// NOTE(review): the /rcclinfo handler calls methods on globalCfg without a
// nil check, so Init presumably has to run before WebServer — confirm.
func Init(cfg *viper.Viper) {
	globalCfg = cfg
}
type
RestfulResult
struct
{
type
RestfulResult
struct
{
Code
int
`json:"code"`
Code
int
`json:"code"`
Msg
string
`json:"msg"`
Msg
string
`json:"msg"`
...
@@ -61,5 +70,9 @@ func WebServer(addr string) error {
...
@@ -61,5 +70,9 @@ func WebServer(addr string) error {
dcu
,
err
:=
backend
.
GetDCULoad
()
dcu
,
err
:=
backend
.
GetDCULoad
()
ReturnGin
(
ctx
,
dcu
,
err
)
ReturnGin
(
ctx
,
dcu
,
err
)
})
})
cmdGroup
.
GET
(
"/rcclinfo"
,
func
(
ctx
*
gin
.
Context
)
{
_
,
r
,
err
:=
backend
.
AllReducePerf
(
globalCfg
.
GetString
(
"rccl_test_path"
),
globalCfg
.
GetStringSlice
(
"rccl_all_reduce_perf_args"
)
...
)
ReturnGin
(
ctx
,
r
,
err
)
})
return
engine
.
Run
(
addr
)
return
engine
.
Run
(
addr
)
}
}
go.mod
View file @
34501708
...
@@ -6,23 +6,25 @@ require (
...
@@ -6,23 +6,25 @@ require (
github.com/charmbracelet/bubbletea
v1.3.10
github.com/charmbracelet/bubbletea
v1.3.10
github.com/charmbracelet/lipgloss
v1.1.0
github.com/charmbracelet/lipgloss
v1.1.0
github.com/emirpasic/gods/v2
v2.0.0-alpha
github.com/emirpasic/gods/v2
v2.0.0-alpha
github.com/google/uuid
v1.6.0
github.com/lrstanley/bubblezone
v0.0.0-20240914071701-b48c55a5e78e
github.com/lrstanley/bubblezone
v0.0.0-20240914071701-b48c55a5e78e
github.com/moby/moby/api
v1.52.0-beta.2
github.com/moby/moby/api
v1.52.0-beta.2
github.com/moby/moby/client
v0.1.0-beta.2
github.com/moby/moby/client
v0.1.0-beta.2
github.com/shirou/gopsutil/v3
v3.24.5
github.com/shirou/gopsutil/v3
v3.24.5
github.com/shirou/gopsutil/v4
v4.25.9
github.com/shirou/gopsutil/v4
v4.25.9
github.com/spf13/viper
v1.21.0
)
)
require (
require (
github.com/bytedance/sonic
v1.14.0 // indirect
github.com/bytedance/sonic
v1.14.0 // indirect
github.com/bytedance/sonic/loader
v0.3.0 // indirect
github.com/bytedance/sonic/loader
v0.3.0 // indirect
github.com/cloudwego/base64x
v0.1.6 // indirect
github.com/cloudwego/base64x
v0.1.6 // indirect
github.com/fsnotify/fsnotify
v1.9.0 // indirect
github.com/gabriel-vasile/mimetype
v1.4.8 // indirect
github.com/gabriel-vasile/mimetype
v1.4.8 // indirect
github.com/gin-contrib/sse
v1.1.0 // indirect
github.com/gin-contrib/sse
v1.1.0 // indirect
github.com/go-playground/locales
v0.14.1 // indirect
github.com/go-playground/locales
v0.14.1 // indirect
github.com/go-playground/universal-translator
v0.18.1 // indirect
github.com/go-playground/universal-translator
v0.18.1 // indirect
github.com/go-playground/validator/v10
v10.27.0 // indirect
github.com/go-playground/validator/v10
v10.27.0 // indirect
github.com/go-viper/mapstructure/v2
v2.4.0 // indirect
github.com/goccy/go-json
v0.10.2 // indirect
github.com/goccy/go-json
v0.10.2 // indirect
github.com/goccy/go-yaml
v1.18.0 // indirect
github.com/goccy/go-yaml
v1.18.0 // indirect
github.com/json-iterator/go
v1.1.12 // indirect
github.com/json-iterator/go
v1.1.12 // indirect
...
@@ -35,17 +37,24 @@ require (
...
@@ -35,17 +37,24 @@ require (
github.com/pelletier/go-toml/v2
v2.2.4 // indirect
github.com/pelletier/go-toml/v2
v2.2.4 // indirect
github.com/quic-go/qpack
v0.5.1 // indirect
github.com/quic-go/qpack
v0.5.1 // indirect
github.com/quic-go/quic-go
v0.54.0 // indirect
github.com/quic-go/quic-go
v0.54.0 // indirect
github.com/ramya-rao-a/go-outline
v0.0.0-20210608161538-9736a4bde949 // indirect
github.com/sagikazarmark/locafero
v0.11.0 // indirect
github.com/shoenig/go-m1cpu
v0.1.6 // indirect
github.com/shoenig/go-m1cpu
v0.1.6 // indirect
github.com/sourcegraph/conc
v0.3.1-0.20240121214520-5f936abd7ae8 // indirect
github.com/spf13/afero
v1.15.0 // indirect
github.com/spf13/cast
v1.10.0 // indirect
github.com/subosito/gotenv
v1.6.0 // indirect
github.com/twitchyliquid64/golang-asm
v0.15.1 // indirect
github.com/twitchyliquid64/golang-asm
v0.15.1 // indirect
github.com/ugorji/go/codec
v1.3.0 // indirect
github.com/ugorji/go/codec
v1.3.0 // indirect
github.com/xrash/smetrics
v0.0.0-20201216005158-039620a65673 // indirect
github.com/xrash/smetrics
v0.0.0-20201216005158-039620a65673 // indirect
go.uber.org/mock
v0.5.0 // indirect
go.uber.org/mock
v0.5.0 // indirect
go.yaml.in/yaml/v3
v3.0.4 // indirect
golang.org/x/arch
v0.20.0 // indirect
golang.org/x/arch
v0.20.0 // indirect
golang.org/x/crypto
v0.40.0 // indirect
golang.org/x/crypto
v0.40.0 // indirect
golang.org/x/mod
v0.2
5
.0 // indirect
golang.org/x/mod
v0.2
6
.0 // indirect
golang.org/x/net
v0.42.0 // indirect
golang.org/x/net
v0.42.0 // indirect
golang.org/x/sync
v0.16.0 // indirect
golang.org/x/sync
v0.16.0 // indirect
golang.org/x/tools
v0.3
4
.0 // indirect
golang.org/x/tools
v0.3
5
.0 // indirect
google.golang.org/protobuf
v1.36.9 // indirect
google.golang.org/protobuf
v1.36.9 // indirect
)
)
...
@@ -101,5 +110,5 @@ require (
...
@@ -101,5 +110,5 @@ require (
go.opentelemetry.io/otel/metric
v1.35.0 // indirect
go.opentelemetry.io/otel/metric
v1.35.0 // indirect
go.opentelemetry.io/otel/trace
v1.35.0 // indirect
go.opentelemetry.io/otel/trace
v1.35.0 // indirect
golang.org/x/sys
v0.36.0 // indirect
golang.org/x/sys
v0.36.0 // indirect
golang.org/x/text
v0.2
7
.0 // indirect
golang.org/x/text
v0.2
8
.0 // indirect
)
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment