Commit 7923c563 authored by liming6's avatar liming6
Browse files

init commit

parents
Pipeline #3024 failed with stages
in 0 seconds
.idea
go.sum
# Readme
这是一个类似Prometheus的监控工具,目前的想法是采集服务器上使用GPU/DCU的进程信息
主要包含两部分:
- 探针:部署在每个需要被监控的服务器上,暴露服务器中使用GPU/DCU进程的相关信息
- 收集器:收集各个探针的数据,并记录下来,且支持查询
写这个工具的目的是希望能快速找到哪些节点的GPU/DCU是空的,没有被使用的。
nvidia-dcgm-exporter和dcu-exporter收集的是一些数字指标信息,不能直观的体现GPU/DCU是否被占用
***本工具的另一个目的是希望能追查到进程的真实用户,以便统计每个人的GPU使用率***
## 关键组件选型
数据库
- prometheus/tsdb
序列化方法
- json
- CBOR
## 如何确定谁使用了显卡、加速卡
编写本工具的一个目的是想了解是哪些Linux系统用户在使用显卡
对于曙光环境,使用显卡的方式主要有两种
- docker容器
- 主机进程使用
对于docker容器,我们认为谁创建了容器,那么该容器中的进程如果使用了显卡,就认为是容器的创建者使用了显卡
对于主机进程,那就是进程的用户使用了显卡
复杂情况:
- sudo转换用户执行
- su转换用户
思路或方法:
- docker
- 使用audit审计docker命令、docker.sock文件
- 使用空壳脚本记录
- 代理和转发docker.sock文件
- 使用falco审计工具
- 监听/var/run/docker.sock
- 主机进程
- 直接审计使用显卡设备的进程
### docker监控
audit方案
```shell
auditctl -w /usr/bin/docker -p x -k docker-cmd
auditctl -w /var/run/docker.sock -p rwxa -k docker-daemon
ausearch -k docker-daemon | aureport -f -i
```
空壳shell方案: 编写一个空壳脚本替代原本的docker命令,在执行实际docker命令前记录相关信息,以下是空壳脚本的内容
```shell
#!/bin/bash
# Transparent wrapper around a renamed real command (e.g. docker -> docker-real).
# It appends one record per invocation (caller, arguments, parent process,
# environment and the command's stdout) to /var/log/<name>-call.log, then exits
# with the real command's exit status.
#
# NOTE: stdout is piped through tee, so the real command does not see a TTY on
# stdout; only stdout (not stderr) is captured in the log.
REAL_SUFFIX='-real'
BASE_NAME=$(basename "$0")
REAL_CMD="${BASE_NAME}${REAL_SUFFIX}"
LOG_FILE="/var/log/${BASE_NAME}-call.log"
# Create the log file world-writable so that every user can append to it.
[[ -f "$LOG_FILE" ]] || (touch "$LOG_FILE" && chmod 666 "$LOG_FILE")
# Record the invocation details.
{
    TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S")
    echo "=== [${TIMESTAMP}] ==="
    echo "Command : $0"
    echo "All args : $*"
    echo "PID : $$"
    echo "Parent PID: $PPID"
    PARENT_INFO=$(ps -fp "$PPID")
    echo "Parent Info:"
    echo "$PARENT_INFO"
    echo "--- Environment ---"
    env
    echo "--- Environment ---"
    echo "----- Output -----"
} >> "$LOG_FILE"
# tee -a appends; plain tee would truncate the log and destroy the header
# written above together with all earlier records.
/usr/bin/"${REAL_CMD}" "$@" | tee -a "$LOG_FILE"
# Remember the real command's status (not tee's) so callers see it unchanged.
STATUS=${PIPESTATUS[0]}
{
    echo "----- Output -----"
    echo ""
} >> "$LOG_FILE"
exit "$STATUS"
```
falco方案与audit的方案类似,本质上都是在内核层面捕获系统调用事件(falco基于eBPF或内核模块,audit基于内核审计子系统),这里省略
### 针对sudo和su的方法
- 记录进程树
- 检查进程信息 /proc/\<PID\>/{status,sessionid,stat,environ}
- 使用who或last命令
- audit审计
- 使用pam会话记录
- acct记账
# Todo
待办清单:
- 解析/etc/passwd文件,解析Linux系统用户和家目录
- 解析docker容器相关信息,尝试找出启动容器的用户
- 对于占用tty的命令,查询tty用户
- 对于非交互式的命令,???
package main
import (
"log"
"os"
"os/exec"
)
/**
这个工具是对docker命令的一个包装,用于记录docker创建、删除容器的动作,并记录执行的用户
*/
// runWithoutAction runs "docker" with the given arguments and no extra
// bookkeeping. The child shares this process's stdin, stdout and stderr, and
// the error from running it (if any) is returned unchanged.
func runWithoutAction(args []string) error {
	// A nil or empty args slice simply yields a bare "docker" invocation.
	docker := exec.Command("docker", args...)
	docker.Stdin, docker.Stdout, docker.Stderr = os.Stdin, os.Stdout, os.Stderr
	return docker.Run()
}
// main forwards the invocation to the real docker command and exits with the
// same status as docker, so the wrapper is transparent to callers.
//
// The "run", "create" and "rm" sub-commands are the ones that are meant to be
// audited. Previously their switch cases were empty, which (Go has no implicit
// fallthrough) meant those commands were silently dropped; they are now always
// forwarded.
func main() {
	args := os.Args[1:]
	if len(args) > 0 {
		switch args[0] {
		case "run", "create", "rm":
			// TODO: record the invoking user, the time and the working
			// directory here before forwarding the command.
		}
	}
	if err := runWithoutAction(args); err != nil {
		// docker already reported its own failure on stderr; just propagate
		// its exit code instead of replacing it with 1.
		if exitErr, ok := err.(*exec.ExitError); ok {
			os.Exit(exitErr.ExitCode())
		}
		// The command could not be started at all (e.g. docker not in PATH).
		log.Fatal(err)
	}
}
/*
用户
时间
当前文件夹
*/
package types
// ProbeResult is the result produced by a probe.
// NOTE(review): all three fields are plain strings; their encoding is not
// defined in this file — confirm against the producer.
type ProbeResult struct {
NvidiaGPUS string
DCUS string
NodeInfo string
}
package docker
import (
"get-container/utils"
"context"
"errors"
"fmt"
"github.com/moby/moby/api/types/container"
"github.com/moby/moby/client"
"os"
"regexp"
"strings"
"sync"
"time"
)
/**
There are two ways to find out which container a process belongs to:
1. by looking up its PID namespace
2. by looking up the process's cgroup
*/
// FindCIDMethod selects the strategy used to map a PID to a container ID.
type FindCIDMethod string
const (
ByCgroup FindCIDMethod = "byCGroup" // resolve via /proc/<pid>/cgroup
ByPidNS FindCIDMethod = "byPidNS" // resolve via the PID namespace
)
var (
// ReDocker captures the run of [0-9a-z] characters that follows "docker-" or
// "docker/" in a cgroup line; capture group 1 is used as the container ID.
ReDocker = regexp.MustCompile(`^.*docker[-/]([0-9a-z]*)(?:|.*)`)
// ContainerInfo is the package-level cache of container data. It stays nil
// until initContainerInfo succeeds.
ContainerInfo *ContainersInfo = nil
)
// ContainersInfo caches the inspect and list results of containers, keyed by
// container ID.
type ContainersInfo struct {
lock sync.RWMutex // read/write lock guarding against concurrent writes to the cached data
time time.Time // time at which the cached data was last written
inspectInfo map[string]container.InspectResponse
listInfo map[string]container.Summary
}
// Update replaces the cached container data with a fresh snapshot obtained
// from getContainerInfo and records the refresh time. On error the cache is
// left untouched and the error is returned.
func (info *ContainersInfo) Update() error {
	// Query Docker before taking the write lock so that readers are not
	// blocked for the duration of the (potentially slow) API calls.
	inspects, summaries, err := getContainerInfo()
	if err != nil {
		return err
	}
	info.lock.Lock()
	defer info.lock.Unlock()
	info.inspectInfo = inspects
	info.listInfo = summaries
	info.time = time.Now()
	return nil
}
// Get returns the cached inspect results together with a sync.Locker whose
// read lock is already held. The caller must call Unlock on the returned
// locker once it has finished reading the map.
func (info *ContainersInfo) Get() (map[string]container.InspectResponse, sync.Locker) {
	info.lock.RLock()
	return info.inspectInfo, info.lock.RLocker()
}
// init tries to populate ContainerInfo when the package is loaded. The error
// is discarded on purpose: ContainerInfo then stays nil, and the namespace
// based lookups in this file call initContainerInfo again when they find it nil.
func init() {
_ = initContainerInfo()
}
// initContainerInfo fetches the current container data and stores it in the
// package-level ContainerInfo cache. ContainerInfo is only assigned when the
// fetch succeeds; otherwise the fetch error is returned.
func initContainerInfo() error {
	inspects, summaries, err := getContainerInfo()
	if err != nil {
		return err
	}
	cache := &ContainersInfo{lock: sync.RWMutex{}}
	cache.time = time.Now()
	cache.inspectInfo, cache.listInfo = inspects, summaries
	ContainerInfo = cache
	return nil
}
// FindContainerIdByPid returns the ID of the docker container that the process
// pid belongs to, using the requested lookup method. A nil ID means no
// container was found. An unknown method yields an error.
func FindContainerIdByPid(pid uint64, method FindCIDMethod) (*string, error) {
	if method == ByPidNS {
		return findContainerIdByNS(pid)
	}
	if method == ByCgroup {
		return findContainerIdByCgroup(pid)
	}
	return nil, fmt.Errorf("unknown method: %s", method)
}
// FindContainerIdByPidBatch resolves container IDs for several PIDs at once
// using the requested lookup method and returns them keyed by PID. An empty
// or nil pids slice yields (nil, nil); an unknown method yields an error.
func FindContainerIdByPidBatch(pids []uint64, method FindCIDMethod) (map[uint64]string, error) {
	// len of a nil slice is 0, so this also covers nil.
	if len(pids) == 0 {
		return nil, nil
	}
	if method == ByPidNS {
		return findContainerIdByNSBatch(pids)
	}
	if method == ByCgroup {
		return findContainerIdByCgroupBatch(pids)
	}
	return nil, fmt.Errorf("unknown method: %s", method)
}
// findContainerIdByCgroup looks up the docker container ID of a process from
// /proc/<pid>/cgroup.
//
// When the file has several lines, the first line containing "pids" is used;
// a single-line file is used as is. The container ID is capture group 1 of
// ReDocker. An error is returned when the file cannot be read, is empty, has
// no pids line, or does not carry a (non-empty) docker container ID.
func findContainerIdByCgroup(pid uint64) (*string, error) {
	content, err := os.ReadFile(fmt.Sprintf("/proc/%d/cgroup", pid))
	if err != nil {
		return nil, err
	}
	contentStr := strings.Trim(string(content), "\n")
	if contentStr == "" {
		return nil, errors.New("process's cgroup not found")
	}

	target := strings.TrimSpace(contentStr)
	if lines := strings.Split(contentStr, "\n"); len(lines) > 1 {
		// Several lines: pick the one that mentions pids.
		target = ""
		for _, line := range lines {
			if strings.Contains(line, "pids") {
				target = strings.TrimSpace(line)
				break
			}
		}
		if target == "" {
			return nil, errors.New("process's cgroup not found pids line")
		}
	}

	if !strings.Contains(target, "docker") {
		return nil, errors.New("process's cgroup is not create by docker")
	}
	// A single FindStringSubmatch call replaces the former MatchString +
	// FindStringSubmatch pair. The capture group may match the empty string
	// (e.g. "docker/" with nothing after it); that is not a usable ID.
	fields := ReDocker.FindStringSubmatch(target)
	if len(fields) < 2 || fields[1] == "" {
		return nil, errors.New("process's cgroup is not create by docker")
	}
	cid := fields[1]
	return &cid, nil
}
// findContainerIdByCgroupBatch resolves container IDs for a batch of PIDs via
// their cgroup files and returns the matches keyed by PID.
//
// PIDs that cannot be resolved (the process has exited, or it does not run in
// a docker container) are left out of the result instead of failing the whole
// batch, mirroring findContainerIdByNSBatch. The returned error is always nil.
func findContainerIdByCgroupBatch(pids []uint64) (map[uint64]string, error) {
	results := make(map[uint64]string, len(pids))
	for _, pid := range pids {
		cid, err := findContainerIdByCgroup(pid)
		if err != nil || cid == nil {
			continue
		}
		results[pid] = *cid
	}
	return results, nil
}
// findContainerIdByNS looks up the docker container ID of a process through
// its PID namespace. The container cache is (re)loaded on every call; then the
// PID namespace of each container's State.Pid is compared with that of pid.
// It returns (nil, nil) when no container shares the namespace.
func findContainerIdByNS(pid uint64) (*string, error) {
	wantNS, err := utils.GetPidNS(pid)
	if err != nil {
		return nil, err
	}

	// Create the cache on first use, refresh it otherwise.
	var loadErr error
	if ContainerInfo != nil {
		loadErr = ContainerInfo.Update()
	} else {
		loadErr = initContainerInfo()
	}
	if loadErr != nil {
		return nil, loadErr
	}

	inspects, unlocker := ContainerInfo.Get()
	defer unlocker.Unlock()
	for id, detail := range inspects {
		// Containers whose namespace cannot be read are skipped.
		if ns, nsErr := utils.GetPidNS(uint64(detail.State.Pid)); nsErr == nil && ns == wantNS {
			found := id
			return &found, nil
		}
	}
	return nil, nil
}
// findContainerIdByNSBatch resolves container IDs for a batch of PIDs through
// their PID namespaces and returns the matches keyed by PID.
//
// The container cache is (re)loaded first; a failure to do so is returned as
// an error. Containers or PIDs whose namespace cannot be read are skipped, so
// a single container exiting during the refresh no longer fails the whole
// batch (this matches the single-PID findContainerIdByNS).
func findContainerIdByNSBatch(pids []uint64) (map[uint64]string, error) {
	if ContainerInfo == nil {
		if err := initContainerInfo(); err != nil {
			return nil, err
		}
	} else if err := ContainerInfo.Update(); err != nil {
		return nil, err
	}

	info, lock := ContainerInfo.Get()
	defer lock.Unlock()

	// Map every container's PID namespace to its container ID.
	ns2cid := make(map[uint64]string, len(info))
	for cid, detail := range info {
		containerNs, err := utils.GetPidNS(uint64(detail.State.Pid))
		if err != nil {
			continue
		}
		ns2cid[containerNs] = cid
	}

	results := make(map[uint64]string, len(pids))
	for _, pid := range pids {
		ns, err := utils.GetPidNS(pid)
		if err != nil {
			continue
		}
		if cid, ok := ns2cid[ns]; ok {
			results[pid] = cid
		}
	}
	return results, nil
}
// getContainerInfo lists the containers reported by Docker with All=false and
// inspects each of them. It returns the inspect results and the list
// summaries, both keyed by container ID. Any client, list or inspect error
// aborts the call.
func getContainerInfo() (map[string]container.InspectResponse, map[string]container.Summary, error) {
	cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
	if err != nil {
		return nil, nil, err
	}
	// The close error carries no useful information for the caller.
	defer func() { _ = cli.Close() }()

	ctx := context.Background()
	summaries, err := cli.ContainerList(ctx, client.ContainerListOptions{All: false})
	if err != nil {
		return nil, nil, err
	}

	byIDInspect := map[string]container.InspectResponse{}
	byIDSummary := map[string]container.Summary{}
	for _, summary := range summaries {
		detail, inspectErr := cli.ContainerInspect(ctx, summary.ID)
		if inspectErr != nil {
			return nil, nil, inspectErr
		}
		byIDInspect[summary.ID] = detail
		byIDSummary[summary.ID] = summary
	}
	return byIDInspect, byIDSummary, nil
}
package docker
import (
"context"
"github.com/moby/moby/client"
"strings"
"testing"
)
// TestRegexp checks that ReDocker extracts the container ID from sample
// /proc/<pid>/cgroup contents (cgroup v1 with systemd scopes, cgroup v2, and
// cgroup v1 with the cgroupfs driver). Line selection mirrors the lookup
// code: the pids line for multi-line input, the only line otherwise.
func TestRegexp(t *testing.T) {
	cases := []struct {
		cgroup string
		want   string
	}{
		{
			cgroup: `13:pids:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope
12:freezer:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope
11:rdma:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope
10:net_cls,net_prio:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope
9:cpuset:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope
8:hugetlb:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope
7:memory:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope
6:perf_event:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope
5:devices:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope
4:ioasids:/
3:blkio:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope
2:cpu,cpuacct:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope
1:name=systemd:/system.slice/docker-ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389.scope`,
			want: "ce3d84d6b51029f2eb89a6ab15ccdd9756add41032945a75b58b974aeb365389",
		},
		{
			cgroup: "0::/system.slice/docker-e6369ea11c46057bcd05cb15be33014e0220e0319bb0ca15a71b295f33025798.scope",
			want:   "e6369ea11c46057bcd05cb15be33014e0220e0319bb0ca15a71b295f33025798",
		},
		{
			cgroup: `12:memory:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b
11:devices:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b
10:hugetlb:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b
9:net_cls,net_prio:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b
8:perf_event:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b
7:cpuset:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b
6:rdma:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b
5:pids:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b
4:freezer:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b
3:blkio:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b
2:cpu,cpuacct:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b
1:name=systemd:/docker/eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b`,
			want: "eb30ba674bdffab5c7165ec4c3d69cd9ebb0805670be91b4d080eaf818192d7b",
		},
	}

	for i, tc := range cases {
		lines := strings.Split(tc.cgroup, "\n")
		target := lines[0]
		if len(lines) > 1 {
			target = ""
			for _, line := range lines {
				if strings.Contains(line, "pids") {
					target = line
					break
				}
			}
		}
		if target == "" {
			t.Errorf("case %d: no pids line found", i)
			continue
		}
		// A non-matching regex must fail the test instead of being skipped.
		fields := ReDocker.FindStringSubmatch(target)
		if len(fields) < 2 {
			t.Errorf("case %d: ReDocker did not match %q", i, target)
			continue
		}
		t.Logf("fields: %v", fields)
		if fields[1] != tc.want {
			t.Errorf("case %d: input %q: got container id %q, want %q", i, target, fields[1], tc.want)
		}
	}
}
// TestDocker lists all containers (running or not) through the Docker API and
// logs them. It needs access to a Docker daemon configured via the
// environment.
func TestDocker(t *testing.T) {
	cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
	if err != nil {
		// Stop here: cli is unusable and the deferred Close below must not
		// run on it.
		t.Fatal(err)
	}
	defer func() {
		_ = cli.Close()
	}()
	listOpt := client.ContainerListOptions{All: true}
	//filter := make(client.Filters)
	//filter.Add()
	//listOpt.Filters = filter
	containers, err := cli.ContainerList(context.Background(), listOpt)
	if err != nil {
		t.Fatal(err)
	}
	for _, c := range containers {
		t.Logf("%+v\n", c)
	}
}
module get-container
go 1.24.2
require (
github.com/moby/moby/api v1.52.0-beta.2
github.com/moby/moby/client v0.1.0-beta.2
github.com/shirou/gopsutil/v4 v4.25.9
)
require (
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
github.com/containerd/errdefs v1.0.0 // indirect
github.com/containerd/errdefs/pkg v0.3.0 // indirect
github.com/distribution/reference v0.6.0 // indirect
github.com/docker/go-connections v0.6.0 // indirect
github.com/docker/go-units v0.5.0 // indirect
github.com/ebitengine/purego v0.9.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-ole/go-ole v1.2.6 // indirect
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
github.com/moby/docker-image-spec v1.3.1 // indirect
github.com/moby/term v0.5.2 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.1 // indirect
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
github.com/tklauser/go-sysconf v0.3.15 // indirect
github.com/tklauser/numcpus v0.10.0 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
go.opentelemetry.io/otel v1.35.0 // indirect
go.opentelemetry.io/otel/metric v1.35.0 // indirect
go.opentelemetry.io/otel/trace v1.35.0 // indirect
golang.org/x/sys v0.35.0 // indirect
)
package gpu
import (
"encoding/json"
"get-container/utils"
"os/exec"
"regexp"
"strconv"
"strings"
)
// HYVersionInfo holds version strings of the hy-smi tool chain.
// NOTE(review): nothing in this file fills it yet — confirm the intended source.
type HYVersionInfo struct {
SMIVersion string
LibVersion string
DriverVersion string
}
// DCUInfo is an empty placeholder type; it has no fields yet.
type DCUInfo struct {
}
// DCUPidInfo is one per-process record parsed from `hy-smi --showpids`.
type DCUPidInfo struct {
Pid uint64 // "PID"
PASId uint64 // "PASID"
HCUNode []string // "HCU Node(Include CPU sort)"
HCUIndex []string // "HCU Index"
GPUID []string // "GPUID"
PCIBus []string // "PCI BUS"
VRamUsed utils.MemorySize // "VRAM USED(MiB)"
VRamUsedPercent int // "VRAM USED(%)"; -1 when the tool reports nothing or "inf"
SDMAUsed int // "SDMA USED"
}
var (
// ReEmptyLine matches lines that are empty or contain only whitespace.
ReEmptyLine = regexp.MustCompile(`^\s*$`)
// ReUselessLine matches banner/separator lines that start and end with "="
// and otherwise contain only spaces, "=", letters and digits.
ReUselessLine = regexp.MustCompile(`^=[ =a-zA-Z0-9]*=$`)
)
// Field names (the text before ":") of the per-process records printed by
// `hy-smi --showpids`.
const (
PIDHeader = "PID"
PASIDHeader = "PASID"
HCUNodeHeader = "HCU Node(Include CPU sort)"
HCUIndexHeader = "HCU Index"
GPUIDHeader = "GPUID"
PCIBusHeader = "PCI BUS"
VRamUsedHeader = "VRAM USED(MiB)"
VRamUsedPercentHeader = "VRAM USED(%)"
SDMAUsedHeader = "SDMA USED"
)
// GetDCUPidInfo runs `hy-smi --showpids` and returns the parsed per-process
// records. A failure to run the command is returned as is.
func GetDCUPidInfo() ([]DCUPidInfo, error) {
	raw, err := exec.Command("hy-smi", "--showpids").Output()
	if err == nil {
		return parseDCUPidInfo(string(raw))
	}
	return nil, err
}
// parseDCUPidInfo parses the text printed by `hy-smi --showpids`.
//
// Blank lines and "====" banner lines are ignored, as are lines without a
// ":" separator. A "PID" line starts a new record; the known field lines that
// follow are attached to it. Field lines that appear before the first "PID"
// line are ignored (they used to index a non-existent record and panic).
//
// PID, PASID, VRAM USED(MiB) and SDMA USED must be valid integers, otherwise
// the strconv error is returned. VRAM USED(%) is stored as -1 when it is empty
// or "inf". The list-valued fields are decoded on a best-effort basis and
// left empty when they cannot be decoded. The result is never nil on success.
func parseDCUPidInfo(s string) ([]DCUPidInfo, error) {
	var (
		records []map[string]string
		current map[string]string // record being filled; nil before the first PID line
	)
	for _, line := range strings.Split(strings.Trim(s, "\n"), "\n") {
		if ReEmptyLine.MatchString(line) || ReUselessLine.MatchString(line) {
			continue
		}
		kv := strings.SplitN(strings.TrimSpace(line), ":", 2)
		if len(kv) != 2 {
			continue
		}
		key, value := strings.TrimSpace(kv[0]), strings.TrimSpace(kv[1])
		switch key {
		case PIDHeader:
			current = map[string]string{PIDHeader: value}
			records = append(records, current)
		case PASIDHeader, HCUNodeHeader, HCUIndexHeader, GPUIDHeader, PCIBusHeader,
			VRamUsedHeader, VRamUsedPercentHeader, SDMAUsedHeader:
			if current == nil {
				continue
			}
			current[key] = value
		}
	}

	// parseList decodes values such as ['0000:49:00.0']. Empty values are
	// expected for processes without an assigned device, so decoding errors
	// are deliberately ignored.
	parseList := func(raw string) []string {
		var items []string
		_ = json.Unmarshal([]byte(strings.ReplaceAll(raw, "'", `"`)), &items)
		return items
	}

	result := make([]DCUPidInfo, 0, len(records))
	for _, rec := range records {
		var (
			item DCUPidInfo
			err  error
		)
		if item.Pid, err = strconv.ParseUint(rec[PIDHeader], 10, 64); err != nil {
			return nil, err
		}
		if item.PASId, err = strconv.ParseUint(rec[PASIDHeader], 10, 64); err != nil {
			return nil, err
		}
		item.HCUNode = parseList(rec[HCUNodeHeader])
		item.HCUIndex = parseList(rec[HCUIndexHeader])
		item.GPUID = parseList(rec[GPUIDHeader])
		item.PCIBus = parseList(rec[PCIBusHeader])

		vram, err := strconv.ParseUint(rec[VRamUsedHeader], 10, 64)
		if err != nil {
			return nil, err
		}
		item.VRamUsed = utils.MemorySize{Num: vram, Unit: utils.MiB}

		if percent := rec[VRamUsedPercentHeader]; percent == "" || percent == "inf" {
			item.VRamUsedPercent = -1
		} else if item.VRamUsedPercent, err = strconv.Atoi(percent); err != nil {
			return nil, err
		}

		if item.SDMAUsed, err = strconv.Atoi(rec[SDMAUsedHeader]); err != nil {
			return nil, err
		}
		result = append(result, item)
	}
	return result, nil
}
package gpu
import (
"errors"
"get-container/utils"
"os/exec"
"strconv"
"strings"
)
/*
Helpers that obtain information from the nvidia-smi command.
*/
// Field names (the text before ":") looked up in the output of
// `nvidia-smi --version`; they are compared case-insensitively.
const (
SmiVersionHeader = "NVIDIA-SMI version"
NvmlVersionHeader = "NVML version"
DriverVersionHeader = "DRIVER version"
CudaVersionHeader = "CUDA Version"
)
// NVAppInfo holds the per-process information that can be read directly from
// nvidia-smi (--query-compute-apps).
type NVAppInfo struct {
GPUName string // gpu_name
GPUBusId string // gpu_bus_id
GPUSerial string // gpu_serial
GPUUUID string // gpu_uuid
Pid uint64 // pid
ProcessName string // process_name
UsedGPUMemory utils.MemorySize // used_gpu_memory
}
// GetAppInfo returns the compute processes currently using NVIDIA GPUs, as
// reported by
//
//	nvidia-smi --query-compute-apps=gpu_name,gpu_bus_id,gpu_serial,gpu_uuid,pid,process_name,used_gpu_memory --format=csv,noheader
//
// Sample output:
//
//	NVIDIA H20, 00000000:0F:00.0, 1321424020484, GPU-f71f52ad-4c29-30dd-7f0f-609de5ff1510, 1272015, /usr/bin/python3, 89976 MiB
//	NVIDIA H20, 00000000:34:00.0, 1321424019230, GPU-e6c3552d-98b5-fd23-a8e0-c1d85fccfdaa, 1272016, /usr/bin/python3, 90072 MiB
//
// Lines with fewer than 7 comma-separated fields are skipped. An unparsable
// PID or memory size aborts the call with an error. When no process is
// reported the result is an empty, non-nil slice (consistent with GetGPUInfo).
func GetAppInfo() ([]NVAppInfo, error) {
	output, err := exec.Command("nvidia-smi",
		"--query-compute-apps=gpu_name,gpu_bus_id,gpu_serial,gpu_uuid,pid,process_name,used_gpu_memory",
		"--format=csv,noheader").Output()
	if err != nil {
		return nil, err
	}
	result := make([]NVAppInfo, 0)
	outStr := strings.Trim(string(output), "\n")
	if outStr == "" {
		return result, nil
	}
	for _, line := range strings.Split(outStr, "\n") {
		fields := strings.Split(strings.TrimSpace(line), ",")
		if len(fields) < 7 {
			continue
		}
		// The first five columns and the last one have a fixed shape; anything
		// in between belongs to the process name, which may itself contain
		// commas.
		last := len(fields) - 1

		pid, err := strconv.ParseUint(strings.TrimSpace(fields[4]), 10, 64)
		if err != nil {
			return nil, err
		}
		size, err := utils.ParseMemorySize(fields[last])
		if err != nil {
			return nil, err
		}
		if size == nil {
			return nil, errors.New("parse storage size error")
		}
		result = append(result, NVAppInfo{
			GPUName:       strings.TrimSpace(fields[0]),
			GPUBusId:      strings.TrimSpace(fields[1]),
			GPUSerial:     strings.TrimSpace(fields[2]),
			GPUUUID:       strings.TrimSpace(fields[3]),
			Pid:           pid,
			ProcessName:   strings.TrimSpace(strings.Join(fields[5:last], ",")),
			UsedGPUMemory: *size,
		})
	}
	return result, nil
}
// NVVersionInfo holds the version strings reported by `nvidia-smi --version`.
type NVVersionInfo struct {
SMIVersion string
NVMLVersion string
DriverVersion string
CUDAVersion string
}
// GetVersionInfo runs `nvidia-smi --version` and extracts the SMI, NVML,
// driver and CUDA versions from its "name : value" lines (names are matched
// case-insensitively; unknown names are ignored).
//
// Blank lines, including the one produced by the trailing newline of the
// command output, are skipped; previously that trailing line made every call
// fail with a parse error. A non-blank line without ":" is still an error.
func GetVersionInfo() (*NVVersionInfo, error) {
	output, err := exec.Command("nvidia-smi", "--version").Output()
	if err != nil {
		return nil, err
	}
	if len(output) == 0 {
		return nil, errors.New("nvidia-smi version not found")
	}
	result := &NVVersionInfo{}
	for _, line := range strings.Split(string(output), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		field := strings.SplitN(line, ":", 2)
		if len(field) != 2 {
			return nil, errors.New("parse nvidia-smi version error")
		}
		value := strings.TrimSpace(field[1])
		switch strings.ToLower(strings.TrimSpace(field[0])) {
		case strings.ToLower(SmiVersionHeader):
			result.SMIVersion = value
		case strings.ToLower(NvmlVersionHeader):
			result.NVMLVersion = value
		case strings.ToLower(DriverVersionHeader):
			result.DriverVersion = value
		case strings.ToLower(CudaVersionHeader):
			result.CUDAVersion = value
		}
	}
	return result, nil
}
// Info holds basic per-GPU information. The trailing comment on each field
// names the nvidia-smi --query-gpu property it is read from.
type Info struct {
GPUName string // name
DriverVersion string // driver_version
PersistenceMode bool // persistence_mode Disabled/Enabled
FanSpeed string // fan.speed
Temperature string // temperature.gpu
PerformanceState string // pstate
BusID string // pci.bus_id
DisplayActive bool // display_active Disabled/Enabled
PowerUsage string // power.draw
PowerCapacity string // power.limit
MemorySize utils.MemorySize // memory.total
MemoryUsage utils.MemorySize // memory.used
VBIOSVersion string // vbios_version
MIGMode bool // mig.mode.current Disabled/Enabled
}
// GetGPUInfo queries nvidia-smi for one CSV line per GPU and converts every
// line with exactly 14 fields into an Info value; other lines are skipped.
// The boolean fields are true only when the tool prints "Enabled". A memory
// size that cannot be parsed aborts the call with an error. Empty command
// output yields an empty, non-nil slice.
//
// Sample line:
//
//	NVIDIA H20, 570.86.10, Enabled, [N/A], 34, P0, 00000000:0F:00.0, Disabled, 123.33 W, 500.00 W, 97871 MiB, 89986 MiB, 96.00.99.00.1D, Disabled
func GetGPUInfo() ([]Info, error) {
	const query = "--query-gpu=name,driver_version,persistence_mode,fan.speed,temperature.gpu,pstate,pci.bus_id,display_active,power.draw,power.limit,memory.total,memory.used,vbios_version,mig.mode.current"
	raw, err := exec.Command("nvidia-smi", "--format=csv,noheader", query).Output()
	if err != nil {
		return nil, err
	}

	gpus := make([]Info, 0)
	text := strings.Trim(string(raw), "\n")
	if len(text) == 0 {
		return gpus, nil
	}

	// parseSize converts one memory column, e.g. "97871 MiB".
	parseSize := func(column string) (utils.MemorySize, error) {
		size, parseErr := utils.ParseMemorySize(strings.TrimSpace(column))
		if parseErr != nil {
			return utils.MemorySize{}, parseErr
		}
		if size == nil {
			return utils.MemorySize{}, errors.New("parse storage size error")
		}
		return *size, nil
	}

	for _, row := range strings.Split(text, "\n") {
		cols := strings.Split(strings.TrimSpace(row), ",")
		if len(cols) != 14 {
			continue
		}
		for idx := range cols {
			cols[idx] = strings.TrimSpace(cols[idx])
		}

		total, totalErr := parseSize(cols[10])
		if totalErr != nil {
			return nil, totalErr
		}
		used, usedErr := parseSize(cols[11])
		if usedErr != nil {
			return nil, usedErr
		}

		gpus = append(gpus, Info{
			GPUName:          cols[0],
			DriverVersion:    cols[1],
			PersistenceMode:  cols[2] == "Enabled",
			FanSpeed:         cols[3],
			Temperature:      cols[4],
			PerformanceState: cols[5],
			BusID:            cols[6],
			DisplayActive:    cols[7] == "Enabled",
			PowerUsage:       cols[8],
			PowerCapacity:    cols[9],
			MemorySize:       total,
			MemoryUsage:      used,
			VBIOSVersion:     cols[12],
			MIGMode:          cols[13] == "Enabled",
		})
	}
	return gpus, nil
}
package gpu
import (
"encoding/json"
"testing"
)
const (
Data1 = `============================ System Management Interface =============================
======================================================================================
No KFD PIDs currently running!
======================================================================================
=================================== End of SMI Log ===================================`
Data2 = `================================= System Management Interface ==================================
================================================================================================
PIDs for KFD processes:
PID: 142211
PASID: 32775
HCU Node(Include CPU sort): ['2']
HCU Index: ['0']
GPUID: ['15868']
PCI BUS: ['0000:49:00.0']
VRAM USED(MiB): 54682
VRAM USED(%): 83
SDMA USED: 0
PID: 142218
PASID: 32772
HCU Node(Include CPU sort): ['8']
HCU Index: ['6']
GPUID: ['37441']
PCI BUS: ['0000:cd:00.0']
VRAM USED(MiB): 43485
VRAM USED(%): 66
SDMA USED: 0
PID: 142236
PASID: 32774
HCU Node(Include CPU sort):
HCU Index:
GPUID:
PCI BUS:
VRAM USED(MiB): 0
VRAM USED(%): inf
SDMA USED: 0
PID: 142216
PASID: 32773
HCU Node(Include CPU sort): ['6']
HCU Index: ['4']
GPUID: ['18915']
PCI BUS: ['0000:9c:00.0']
VRAM USED(MiB): 37438
VRAM USED(%): 57
SDMA USED: 0
PID: 142214
PASID: 32771
HCU Node(Include CPU sort): ['3']
HCU Index: ['1']
GPUID: ['51742']
PCI BUS: ['0000:54:00.0']
VRAM USED(MiB): 54815
VRAM USED(%): 84
SDMA USED: 0
PID: 142212
PASID: 32768
HCU Node(Include CPU sort): ['4']
HCU Index: ['2']
GPUID: ['14451']
PCI BUS: ['0000:5e:00.0']
VRAM USED(MiB): 40722
VRAM USED(%): 62
SDMA USED: 0
PID: 142249
PASID: 32780
HCU Node(Include CPU sort):
HCU Index:
GPUID:
PCI BUS:
VRAM USED(MiB): 0
VRAM USED(%): inf
SDMA USED: 0
PID: 142239
PASID: 32776
HCU Node(Include CPU sort):
HCU Index:
GPUID:
PCI BUS:
VRAM USED(MiB): 0
VRAM USED(%): inf
SDMA USED: 0
PID: 142247
PASID: 32781
HCU Node(Include CPU sort):
HCU Index:
GPUID:
PCI BUS:
VRAM USED(MiB): 0
VRAM USED(%): inf
SDMA USED: 0
PID: 142217
PASID: 32770
HCU Node(Include CPU sort): ['5']
HCU Index: ['3']
GPUID: ['34940']
PCI BUS: ['0000:67:00.0']
VRAM USED(MiB): 40826
VRAM USED(%): 62
SDMA USED: 0
PID: 142245
PASID: 32779
HCU Node(Include CPU sort):
HCU Index:
GPUID:
PCI BUS:
VRAM USED(MiB): 0
VRAM USED(%): inf
SDMA USED: 0
PID: 142235
PASID: 32784
HCU Node(Include CPU sort):
HCU Index:
GPUID:
PCI BUS:
VRAM USED(MiB): 0
VRAM USED(%): inf
SDMA USED: 0
PID: 142215
PASID: 32797
HCU Node(Include CPU sort): ['9']
HCU Index: ['7']
GPUID: ['46537']
PCI BUS: ['0000:dd:00.0']
VRAM USED(MiB): 43279
VRAM USED(%): 66
SDMA USED: 0
PID: 142243
PASID: 32778
HCU Node(Include CPU sort):
HCU Index:
GPUID:
PCI BUS:
VRAM USED(MiB): 0
VRAM USED(%): inf
SDMA USED: 0
PID: 142213
PASID: 32769
HCU Node(Include CPU sort): ['7']
HCU Index: ['5']
GPUID: ['4240']
PCI BUS: ['0000:bc:00.0']
VRAM USED(MiB): 37436
VRAM USED(%): 57
SDMA USED: 0
PID: 142241
PASID: 32777
HCU Node(Include CPU sort):
HCU Index:
GPUID:
PCI BUS:
VRAM USED(MiB): 0
VRAM USED(%): inf
SDMA USED: 0
================================================================================================
======================================== End of SMI Log ========================================`
)
// TestRegexp checks that a JSON array of strings decodes into a []string and
// logs the decoded value.
func TestRegexp(t *testing.T) {
	decoded := []string{}
	if err := json.Unmarshal([]byte(`["7"]`), &decoded); err != nil {
		t.Fatal(err)
	}
	t.Logf("%v", decoded)
}
// TestParseDCUPidInfo parses the two captured hy-smi outputs and verifies the
// result: Data2 describes 16 processes, Data1 reports that none is running.
func TestParseDCUPidInfo(t *testing.T) {
	busy, err := parseDCUPidInfo(Data2)
	if err != nil {
		t.Fatal(err)
	}
	for _, info := range busy {
		t.Logf("%+v\n", info)
	}
	if len(busy) != 16 {
		t.Fatalf("Data2: got %d records, want 16", len(busy))
	}
	first := busy[0]
	if first.Pid != 142211 || first.PASId != 32775 || first.VRamUsed.Num != 54682 ||
		first.VRamUsedPercent != 83 || first.SDMAUsed != 0 {
		t.Errorf("Data2: unexpected first record: %+v", first)
	}
	if len(first.HCUNode) != 1 || first.HCUNode[0] != "2" {
		t.Errorf("Data2: unexpected HCUNode of first record: %v", first.HCUNode)
	}
	// The third record has no device assigned and reports "inf" as percentage.
	if third := busy[2]; third.Pid != 142236 || third.VRamUsedPercent != -1 || len(third.PCIBus) != 0 {
		t.Errorf("Data2: unexpected third record: %+v", third)
	}

	idle, err := parseDCUPidInfo(Data1)
	if err != nil {
		t.Fatal(err)
	}
	for _, info := range idle {
		t.Logf("%+v\n", info)
	}
	if len(idle) != 0 {
		t.Errorf("Data1: got %d records, want 0", len(idle))
	}
}
package utils
import (
"os"
"path/filepath"
"strings"
)
const (
	// DefaultPATH is the search path used when the PATH environment variable
	// is empty or unset.
	DefaultPATH = "/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin"
)

// DetectCmd reports whether a command named c exists in one of the
// directories listed in PATH (DefaultPATH when PATH is empty). A candidate
// counts when it is not a directory and has at least one execute permission
// bit set. On success the second result is the path of the first match;
// otherwise it is "".
//
// Directory names are used exactly as they appear in PATH. Spaces used to be
// stripped from the whole PATH value, which corrupted directories whose names
// contain spaces.
func DetectCmd(c string) (bool, string) {
	path := os.Getenv("PATH")
	if path == "" {
		path = DefaultPATH
	}
	for _, dir := range strings.Split(path, string(os.PathListSeparator)) {
		fullPath := filepath.Join(dir, c)
		finfo, err := os.Stat(fullPath)
		if err != nil {
			continue
		}
		if !finfo.IsDir() && finfo.Mode()&0111 != 0 {
			return true, fullPath
		}
	}
	return false, ""
}
package utils
import "testing"
// TestDetectCmd looks up the "ps" command and logs the outcome; it makes no
// assertion about the result.
func TestDetectCmd(t *testing.T) {
	found, location := DetectCmd("ps")
	t.Logf("%v,%v", found, location)
}
package utils
import (
"fmt"
"log"
"os"
"regexp"
"strconv"
"github.com/shirou/gopsutil/v4/process"
)
var (
	// RePidNS matches the target of a /proc/<pid>/ns/pid link, for example
	// pid:[4026545939], and captures the namespace number.
	RePidNS = regexp.MustCompile(`pid:\[([1-9][0-9]*)]$`)
)

// GetPidNS returns the PID namespace number of the given process, read from
// the /proc/<pid>/ns/pid link. It fails when the link cannot be read or its
// target does not have the expected form.
func GetPidNS(pid uint64) (uint64, error) {
	link, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", pid))
	if err != nil {
		return 0, err
	}
	// FindStringSubmatch returns nil when the link target does not match.
	groups := RePidNS.FindStringSubmatch(link)
	if len(groups) < 2 {
		return 0, fmt.Errorf("error matching pid")
	}
	return strconv.ParseUint(groups[1], 10, 64)
}
// GetProcessInfo logs the command line of the process with the given pid.
// When the process or its command line cannot be read, it returns without
// logging anything.
func GetProcessInfo(pid int32) {
	proc, err := process.NewProcess(pid)
	if err != nil {
		return
	}
	if cmdline, cmdErr := proc.Cmdline(); cmdErr == nil {
		log.Println(cmdline)
	}
}
// GetProcessByName returns all running processes that match cmdline.
//
// A process matches when the first element of its command line equals
// cmdline, or when its process name equals cmdline. The second rule makes a
// lookup by bare command name (as detectNis does with "ypbind") find daemons
// that were started through an absolute path such as /usr/sbin/ypbind, which
// the argv[0] comparison alone never did. Processes whose details cannot be
// read are skipped. The result is never nil on success.
func GetProcessByName(cmdline string) ([]*process.Process, error) {
	procs, err := process.Processes()
	if err != nil {
		return nil, err
	}
	result := make([]*process.Process, 0)
	for _, proc := range procs {
		if argv, argvErr := proc.CmdlineSlice(); argvErr == nil && len(argv) > 0 && argv[0] == cmdline {
			result = append(result, proc)
			continue
		}
		if name, nameErr := proc.Name(); nameErr == nil && name == cmdline {
			result = append(result, proc)
		}
	}
	return result, nil
}
package utils
import "testing"
// TestGetProcessInfo calls GetProcessInfo for PID 1 as a smoke test; it makes
// no assertion.
func TestGetProcessInfo(t *testing.T) {
	const firstPID int32 = 1
	GetProcessInfo(firstPID)
}
package utils
import (
"os"
"os/exec"
"strconv"
"strings"
)
/*
Collects information about all users of the Linux system; when NIS is in use, the NIS users are parsed as well.
*/
const (
NISClient = "ypbind" // name of the NIS client command whose presence is probed
NISCat = "ypcat" // command run as "ypcat passwd" to dump the NIS passwd map
)
// detectNis reports whether this host is a NIS client: the ypbind command
// must be present in PATH and at least one matching process must be running.
// An error is returned only when the process list cannot be obtained.
func detectNis() (bool, error) {
	if installed, _ := DetectCmd(NISClient); !installed {
		return false, nil
	}
	running, err := GetProcessByName(NISClient)
	if err != nil {
		return false, err
	}
	return len(running) > 0, nil
}
// GetNisUsers returns all users known to NIS by parsing the output of
// "ypcat passwd". When ypcat is not installed it returns an empty, non-nil
// slice and no error.
func GetNisUsers() ([]SysUser, error) {
	found, ypcat := DetectCmd(NISCat)
	if !found {
		return []SysUser{}, nil
	}
	raw, err := exec.Command(ypcat, "passwd").Output()
	if err != nil {
		return nil, err
	}
	return parseSysUser(string(raw))
}
// SysUser describes one account in passwd(5) format.
type SysUser struct {
	Name         string
	Uid          int
	Gid          int
	IsSystemUser bool // true when Uid is below 1000
	HomeDir      string
	Shell        string
	GECOS        string // user information (comment) field
}

// parseSysUser parses passwd(5)-formatted text into SysUser values.
//
// Every line must have exactly seven ":"-separated fields with a numeric UID
// and GID. Lines that do not qualify (blank lines, comments, NIS compat
// entries such as "+::::::") are skipped; previously a single line with a
// non-numeric UID or GID aborted the whole parse and lost every user. The
// returned error is always nil and is kept for interface compatibility.
func parseSysUser(str string) ([]SysUser, error) {
	sysUsers := make([]SysUser, 0)
	for _, line := range strings.Split(strings.Trim(str, "\n"), "\n") {
		fields := strings.Split(line, ":")
		if len(fields) != 7 {
			continue
		}
		uid, err := strconv.Atoi(strings.TrimSpace(fields[2]))
		if err != nil {
			continue
		}
		gid, err := strconv.Atoi(strings.TrimSpace(fields[3]))
		if err != nil {
			continue
		}
		sysUsers = append(sysUsers, SysUser{
			Name:         strings.TrimSpace(fields[0]),
			Uid:          uid,
			Gid:          gid,
			IsSystemUser: uid < 1000,
			GECOS:        fields[4],
			HomeDir:      strings.TrimSpace(fields[5]),
			Shell:        strings.TrimSpace(fields[6]),
		})
	}
	return sysUsers, nil
}
// GetSysUsers returns the users of this system: the NIS users first (only
// when the host is detected as a NIS client), followed by the entries of
// /etc/passwd. Collection is best effort: a failing NIS query, an unreadable
// /etc/passwd or a parse failure does not produce an error, the affected
// source is simply left out. Only a failing NIS detection is reported.
func GetSysUsers() ([]SysUser, error) {
	nisActive, err := detectNis()
	if err != nil {
		return nil, err
	}

	users := make([]SysUser, 0)
	if nisActive {
		nisUsers, _ := GetNisUsers()
		users = append(users, nisUsers...)
	}

	passwd, readErr := os.ReadFile("/etc/passwd")
	if readErr != nil {
		return users, nil
	}
	if local, parseErr := parseSysUser(string(passwd)); parseErr == nil {
		users = append(users, local...)
	}
	return users, nil
}
package utils
import (
"fmt"
"regexp"
"strconv"
"strings"
)
// StorageCapacityUnit is a storage unit expressed as its size in bytes.
type StorageCapacityUnit uint64
// Decimal units are successive multiples of 1000; binary units are powers of
// two (1<<10 per step).
const (
Byte StorageCapacityUnit = 1
KB StorageCapacityUnit = Byte * 1000
MB StorageCapacityUnit = KB * 1000
GB StorageCapacityUnit = MB * 1000
TB StorageCapacityUnit = GB * 1000
PB StorageCapacityUnit = TB * 1000
KiB StorageCapacityUnit = 1 << 10
MiB StorageCapacityUnit = 1 << 20
GiB StorageCapacityUnit = 1 << 30
TiB StorageCapacityUnit = 1 << 40
PiB StorageCapacityUnit = 1 << 50
)
var (
	// ReStorageSize matches a whitespace-free size such as "123MiB", "5kb" or
	// "0MiB": group 1 is the non-negative integer (no leading zeros), group 2
	// the unit. A plain "0" is accepted because tools report idle devices as
	// "0 MiB"; it used to be rejected.
	ReStorageSize = regexp.MustCompile(`^(0|[1-9][0-9]*)((?:[KMGTPkmgtp]|)i?[bB])$`)
	// ReUnit matches a unit on its own: group 1 is the optional magnitude
	// prefix, group 2 the optional binary marker "i", group 3 the "b"/"B".
	ReUnit = regexp.MustCompile(`^([kmgtpKMGTP]|)(|i)([bB])$`)
)
// MemorySize is an amount of memory: Num is the number and Unit is the unit
// that the number is expressed in.
type MemorySize struct {
Num uint64
Unit StorageCapacityUnit
}
// ParseUnit converts a unit string such as "MiB", "kb" or "B" into a
// StorageCapacityUnit. Surrounding whitespace and embedded spaces are
// ignored and letter case does not matter. A unit with the "i" marker maps
// to the binary unit, without it to the decimal unit; a unit without a
// magnitude prefix is Byte. Anything else is an error.
func ParseUnit(s string) (StorageCapacityUnit, error) {
	cleaned := strings.ReplaceAll(strings.Trim(strings.TrimSpace(s), "\n"), " ", "")
	if !ReUnit.MatchString(cleaned) {
		return 0, fmt.Errorf("invalid storage size unit: %s", cleaned)
	}
	lowered := strings.ToLower(cleaned)
	// Sub-matches, e.g. [mib m i b], [kb k "" b], [b "" "" b].
	parts := ReUnit.FindStringSubmatch(lowered)
	if len(parts) < 4 {
		return 0, fmt.Errorf("invalid storage size unit: %s", lowered)
	}
	if parts[1] == "" {
		return Byte, nil
	}

	type unitPair struct{ decimal, binary StorageCapacityUnit }
	byPrefix := map[string]unitPair{
		"k": {KB, KiB},
		"m": {MB, MiB},
		"g": {GB, GiB},
		"t": {TB, TiB},
		"p": {PB, PiB},
	}
	pair, known := byPrefix[parts[1]]
	if !known {
		return 0, fmt.Errorf("invalid storage size unit: %s", lowered)
	}
	if parts[2] == "i" {
		return pair.binary, nil
	}
	return pair.decimal, nil
}
// ParseMemorySize parses a capacity string into a MemorySize. Accepted forms
// include 123MiB, 123MB, "123 MiB" and "123 MB"; spaces are removed before
// matching against ReStorageSize. An input that does not match, a number
// that does not fit into uint64, or an unknown unit yields an error.
func ParseMemorySize(s string) (*MemorySize, error) {
	compact := strings.ReplaceAll(strings.TrimSpace(strings.Trim(s, " \n")), " ", "")
	// FindStringSubmatch returns nil when the input does not match.
	groups := ReStorageSize.FindStringSubmatch(compact)
	if len(groups) < 3 {
		return nil, fmt.Errorf("invalid memory size format: %s", compact)
	}
	amount, err := strconv.ParseUint(groups[1], 10, 64)
	if err != nil {
		return nil, err
	}
	unit, err := ParseUnit(groups[2])
	if err != nil {
		return nil, err
	}
	return &MemorySize{Num: amount, Unit: unit}, nil
}
package utils
import (
"testing"
)
// TestRegexp logs how ReUnit matches a few sample units and verifies that
// ReStorageSize accepts "123MiB" and "123MB".
func TestRegexp(t *testing.T) {
	units := []string{"b", "kb", "kib"}
	for _, unit := range units {
		t.Log(ReUnit.MatchString(unit))
	}
	for _, unit := range units {
		groups := ReUnit.FindStringSubmatch(unit)
		t.Logf("%+v, %d", groups, len(groups))
	}
	for _, size := range []string{"123MiB", "123MB"} {
		if !ReStorageSize.MatchString(size) {
			t.Errorf("Error match %s", size)
			continue
		}
		t.Logf("%+v", ReStorageSize.FindStringSubmatch(size))
	}
}
// TestParseUnit checks that ParseUnit maps unit strings to the expected
// StorageCapacityUnit values.
func TestParseUnit(t *testing.T) {
	testData := []string{"MiB", "MB", "B", "b"}
	want := []StorageCapacityUnit{MiB, MB, Byte, Byte}
	for index, unit := range testData {
		got, err := ParseUnit(unit)
		if err != nil {
			t.Errorf("Error match %d: %s", index, err)
			continue
		}
		// Report the expected value first and the actual value second (they
		// used to be swapped).
		if got != want[index] {
			t.Errorf("Error match %d: expected %d, got %d", index, want[index], got)
		}
	}
}
// TestParseMemorySize checks that ParseMemorySize returns the expected number
// and unit for a set of capacity strings, with and without a space.
func TestParseMemorySize(t *testing.T) {
	testData := []string{"1MiB", "2MB", "3B", "4b", "5 PiB"}
	want := []MemorySize{{Num: 1, Unit: MiB}, {Num: 2, Unit: MB}, {Num: 3, Unit: Byte}, {Num: 4, Unit: Byte}, {Num: 5, Unit: PiB}}
	for index, input := range testData {
		got, err := ParseMemorySize(input)
		if err != nil {
			t.Errorf("Error match %d: %s", index, err)
			continue
		}
		// Guard against a nil result so a failure is reported instead of a
		// nil-pointer panic.
		if got == nil {
			t.Errorf("Error match %d: got nil result without error", index)
			continue
		}
		if got.Num != want[index].Num || got.Unit != want[index].Unit {
			t.Errorf("Error match %d: expected %+v, got %+v", index, want[index], *got)
		}
	}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment