// Package gpu wraps the hy-smi command line tool and parses its textual
// output into Go structures.
package gpu

import (
	"encoding/json"
	"os/exec"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"get-container/utils"
)

const (
	// DCUBinaryFile is the name of the SMI executable invoked by this package.
	DCUBinaryFile = "hy-smi"

	// Keys of the "key: value" lines printed by `hy-smi --showpids`.
	PIDHeader             = "PID"
	PASIDHeader           = "PASID"
	HCUNodeHeader         = "HCU Node(Include CPU sort)"
	HCUIndexHeader        = "HCU Index"
	GPUIDHeader           = "GPUID"
	PCIBusHeader          = "PCI BUS"
	VRamUsedHeader        = "VRAM USED(MiB)"
	VRamUsedPercentHeader = "VRAM USED(%)"
	SDMAUsedHeader        = "SDMA USED"
)

// Regular expressions used to pick individual fields out of hy-smi output.
var (
	ReEmptyLine      = regexp.MustCompile(`^\s*$`)
	ReUselessLine    = regexp.MustCompile(`^=[ =a-zA-Z0-9]*=$`)
	ReInfoHeader     = regexp.MustCompile(`^HCU\[(\d+)]\s+:\s+(.*)$`)
	ReDriId          = regexp.MustCompile(`(?mi)^Device\s*ID\s*:\s*([x0-9]*)$`)
	ReVBIOSVersion   = regexp.MustCompile(`(?mi)^VBIOS\s*version\s*:\s*([0-9a-zA-Z.]*)$`)
	ReTempEdge       = regexp.MustCompile(`(?mi)^Temperature\s*\(Sensor\s*edge\)\s*\(C\)\s*:\s*([0-9.]*)$`)
	ReTempJunction   = regexp.MustCompile(`(?mi)^Temperature\s*\(Sensor\s*junction\)\s*\(C\)\s*:\s*([0-9.]*)$`)
	ReTempMem        = regexp.MustCompile(`(?mi)^Temperature\s*\(Sensor\s*mem\)\s*\(C\)\s*:\s*([0-9.]*)$`)
	ReTempCore       = regexp.MustCompile(`(?mi)^Temperature\s*\(Sensor\s*core\)\s*\(C\)\s*:\s*([0-9.]*)$`)
	ReFClk           = regexp.MustCompile(`(?mi)^fclk\s*clock\s*level\s*:\s*([0-9]*)\s*\(([0-9a-zA-Z]*)\)$`)
	ReMClk           = regexp.MustCompile(`(?mi)^mclk\s*clock\s*level\s*:\s*([0-9]*)\s*\(([0-9a-zA-Z]*)\)$`)
	ReSClk           = regexp.MustCompile(`(?mi)^sclk\s*clock\s*level\s*:\s*([0-9]*)\s*\(([0-9a-zA-Z]*)\)$`)
	ReSOCClk         = regexp.MustCompile(`(?mi)^socclk\s*clock\s*level\s*:\s*([0-9]*)\s*\(([0-9a-zA-Z]*)\)$`)
	RePCIClk         = regexp.MustCompile(`(?mi)^pcie\s*clock\s*level\s*([0-9]*)\s*\(([0-9.a-zA-Z/]*)\s*,\s*([x0-9]*)\s([0-9a-zA-Z]*)\)$`)
	RePreLevel       = regexp.MustCompile(`(?mi)^Performance\s*Level\s*:\s*([0-9a-zA-Z]*)$`)
	ReMaxPwr         = regexp.MustCompile(`(?mi)^Max\s*Graphics\s*Package\s*Power\s*\((.*)\)\s*:\s*([0-9.]*)$`)
	ReAvgPwr         = regexp.MustCompile(`(?mi)^Average\s*Graphics\s*Package\s*Power\s*\((.*)\)\s*:\s*([0-9.]*)$`)
	ReHCUUsage       = regexp.MustCompile(`(?mi)^HCU\s*use\s*\(.*\)\s*:\s*([0-9.]*)$`)
	ReHCUMemUsage    = regexp.MustCompile(`(?mi)^HCU\s*memory\s*use\s*\(.*\)\s*:\s*([0-9.]*)$`)
	ReHCUMemVendor   = regexp.MustCompile(`(?mi)^HCU\s*Memory\s*Vendor\s*:\s*(.*)$`)
	RePCIeRelay      = regexp.MustCompile(`(?mi)^PCIe\s*Replay\s*Count\s*:\s*([0-9]*)$`)
	ReSerialNum      = regexp.MustCompile(`(?mi)^Serial\s*Number\s*:\s*([0-9a-zA-Z]*)$`)
	ReVoltage        = regexp.MustCompile(`(?mi)^Voltage\s*\((.*)\)\s*:\s*([0-9.]*)$`)
	RePCIBus         = regexp.MustCompile(`(?mi)^PCI\s*Bus\s*:\s*([0-9a-zA-Z.:]*)$`)
	ReMECFWVersion   = regexp.MustCompile(`(?mi)^MEC\s*Firmware\s*Version\s*:\s*([0-9.]*)$`)
	ReMEC2FWVersion  = regexp.MustCompile(`(?mi)^MEC2\s*Firmware\s*Version\s*:\s*([0-9.]*)$`)
	ReRLCFWVersion   = regexp.MustCompile(`(?mi)^RLC\s*Firmware\s*Version\s*:\s*([0-9.]*)$`)
	ReSDMAFWVersion  = regexp.MustCompile(`(?mi)^SDMA\s*Firmware\s*Version\s*:\s*([0-9.]*)$`)
	ReSDMA2FWVersion = regexp.MustCompile(`(?mi)^SDMA2\s*Firmware\s*Version\s*:\s*([0-9.]*)$`)
	ReSMCFWVersion   = regexp.MustCompile(`(?mi)^SMC\s*Firmware\s*Version\s*:\s*([0-9.]*)$`)
	ReCardSerial     = regexp.MustCompile(`(?mi)^Card\s*Series\s*:\s*(.*)$`)
	ReCardVendor     = regexp.MustCompile(`(?mi)^Card\s*Vendor\s*:\s*(.*)$`)
	ReVersion        = regexp.MustCompile(`(?i)^version\s*([0-9a-zA-Z.]*)\s*\(.*\)$`)
	ReDriVersion     = regexp.MustCompile(`(?mi)^driver\s*version\s*:\s*(.*)$`)
)

// HYVersionInfo holds the version strings reported by hy-smi.
type HYVersionInfo struct {
	SMIVersion    string // taken from `hy-smi --version`
	DriverVersion string // taken from `hy-smi --showdriverversion`
}

// GetHYVersionInfo runs `hy-smi --version` and `hy-smi --showdriverversion`
// and extracts the version strings from their output.
//
// An error is returned when either command fails. A field is left empty when
// the corresponding output does not match the expected pattern.
func GetHYVersionInfo() (*HYVersionInfo, error) {
	versionBytes, err := exec.Command(DCUBinaryFile, "--version").Output()
	if err != nil {
		return nil, err
	}
	driBytes, err := exec.Command(DCUBinaryFile, "--showdriverversion").Output()
	if err != nil {
		return nil, err
	}

	result := HYVersionInfo{}
	if m := ReVersion.FindStringSubmatch(strings.TrimSpace(string(versionBytes))); len(m) >= 2 {
		result.SMIVersion = m[1]
	}
	if m := ReDriVersion.FindStringSubmatch(strings.TrimSpace(string(driBytes))); len(m) >= 2 {
		result.DriverVersion = m[1]
	}
	return &result, nil
}

// DCUInfo is a summary record for one DCU.
type DCUInfo struct {
	Id               int     // id
	Name             string  // DCU name
	PerformanceLevel string  // performance level
	FanSpeed         float32 // fan speed
	Temperature      float32 // average temperature
	PwrUsage         int16
	PwrCapacity      int16
	BusId            string
	MemTotal         int32
	MemUsed          int32
}

// ClockInfo describes the current level of one clock domain.
type ClockInfo struct {
	Level int    // clock level
	Freq  string // frequency
}

// PcieClockInfo describes the current PCIe clock level.
type PcieClockInfo struct {
	Level     int    // clock level
	Freq      string // frequency
	BandWidth string // bandwidth
	Times     string // multiplier
}

// SMIAllOutput holds the information printed by `hy-smi -a`, which lists
// comprehensive details for a single DCU.
type SMIAllOutput struct {
	Id             int
	DeviceId       string
	VBIOSVersion   string
	TempEdge       float32
	TempJunction   float32
	TempMem        float32
	TempCores      float32
	FClock         *ClockInfo
	MClock         *ClockInfo
	SClock         *ClockInfo
	SOCClock       *ClockInfo
	PCIEClock      *PcieClockInfo
	PerLevel       string
	MaxPwr         float32
	AvgPwr         float32
	HCUUsage       float32
	HCUMemUsage    float32
	HCUMemVendor   string
	PCIERelayCount int
	SerialNumber   string
	Voltage        float32
	PCIBus         string
	MECFWVersion   string
	MEC2FWVersion  string
	RLCFWVersion   string
	SDMAVersion    string
	SDMA2Version   string
	SMCVersion     string
	CardSeries     string
	CardVendor     string
}

// GetSMIAllOutput runs `hy-smi -a`, groups the "HCU[n] : ..." lines by device
// id and parses each group into an SMIAllOutput. The result is sorted by Id.
//
// An error is returned when the command fails or a device id is not a valid
// integer.
func GetSMIAllOutput() ([]*SMIAllOutput, error) {
	b, err := exec.Command(DCUBinaryFile, "-a").CombinedOutput()
	if err != nil {
		return nil, err
	}

	// Collect, per device id, the text that follows the "HCU[n] :" prefix.
	info := make(map[int][]string)
	for _, line := range strings.Split(strings.TrimSpace(string(b)), "\n") {
		if ReUselessLine.MatchString(line) || ReEmptyLine.MatchString(line) {
			continue
		}
		normalized := strings.TrimSpace(strings.ReplaceAll(line, "\t", " "))
		fields := ReInfoHeader.FindStringSubmatch(normalized)
		if fields == nil {
			// Not an "HCU[n] : ..." line.
			continue
		}
		id, innerErr := strconv.Atoi(fields[1])
		if innerErr != nil {
			return nil, innerErr
		}
		// append on a missing key starts from a nil slice, so no make is needed.
		info[id] = append(info[id], fields[2])
	}

	result := make([]*SMIAllOutput, 0, len(info))
	for id, lines := range info {
		item, innerErr := parseSMIAllOutput(id, strings.Join(lines, "\n"))
		if innerErr != nil {
			return nil, innerErr
		}
		if item != nil {
			result = append(result, item)
		}
	}

	// Map iteration order is random, so sort by id.
	sort.Slice(result, func(i, j int) bool {
		return result[i].Id < result[j].Id
	})
	return result, nil
}

// smiString returns capture group `group` of re in str, or "" when re does
// not match.
func smiString(re *regexp.Regexp, str string, group int) string {
	if s := regMatch(re, str, group); s != nil {
		return s[0]
	}
	return ""
}

// smiFloat32 returns capture group `group` of re in str parsed as a float32,
// or 0 when re does not match or the text is not a number.
func smiFloat32(re *regexp.Regexp, str string, group int) float32 {
	s := regMatch(re, str, group)
	if s == nil {
		return 0
	}
	f, err := strconv.ParseFloat(strings.TrimSpace(s[0]), 32)
	if err != nil {
		return 0
	}
	return float32(f)
}

// smiClock builds a ClockInfo from groups 1 (level) and 2 (frequency) of re,
// or returns nil when re does not match. A non-numeric level is left as 0.
func smiClock(re *regexp.Regexp, str string) *ClockInfo {
	s := regMatch(re, str, 1, 2)
	if s == nil {
		return nil
	}
	c := &ClockInfo{Freq: strings.TrimSpace(s[1])}
	if level, err := strconv.Atoi(strings.TrimSpace(s[0])); err == nil {
		c.Level = level
	}
	return c
}

// parseSMIAllOutput parses the newline-joined `hy-smi -a` lines of a single
// device (with the "HCU[n] :" prefix already stripped) into an SMIAllOutput.
//
// It returns (nil, nil) when str is blank. Fields whose line is missing or
// cannot be parsed keep their zero value; the returned error is always nil.
func parseSMIAllOutput(id int, str string) (*SMIAllOutput, error) {
	if len(strings.TrimSpace(str)) == 0 {
		return nil, nil
	}

	result := SMIAllOutput{
		Id:           id,
		DeviceId:     smiString(ReDriId, str, 1),
		VBIOSVersion: smiString(ReVBIOSVersion, str, 1),
		TempEdge:     smiFloat32(ReTempEdge, str, 1),
		TempJunction: smiFloat32(ReTempJunction, str, 1),
		TempMem:      smiFloat32(ReTempMem, str, 1),
		TempCores:    smiFloat32(ReTempCore, str, 1),
		FClock:       smiClock(ReFClk, str),
		MClock:       smiClock(ReMClk, str),
		SClock:       smiClock(ReSClk, str),
		SOCClock:     smiClock(ReSOCClk, str),
		PerLevel:     smiString(RePreLevel, str, 1),
		// Group 1 of the power and voltage patterns is the unit; group 2 is the value.
		MaxPwr:       smiFloat32(ReMaxPwr, str, 2),
		AvgPwr:       smiFloat32(ReAvgPwr, str, 2),
		HCUUsage:     smiFloat32(ReHCUUsage, str, 1),
		HCUMemUsage:  smiFloat32(ReHCUMemUsage, str, 1),
		HCUMemVendor: smiString(ReHCUMemVendor, str, 1),
		SerialNumber: smiString(ReSerialNum, str, 1),
		Voltage:      smiFloat32(ReVoltage, str, 2),
		// RePCIBus has a single capture group, so the value is group 1.
		PCIBus:        smiString(RePCIBus, str, 1),
		MECFWVersion:  smiString(ReMECFWVersion, str, 1),
		MEC2FWVersion: smiString(ReMEC2FWVersion, str, 1),
		RLCFWVersion:  smiString(ReRLCFWVersion, str, 1),
		SDMAVersion:   smiString(ReSDMAFWVersion, str, 1),
		SDMA2Version:  smiString(ReSDMA2FWVersion, str, 1),
		SMCVersion:    smiString(ReSMCFWVersion, str, 1),
		CardSeries:    smiString(ReCardSerial, str, 1),
		CardVendor:    smiString(ReCardVendor, str, 1),
	}

	// The PCIe clock line carries level, bandwidth, multiplier and frequency.
	if s := regMatch(RePCIClk, str, 1, 2, 3, 4); s != nil {
		c := PcieClockInfo{
			BandWidth: strings.TrimSpace(s[1]),
			Times:     strings.TrimSpace(s[2]),
			Freq:      strings.TrimSpace(s[3]),
		}
		if level, err := strconv.Atoi(strings.TrimSpace(s[0])); err == nil {
			c.Level = level
		}
		result.PCIEClock = &c
	}

	if s := regMatch(RePCIeRelay, str, 1); s != nil {
		if n, err := strconv.Atoi(strings.TrimSpace(s[0])); err == nil {
			result.PCIERelayCount = n
		}
	}

	return &result, nil
}

// DCURunningInfo is the runtime status of one DCU.
type DCURunningInfo struct {
	Id               int
	Temp             float32
	AvgPower         float32
	PerformanceLevel string
	MemPerc          float32
	HCUPerc          float32
}

// GetRunningInfo runs hy-smi without arguments and parses the resulting
// table into one DCURunningInfo per DCU.
func GetRunningInfo() ([]DCURunningInfo, error) {
	output, err := exec.Command(DCUBinaryFile).CombinedOutput()
	if err != nil {
		return nil, err
	}
	return parseRunningInfo(string(output))
}

// parseRunningInfo parses the whitespace-separated table printed by hy-smi.
//
// Separator lines, blank lines, rows with fewer than 8 columns and rows whose
// first column is not an integer (such as the header row) are skipped. An
// error is returned when a numeric column of an accepted row cannot be parsed.
func parseRunningInfo(info string) ([]DCURunningInfo, error) {
	lines := strings.Split(strings.TrimSpace(info), "\n")
	result := make([]DCURunningInfo, 0, len(lines))
	for _, line := range lines {
		if ReUselessLine.MatchString(line) || ReEmptyLine.MatchString(line) {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 8 {
			continue
		}
		id, err := strconv.Atoi(fields[0])
		if err != nil {
			continue
		}

		temp, err := strconv.ParseFloat(strings.TrimSuffix(strings.ToLower(fields[1]), "c"), 32)
		if err != nil {
			return nil, err
		}
		avgPwr, err := strconv.ParseFloat(strings.TrimSuffix(strings.ToLower(fields[2]), "w"), 32)
		if err != nil {
			return nil, err
		}
		vram, err := strconv.ParseFloat(strings.TrimSuffix(fields[5], "%"), 32)
		if err != nil {
			return nil, err
		}
		utl, err := strconv.ParseFloat(strings.TrimSuffix(fields[6], "%"), 32)
		if err != nil {
			return nil, err
		}

		result = append(result, DCURunningInfo{
			Id:               id,
			Temp:             float32(temp),
			AvgPower:         float32(avgPwr),
			PerformanceLevel: fields[3],
			MemPerc:          float32(vram),
			HCUPerc:          float32(utl),
		})
	}
	return result, nil
}

// DCUPidInfo describes one process reported by `hy-smi --showpids`.
type DCUPidInfo struct {
	Pid             uint64
	PASId           uint64
	HCUNode         []string
	HCUIndex        []string
	GPUID           []string
	PCIBus          []string
	VRamUsed        utils.MemorySize
	VRamUsedPercent int
	SDMAUsed        int
}

// GetDCUPidInfo runs `hy-smi --showpids` and returns the per-process
// information. Entries whose PCI bus list is empty are dropped.
func GetDCUPidInfo() ([]DCUPidInfo, error) {
	output, err := exec.Command(DCUBinaryFile, "--showpids").Output()
	if err != nil {
		return nil, err
	}
	return parseDCUPidInfo(string(output))
}

// firstHCUIndex returns the first entry of an HCU index list as an integer.
// ok is false when the list is empty or the entry is not a number.
func firstHCUIndex(indexes []string) (idx int, ok bool) {
	if len(indexes) == 0 {
		return 0, false
	}
	n, err := strconv.Atoi(indexes[0])
	if err != nil {
		return 0, false
	}
	return n, true
}

// parseDCUPidInfo parses the "key: value" output of `hy-smi --showpids`.
//
// Each PID line starts a new record and the lines that follow fill it in.
// Attribute lines that appear before the first PID line are ignored. Records
// with an empty PCI bus list are dropped. The result is ordered by the first
// HCU index of each record, with records lacking a numeric index placed last.
// An error is returned when a numeric field of a kept record cannot be parsed.
func parseDCUPidInfo(s string) ([]DCUPidInfo, error) {
	var (
		records []map[string]string
		current map[string]string // record being filled; nil until the first PID line
	)
	for _, line := range strings.Split(strings.Trim(s, "\n"), "\n") {
		if ReEmptyLine.MatchString(line) || ReUselessLine.MatchString(line) {
			continue
		}
		kv := strings.SplitN(strings.TrimSpace(line), ":", 2)
		if len(kv) != 2 {
			continue
		}
		key, value := kv[0], strings.TrimSpace(kv[1])
		switch key {
		case PIDHeader:
			current = map[string]string{PIDHeader: value}
			records = append(records, current)
		case PASIDHeader, HCUNodeHeader, HCUIndexHeader, GPUIDHeader, PCIBusHeader,
			VRamUsedHeader, VRamUsedPercentHeader, SDMAUsedHeader:
			if current == nil {
				continue
			}
			current[key] = value
		}
	}

	result := make([]DCUPidInfo, 0, len(records))
	for _, info := range records {
		i := DCUPidInfo{}
		// List values use single quotes (e.g. ['0']); swap them for double
		// quotes so they decode as JSON. Decoding is best effort: a failure
		// leaves the slice empty, and a record without a PCI bus is dropped.
		_ = json.Unmarshal([]byte(strings.ReplaceAll(info[HCUNodeHeader], "'", `"`)), &i.HCUNode)
		_ = json.Unmarshal([]byte(strings.ReplaceAll(info[HCUIndexHeader], "'", `"`)), &i.HCUIndex)
		_ = json.Unmarshal([]byte(strings.ReplaceAll(info[GPUIDHeader], "'", `"`)), &i.GPUID)
		_ = json.Unmarshal([]byte(strings.ReplaceAll(info[PCIBusHeader], "'", `"`)), &i.PCIBus)
		if len(i.PCIBus) == 0 {
			continue
		}

		var innerErr error
		if i.Pid, innerErr = strconv.ParseUint(info[PIDHeader], 10, 64); innerErr != nil {
			return nil, innerErr
		}
		if i.PASId, innerErr = strconv.ParseUint(info[PASIDHeader], 10, 64); innerErr != nil {
			return nil, innerErr
		}
		vram, innerErr := strconv.ParseUint(info[VRamUsedHeader], 10, 64)
		if innerErr != nil {
			return nil, innerErr
		}
		i.VRamUsed = utils.MemorySize{Num: vram, Unit: utils.MiB}

		// A missing or "inf" percentage is reported as -1.
		switch pct := info[VRamUsedPercentHeader]; pct {
		case "", "inf":
			i.VRamUsedPercent = -1
		default:
			if i.VRamUsedPercent, innerErr = strconv.Atoi(pct); innerErr != nil {
				return nil, innerErr
			}
		}

		if i.SDMAUsed, innerErr = strconv.Atoi(info[SDMAUsedHeader]); innerErr != nil {
			return nil, innerErr
		}
		result = append(result, i)
	}

	// Sort by DCU index; records without a usable index go last.
	sort.SliceStable(result, func(a, b int) bool {
		ia, okA := firstHCUIndex(result[a].HCUIndex)
		ib, okB := firstHCUIndex(result[b].HCUIndex)
		if !okA || !okB {
			return okA && !okB
		}
		return ia < ib
	})
	return result, nil
}

// regMatch runs reg against s and returns the requested submatches in the
// order given by index (0 is the whole match, 1 the first capture group).
// It returns nil when reg does not match or any index is out of range.
func regMatch(reg *regexp.Regexp, s string, index ...int) []string {
	m := reg.FindStringSubmatch(s)
	if m == nil {
		return nil
	}
	result := make([]string, 0, len(index))
	for _, ind := range index {
		if ind < 0 || ind >= len(m) {
			return nil
		}
		result = append(result, m[ind])
	}
	return result
}