Commit ee110e57 authored by liming6's avatar liming6
Browse files

feature 添加直接使用.so文件获取dcu信息的代码

parent 800b8f4d
package lib
/*
#cgo CFLAGS: -I.
#cgo LDFLAGS: -L/opt/hyhal/lib/ -lrocm_smi64
#include <stdlib.h>
#include <string.h>
#include "rocm_smi.h"
*/
import "C"
import (
"errors"
"fmt"
"unsafe"
)
// RSMIResult is the Go-side numeric status code. ToRSMIResult converts a
// C.rsmi_status_t to this type to look it up in RocmErrMap.
type RSMIResult uint32
// Status codes used as keys of RocmErrMap.
const (
RSMISuccess RSMIResult = 0
RSMIInvalidArgs RSMIResult = 1
RSMINotSupported RSMIResult = 2
RSMIFileError RSMIResult = 3
RSMIPermission RSMIResult = 4
RSMIOutOfResource RSMIResult = 5
RSMIInternalException RSMIResult = 6
RSMIInputOutOfBounds RSMIResult = 7
RSMIInitError RSMIResult = 8
RSMIInitalizationError RSMIResult = RSMIInitError // alias of RSMIInitError
RSMINotYetImplemented RSMIResult = 9
RSMINotFound RSMIResult = 10
RSMIInsufficientSize RSMIResult = 11
RSMIInterrupt RSMIResult = 12
RSMIUnexpectedSize RSMIResult = 13
RSMINoData RSMIResult = 14
RSMIUnexpectedData RSMIResult = 15
RSMIBusy RSMIResult = 16
RSMIRefcountOverflow RSMIResult = 17
RSMISettingUnavaliable RSMIResult = 18
RSMIAmdGPURestartErr RSMIResult = 19
RSMIMallocError RSMIResult = 0xFFFFFFFE // malloc failed
RSMIUnknowError RSMIResult = 0xFFFFFFFF
)
var (
	// PerfNameMap maps the numeric performance level returned by
	// RSMI_dev_perf_level_get to a display name.
	PerfNameMap = map[int32]string{
		0:   "auto",
		1:   "low",
		2:   "high",
		3:   "manual",
		4:   "stable std",
		5:   "stable peak",
		6:   "stable min mclk",
		7:   "stable min sclk",
		8:   "determinism",
		256: "unknow",
	}

	// Sentinel errors, one per RSMIResult code. Compare with errors.Is.
	ErrInvalidArgs        = errors.New("rocm invalid args")
	ErrNotSupported       = errors.New("rocm not supported")
	ErrFileError          = errors.New("rocm file error")
	ErrPermission         = errors.New("rocm permission error")
	ErrOutOfResource      = errors.New("rocm out of resource")
	ErrInternalException  = errors.New("rocm internal exception")
	ErrInputOutOfBounds   = errors.New("rocm input out of bounds")
	ErrInitError          = errors.New("rocm init error")
	ErrInitalizationError = ErrInitError
	ErrNotYetImplemented  = errors.New("rocm not yet implemented")
	ErrNotFound           = errors.New("rocm not found")
	ErrInsufficientSize   = errors.New("rocm insufficient size")
	ErrInterrupt          = errors.New("rocm interrupt")
	ErrUnexpectedSize     = errors.New("rocm unexpected size")
	ErrNoData             = errors.New("rocm no data")
	ErrUnexpectedData     = errors.New("rocm unexpected data")
	ErrBusy               = errors.New("rocm busy")
	ErrRefcountOverflow   = errors.New("rocm refcount overflow")
	ErrSettingUnavaliable = errors.New("rocm setting unavailable")
	ErrAmdGPURestartErr   = errors.New("rocm amd gpu restart error")
	// ErrMallocError is returned by the wrappers in this package when C.malloc
	// yields NULL.
	ErrMallocError = errors.New("alloc memory error")
	// ErrUnknowError is also what ToRSMIResult returns for codes missing from
	// RocmErrMap.
	ErrUnknowError = errors.New("rocm unknown error")

	// RocmErrMap translates a status code into its sentinel error; success maps
	// to nil.
	RocmErrMap = map[RSMIResult]error{
		RSMISuccess:            nil,
		RSMIInvalidArgs:        ErrInvalidArgs,
		RSMINotSupported:       ErrNotSupported,
		RSMIFileError:          ErrFileError,
		RSMIPermission:         ErrPermission,
		RSMIOutOfResource:      ErrOutOfResource,
		RSMIInternalException:  ErrInternalException,
		RSMIInputOutOfBounds:   ErrInputOutOfBounds,
		RSMIInitError:          ErrInitError,
		RSMINotYetImplemented:  ErrNotYetImplemented,
		RSMINotFound:           ErrNotFound,
		RSMIInsufficientSize:   ErrInsufficientSize,
		RSMIInterrupt:          ErrInterrupt,
		RSMIUnexpectedSize:     ErrUnexpectedSize,
		RSMINoData:             ErrNoData,
		RSMIUnexpectedData:     ErrUnexpectedData,
		RSMIBusy:               ErrBusy,
		RSMIRefcountOverflow:   ErrRefcountOverflow,
		RSMISettingUnavaliable: ErrSettingUnavaliable,
		RSMIAmdGPURestartErr:   ErrAmdGPURestartErr,
		RSMIMallocError:        ErrMallocError,
		RSMIUnknowError:        ErrUnknowError,
	}
)
func ToRSMIResult(c C.rsmi_status_t) error {
e, have := RocmErrMap[RSMIResult(c)]
if have {
return e
}
return ErrUnknowError
}
// RSMIProcessInfo is the Go counterpart of the C struct rsmi_process_info_t.
// FromC fills every field except UsedGPUIndex.
type RSMIProcessInfo struct {
Pid uint32 // Process ID
ProcessAddressSpaceId uint32 // PASID: (Process Address Space ID)
VarmUsage uint64 // VRAM usage
SdmaUsage uint64 // SDMA usage in microseconds
CuOccupancy uint32 // Compute Unit usage in percent
// UsedGPUIndex is not part of the C struct; rocmlib.GetProcessInfo fills it
// from RSMI_compute_process_gpus_get.
UsedGPUIndex []uint32
}
func (pi *RSMIProcessInfo) FromC(c C.rsmi_process_info_t) {
pi.Pid = uint32(c.process_id)
pi.ProcessAddressSpaceId = uint32(c.pasid)
pi.VarmUsage = uint64(c.vram_usage)
pi.SdmaUsage = uint64(c.sdma_usage)
pi.CuOccupancy = uint32(c.cu_occupancy)
}
// RSMIProcessInfoV2 is the Go view of the C struct rsmi_process_info_v2_t,
// populated by its FromC method.
type RSMIProcessInfoV2 struct {
Pid uint32
VramUsageSize uint64 // VRAM usage size in MiB
VramUsageRate float32 // VRAM usage rate as a percentage
UsedGPUs int // Used gpu number
GPUUsage map[int]float32 // GPU usage rate as a percentage
}
func (pi2 *RSMIProcessInfoV2) FromC(c C.rsmi_process_info_v2_t) {
pi2.Pid = uint32(c.processId)
pi2.VramUsageSize = uint64(c.vramUsageSize)
pi2.VramUsageRate = float32(c.vramUsageRate)
pi2.UsedGPUs = int(c.usedGpus)
pi2.GPUUsage = make(map[int]float32)
for k, v := range c.gpuIndex {
pi2.GPUUsage[int(v)] = float32(c.gpuUsageRate[k])
}
}
// RSMI_init 初始化rsmi
func RSMI_init() error {
return ToRSMIResult(C.rsmi_init(0))
}
// RSMI_shut_down 关闭rsmi
func RSMI_shut_down() error {
return ToRSMIResult(C.rsmi_shut_down())
}
// RSMI_num_monitor_devices 获取设备数量
func RSMI_num_monitor_devices() (uint32, error) {
var num C.uint
ptr := (*C.uint)(unsafe.Pointer(&num))
res := C.rsmi_num_monitor_devices(ptr)
return uint32(num), ToRSMIResult(res)
}
// RSMI_version_get 获取当前运行的RSMI版本
func RSMI_version_get() (uint32, uint32, uint32, error) {
result := (*C.rsmi_version_t)(C.malloc(C.sizeof_rsmi_version_t))
if unsafe.Pointer(result) != C.NULL {
defer func() {
C.free(unsafe.Pointer(result))
}()
} else {
return 0, 0, 0, ErrMallocError
}
res := C.rsmi_version_get(result)
return uint32(result.major), uint32(result.minor), uint32(result.patch), ToRSMIResult(res)
}
// RSMI_version_str_get 获取当前系统的驱动程序版本
func RSMI_version_str_get() (string, error) {
buff := make([]uint8, 128)
res := C.rsmi_version_str_get(C.RSMI_SW_COMP_DRIVER, (*C.char)(unsafe.Pointer(&buff[0])), 128)
return string(buff), ToRSMIResult(res)
}
// RSMI_dev_vbios_version_get 获取VBIOS版本
func RSMI_dev_vbios_version_get(deviceIndex uint32) (string, error) {
buff := make([]uint8, 128)
res := C.rsmi_dev_vbios_version_get(C.uint(deviceIndex), (*C.char)(unsafe.Pointer(&buff[0])), 128)
return string(buff), ToRSMIResult(res)
}
// RSMI_dev_name_get 获取设备名称,只有开头的字符,没有数字,如BW
func RSMI_dev_name_get(deviceIndex uint32) (string, error) {
buff := make([]uint8, 128)
res := C.rsmi_dev_name_get(C.uint(deviceIndex), (*C.char)(unsafe.Pointer(&buff[0])), 128)
return string(buff), ToRSMIResult(res)
}
// RSMI_dev_id_get 获取设备id
func RSMI_dev_id_get(deviceIndex uint32) (uint16, error) {
var id C.ushort
res := C.rsmi_dev_id_get(C.uint(deviceIndex), (*C.ushort)(unsafe.Pointer(&id)))
return uint16(id), ToRSMIResult(res)
}
// RSMI_dev_sku_get 获取设备的sku号
func RSMI_dev_sku_get(deviceIndex uint32) (uint16, error) {
var sku C.ushort
res := C.rsmi_dev_sku_get(C.uint(deviceIndex), (*C.ushort)(unsafe.Pointer(&sku)))
return uint16(sku), ToRSMIResult(res)
}
func RSMI_dev_vendor_id_get(deviceIndex uint32) (uint16, error) {
var vendor C.ushort
res := C.rsmi_dev_vendor_id_get(C.uint(deviceIndex), (*C.ushort)(unsafe.Pointer(&vendor)))
return uint16(vendor), ToRSMIResult(res)
}
func RSMI_dev_brand_get(deviceIndex uint32) (string, error) {
buff := make([]uint8, 128)
res := C.rsmi_dev_brand_get(C.uint(deviceIndex), (*C.char)(unsafe.Pointer(&buff[0])), 128)
return string(buff), ToRSMIResult(res)
}
func RSMI_dev_serial_number_get(deviceIndex uint32) (string, error) {
buff := make([]uint8, 128)
res := C.rsmi_dev_serial_number_get(C.uint(deviceIndex), (*C.char)(unsafe.Pointer(&buff[0])), 128)
return string(buff), ToRSMIResult(res)
}
// RSMI_dev_subsystem_name_get 获取设备全名
func RSMI_dev_subsystem_name_get(deviceIndex uint32) (string, error) {
buff := make([]uint8, 128)
res := C.rsmi_dev_subsystem_name_get(C.uint(deviceIndex), (*C.char)(unsafe.Pointer(&buff[0])), 128)
return string(buff), ToRSMIResult(res)
}
// RSMI_dev_perf_level_get 获取设备运行等级
func RSMI_dev_perf_level_get(deviceIndex uint32) (int32, error) {
var level C.rsmi_dev_perf_level_t
res := C.rsmi_dev_perf_level_get(C.uint(deviceIndex), (*C.rsmi_dev_perf_level_t)(unsafe.Pointer(&level)))
return int32(level), ToRSMIResult(res)
}
// RSMI_compute_process_info_get 获取所有使用显卡的进程信息
func RSMI_compute_process_info_get() ([]RSMIProcessInfo, error) {
ps := (*C.rsmi_process_info_t)(C.malloc(C.sizeof_rsmi_process_info_t * 128))
if unsafe.Pointer(ps) != C.NULL {
defer func() {
C.free(unsafe.Pointer(ps))
}()
} else {
return nil, ErrMallocError
}
var num C.uint = C.uint(128)
res := C.rsmi_compute_process_info_get(ps, (*C.uint)(unsafe.Pointer(&num)))
if res != C.RSMI_STATUS_SUCCESS {
return nil, ToRSMIResult(res)
}
psSlice := unsafe.Slice((*C.rsmi_process_info_t)(unsafe.Pointer(ps)), int(num))
if len(psSlice) == 0 {
return make([]RSMIProcessInfo, 0), ToRSMIResult(res)
}
result := make([]RSMIProcessInfo, int(num))
for i := range int(num) {
result[i].FromC(psSlice[i])
}
return result, ToRSMIResult(res)
}
// RSMI_compute_process_info_by_pid_get_v2 获取进程的详细信息,注意:不是所有版本的so文件都支持该方法,可能导致进程崩溃
func RSMI_compute_process_info_by_pid_get_v2(pid uint32) (info *RSMIProcessInfoV2, res error) {
ps2 := (*C.rsmi_process_info_v2_t)(C.malloc(C.sizeof_rsmi_process_info_v2_t))
if unsafe.Pointer(ps2) != C.NULL {
defer func() {
C.free(unsafe.Pointer(ps2))
}()
} else {
info = nil
res = ErrMallocError
return
}
defer func() {
if r := recover(); r != nil {
info = nil
res = ErrUnknowError
}
}()
r := C.rsmi_compute_process_info_by_pid_get_v2(C.uint(pid), ps2)
if res != nil {
info = nil
res = ToRSMIResult(r)
return
}
result := RSMIProcessInfoV2{}
result.FromC(*ps2)
info = &result
res = nil
return
}
func RSMI_dev_fan_rpms_get(devIndex uint32) (int64, error) {
var rpm C.long = C.long(0)
ptr := (*C.long)(unsafe.Pointer(&rpm))
res := C.rsmi_dev_fan_rpms_get(C.uint(devIndex), 0, ptr)
if ToRSMIResult(res) != nil {
return 0, ToRSMIResult(res)
}
return int64(rpm), nil
}
// RSMI_dev_temp_metric_get 获取设备核心温度,结果除以1000就是摄氏度
func RSMI_dev_temp_metric_get(devIndex uint32) (int64, error) {
var temp C.long = C.long(0)
ptr := (*C.long)(unsafe.Pointer(&temp))
res := C.rsmi_dev_temp_metric_get(C.uint(devIndex), C.RSMI_TEMP_TYPE_CORE, C.RSMI_TEMP_CURRENT, ptr)
return int64(temp), ToRSMIResult(res)
}
// RSMI_dev_power_ave_get 获取设备的平均功耗,单位是微瓦
func RSMI_dev_power_ave_get(devIndex uint32) (uint64, error) {
var power C.ulong = C.ulong(0)
ptr := (*C.ulong)(unsafe.Pointer(&power))
res := C.rsmi_dev_power_ave_get(C.uint(devIndex), 0, ptr)
return uint64(power), ToRSMIResult(res)
}
// RSMI_dev_power_cap_get 获取设备的功耗墙,单位是微瓦
func RSMI_dev_power_cap_get(devIndex uint32) (uint64, error) {
var cap C.ulong = C.ulong(0)
ptr := (*C.ulong)(unsafe.Pointer(&cap))
res := C.rsmi_dev_power_cap_get(C.uint(devIndex), 0, ptr)
return uint64(cap), ToRSMIResult(res)
}
// RSMI_dev_pci_id_get 获取设备的PCI id,对于0000:49:00.0,返回的是49:00部分
func RSMI_dev_pci_id_get(devIndex uint32) (uint64, error) {
var pciid C.ulong = C.ulong(0)
res := C.rsmi_dev_pci_id_get(C.uint(devIndex), (*C.ulong)(unsafe.Pointer(&pciid)))
return uint64(pciid), ToRSMIResult(res)
}
func RSMI_dev_memory_total_get(devIndex uint32) (uint64, error) {
var mem C.ulong = C.ulong(0)
res := C.rsmi_dev_memory_total_get(C.uint(devIndex), C.RSMI_MEM_TYPE_VRAM, (*C.ulong)(unsafe.Pointer(&mem)))
return uint64(mem), ToRSMIResult(res)
}
func RSMI_dev_memory_usage_get(devIndex uint32) (uint64, error) {
var mem C.ulong = C.ulong(0)
res := C.rsmi_dev_memory_usage_get(C.uint(devIndex), C.RSMI_MEM_TYPE_VRAM, (*C.ulong)(unsafe.Pointer(&mem)))
return uint64(mem), ToRSMIResult(res)
}
// nvmlDeviceGetMigMode
// dmi/dmi_mig.h
// RSMI_dev_busy_percent_get 获取设备的忙碌百分比
func RSMI_dev_busy_percent_get(devIndex uint32) (uint32, error) {
var percent C.uint = 0
res := C.rsmi_dev_busy_percent_get(C.uint(devIndex), (*C.uint)(unsafe.Pointer(&percent)))
return uint32(percent), ToRSMIResult(res)
}
func RSMI_ecc_enable(devIndex uint32) (bool, error) {
var blocks C.ulong = 0
res := C.rsmi_dev_ecc_enabled_get(C.uint(devIndex), (*C.ulong)(unsafe.Pointer(&blocks)))
if ToRSMIResult(res) != nil {
return false, ToRSMIResult(res)
}
fmt.Printf("%X\n", blocks)
ss := C.rsmi_gpu_block_t(blocks)
var stat C.rsmi_ras_err_state_t = 0
res = C.rsmi_dev_ecc_status_get(C.uint(devIndex), ss, (*C.rsmi_ras_err_state_t)(unsafe.Pointer(&stat)))
return uint32(stat) == uint32(C.RSMI_RAS_ERR_STATE_ENABLED), ToRSMIResult(res)
}
func RSMI_compute_process_gpus_get(pid uint32) ([]uint32, error) {
var devIds [32]C.uint
var devNum C.uint = 32
ptrDevIds := (*C.uint)(unsafe.Pointer(&devIds[0]))
res := C.rsmi_compute_process_gpus_get(C.uint(pid), ptrDevIds, (*C.uint)(unsafe.Pointer(&devNum)))
if ToRSMIResult(res) != nil {
return nil, ToRSMIResult(res)
}
if devNum > 0 {
result := make([]uint32, devNum)
for i := range devNum {
result[i] = uint32(devIds[i])
}
return result, nil
}
return make([]uint32, 0), nil
}
package lib
import (
"testing"
)
// TestRSMI_compute_process_info_get lists the GPU processes and fails when
// the library cannot be initialized or the query returns an error.
func TestRSMI_compute_process_info_get(t *testing.T) {
	if err := RSMI_init(); err != nil {
		t.Fatalf("RSMI_init: %v", err)
	}
	defer RSMI_shut_down()

	info, err := RSMI_compute_process_info_get()
	if err != nil {
		t.Error(err)
	}
	t.Logf("%+v", info)
}
// TestRSMI_dev_fan_rpms_get logs the fan speed of device 0; it fails only
// when the library cannot be initialized.
func TestRSMI_dev_fan_rpms_get(t *testing.T) {
	if err := RSMI_init(); err != nil {
		t.Fatalf("RSMI_init: %v", err)
	}
	defer RSMI_shut_down()

	rpm, err := RSMI_dev_fan_rpms_get(0)
	t.Log(rpm, err)
}
// TestRSMI_dev_temp_metric_get logs the core temperature of device 0; it
// fails only when the library cannot be initialized.
func TestRSMI_dev_temp_metric_get(t *testing.T) {
	if err := RSMI_init(); err != nil {
		t.Fatalf("RSMI_init: %v", err)
	}
	defer RSMI_shut_down()

	temp, err := RSMI_dev_temp_metric_get(0)
	t.Log(temp, err)
}
// TestRSMI_dev_power_ave_get logs the average power of device 0; it fails
// only when the library cannot be initialized.
func TestRSMI_dev_power_ave_get(t *testing.T) {
	if err := RSMI_init(); err != nil {
		t.Fatalf("RSMI_init: %v", err)
	}
	defer RSMI_shut_down()

	pwr, err := RSMI_dev_power_ave_get(0)
	t.Log(pwr, err)
}
// TestRSMI_dev_power_cap_get logs the power cap of device 0; it fails only
// when the library cannot be initialized.
func TestRSMI_dev_power_cap_get(t *testing.T) {
	if err := RSMI_init(); err != nil {
		t.Fatalf("RSMI_init: %v", err)
	}
	defer RSMI_shut_down()

	limit, err := RSMI_dev_power_cap_get(0)
	t.Log(limit, err)
}
// TestRSMI_dev_pci_id_get logs the PCI id of device 0 in decimal and hex; it
// fails only when the library cannot be initialized.
func TestRSMI_dev_pci_id_get(t *testing.T) {
	if err := RSMI_init(); err != nil {
		t.Fatalf("RSMI_init: %v", err)
	}
	defer RSMI_shut_down()

	pciID, err := RSMI_dev_pci_id_get(0)
	t.Log(pciID, err)
	t.Logf("%X", pciID)
}
// TestRSMI_dev_memory_total_get logs the total VRAM of device 0; it fails
// only when the library cannot be initialized.
func TestRSMI_dev_memory_total_get(t *testing.T) {
	if err := RSMI_init(); err != nil {
		t.Fatalf("RSMI_init: %v", err)
	}
	defer RSMI_shut_down()

	mem, err := RSMI_dev_memory_total_get(0)
	t.Log(mem, err)
}
// TestRSMI_dev_memory_usage_get logs the used VRAM of device 0; it fails only
// when the library cannot be initialized.
func TestRSMI_dev_memory_usage_get(t *testing.T) {
	if err := RSMI_init(); err != nil {
		t.Fatalf("RSMI_init: %v", err)
	}
	defer RSMI_shut_down()

	mem, err := RSMI_dev_memory_usage_get(0)
	t.Log(mem, err)
}
// TestRSMI_ecc_enable logs the ECC state of device 0; it fails only when the
// library cannot be initialized.
func TestRSMI_ecc_enable(t *testing.T) {
	if err := RSMI_init(); err != nil {
		t.Fatalf("RSMI_init: %v", err)
	}
	defer RSMI_shut_down()

	enabled, err := RSMI_ecc_enable(0)
	t.Log(enabled, err)
}
// TestRSMI_dev_brand_get logs the brand string of every monitored device.
// It now exercises RSMI_dev_brand_get, the function it is named after
// (previously it called RSMI_dev_subsystem_name_get), and stops with a
// message when initialization or the device count query fails.
func TestRSMI_dev_brand_get(t *testing.T) {
	if err := RSMI_init(); err != nil {
		t.Fatalf("RSMI_init: %v", err)
	}
	defer RSMI_shut_down()

	num, err := RSMI_num_monitor_devices()
	if err != nil {
		t.Fatalf("RSMI_num_monitor_devices: %v", err)
	}
	for i := range num {
		brand, err := RSMI_dev_brand_get(i)
		t.Log(i, brand, err)
	}
}
// TestRocmlib walks through every query method of the rocmlib wrapper and
// logs the results. A failed Init aborts the test immediately, since every
// later call would only return ErrNotInit; other failures are reported with
// t.Error so the remaining queries still run.
func TestRocmlib(t *testing.T) {
	rlib := GetRocmlib()
	ok, err := rlib.Init()
	if err != nil || !ok {
		t.Fatalf("Init: ok=%v err=%v", ok, err)
	}
	defer rlib.Shutdown()

	num, err := rlib.GetDevNumber()
	if err != nil {
		t.Error(err)
	}
	t.Logf("car num is : %d", num)

	name, err := rlib.GetDevName()
	if err != nil {
		t.Error(err)
	}
	t.Logf("%+v", name)

	plevel, err := rlib.GetPerfLevel()
	if err != nil {
		t.Error(err)
	}
	t.Logf("%+v", plevel)

	fan, err := rlib.GetFanSpeed()
	if err != nil {
		t.Error(err)
	}
	t.Logf("%+v", fan)

	temp, err := rlib.GetTemp()
	if err != nil {
		t.Error(err)
	}
	t.Logf("%+v", temp)

	avg, err := rlib.GetPowerAvg()
	if err != nil {
		t.Error(err)
	}
	t.Logf("%+v", avg)

	// Named powerCap rather than cap to avoid shadowing the builtin.
	powerCap, err := rlib.GetPowerCap()
	if err != nil {
		t.Error(err)
	}
	t.Logf("%+v", powerCap)

	pci, err := rlib.GetPCIBusId()
	if err != nil {
		t.Error(err)
	}
	t.Logf("%+v", pci)

	memTotal, err := rlib.GetMemTotal()
	if err != nil {
		t.Error(err)
	}
	t.Logf("%+v", memTotal)

	memUsed, err := rlib.GetMemUsed()
	if err != nil {
		t.Error(err)
	}
	t.Logf("%+v", memUsed)

	busy, err := rlib.GetBusyPercent()
	if err != nil {
		t.Error(err)
	}
	t.Logf("%+v", busy)

	v, err := rlib.GetSystemDriverVersion()
	if err != nil {
		t.Error(err)
	}
	t.Log(v)

	infos, err := rlib.GetProcessInfo()
	if err != nil {
		t.Error(err)
	}
	t.Log(infos)
}
package lib
import (
"errors"
"get-container/utils"
"sync/atomic"
)
var (
	// rocmlib_flag is 1 once GetRocmlib has created rocmlib_instance.
	rocmlib_flag atomic.Int32
	// rocmlib_instance is the shared instance handed out by GetRocmlib.
	rocmlib_instance *rocmlib

	// ErrNotGetDevNum is returned by per-device queries issued before
	// GetDevNumber has stored the device count.
	ErrNotGetDevNum = errors.New("not get dev num yet")
	// ErrNotInit is returned by queries issued while the library is not
	// initialized.
	ErrNotInit = errors.New("not init rocm lib yet")
)

// rocmlib is the handle through which the ROCm SMI library is used.
type rocmlib struct {
	status atomic.Int32 // library state: 0 = not initialized, 1 = initialized
	carNum int          // device count; -1 until GetDevNumber has been called
}

// GetRocmlib returns the shared rocmlib instance, creating it on first use.
//
// The instance is stored in rocmlib_instance before the flag is raised.
// Previously the instance was never stored, so every call after the first
// returned nil.
// NOTE(review): two goroutines racing on the very first call may each create
// an instance; call GetRocmlib once before going concurrent.
func GetRocmlib() *rocmlib {
	if rocmlib_flag.Load() == 1 {
		return rocmlib_instance
	}
	inst := &rocmlib{carNum: -1}
	rocmlib_instance = inst
	rocmlib_flag.Store(1)
	return inst
}
// Init initializes the RSMI library if this handle has not done so yet.
// It returns (true, nil) when the library is already marked initialized or
// RSMI_init succeeds; on failure the state is reset to 0 and (false, err) is
// returned.
func (r *rocmlib) Init() (bool, error) {
	if !r.status.CompareAndSwap(0, 1) {
		// Already marked as initialized.
		return true, nil
	}
	if err := RSMI_init(); err != nil {
		r.status.Store(0)
		return false, err
	}
	return true, nil
}
// Shutdown shuts the RSMI library down if this handle is marked initialized.
// It returns (true, nil) when nothing was initialized or RSMI_shut_down
// succeeds (in which case the cached device count is reset to -1); on failure
// the state is restored to 1 and (false, err) is returned.
func (r *rocmlib) Shutdown() (bool, error) {
	if !r.status.CompareAndSwap(1, 0) {
		// Nothing to shut down.
		return true, nil
	}
	if err := RSMI_shut_down(); err != nil {
		r.status.Store(1)
		return false, err
	}
	r.carNum = -1
	return true, nil
}
// IsInited reports whether the handle's state is 1 (initialized).
func (r *rocmlib) IsInited() bool {
	state := r.status.Load()
	return state == 1
}
// GetDevNumber queries the number of monitored devices, caches it in carNum
// and returns it. It returns ErrNotInit when the library is not initialized
// and leaves the cached count unchanged when the query fails.
func (r *rocmlib) GetDevNumber() (int, error) {
	if r.status.Load() != 1 {
		return 0, ErrNotInit
	}
	count, err := RSMI_num_monitor_devices()
	if err != nil {
		return 0, err
	}
	r.carNum = int(count)
	return r.carNum, nil
}
// GetDevName returns the subsystem name of every device, keyed by device
// index. A device whose name cannot be read is reported as "unknow".
// It returns ErrNotInit when the library is not initialized and
// ErrNotGetDevNum when GetDevNumber has not been called yet.
func (r *rocmlib) GetDevName() (map[int]string, error) {
	count := r.carNum
	switch {
	case r.status.Load() != 1:
		return nil, ErrNotInit
	case count == -1:
		return nil, ErrNotGetDevNum
	}
	names := make(map[int]string)
	for idx := 0; idx < count; idx++ {
		name, err := RSMI_dev_subsystem_name_get(uint32(idx))
		if err != nil {
			name = "unknow"
		}
		names[idx] = name
	}
	return names, nil
}
// GetPerfLevel returns the performance level name of every device, keyed by
// device index. A level that cannot be read, or that has no entry in
// PerfNameMap, is reported as "unknow".
// It returns ErrNotInit when the library is not initialized and
// ErrNotGetDevNum when GetDevNumber has not been called yet.
func (r *rocmlib) GetPerfLevel() (map[int]string, error) {
	count := r.carNum
	switch {
	case r.status.Load() != 1:
		return nil, ErrNotInit
	case count == -1:
		return nil, ErrNotGetDevNum
	}
	levels := make(map[int]string)
	for idx := 0; idx < count; idx++ {
		label := "unknow"
		if level, err := RSMI_dev_perf_level_get(uint32(idx)); err == nil {
			if name, ok := PerfNameMap[level]; ok {
				label = name
			}
		}
		levels[idx] = label
	}
	return levels, nil
}
// GetFanSpeed returns the fan speed of every device as reported by
// RSMI_dev_fan_rpms_get, keyed by device index. A device whose fan speed
// cannot be read is reported as 0.
// It returns ErrNotInit when the library is not initialized and
// ErrNotGetDevNum when GetDevNumber has not been called yet.
func (r *rocmlib) GetFanSpeed() (map[int]int64, error) {
	count := r.carNum
	switch {
	case r.status.Load() != 1:
		return nil, ErrNotInit
	case count == -1:
		return nil, ErrNotGetDevNum
	}
	speeds := make(map[int]int64)
	for idx := 0; idx < count; idx++ {
		rpm, err := RSMI_dev_fan_rpms_get(uint32(idx))
		if err != nil {
			rpm = 0
		}
		speeds[idx] = rpm
	}
	return speeds, nil
}
// GetTemp returns the temperature of every device as reported by
// RSMI_dev_temp_metric_get, keyed by device index. A device whose temperature
// cannot be read is reported as 0.
// It returns ErrNotInit when the library is not initialized and
// ErrNotGetDevNum when GetDevNumber has not been called yet.
func (r *rocmlib) GetTemp() (map[int]int64, error) {
	count := r.carNum
	switch {
	case r.status.Load() != 1:
		return nil, ErrNotInit
	case count == -1:
		return nil, ErrNotGetDevNum
	}
	temps := make(map[int]int64)
	for idx := 0; idx < count; idx++ {
		reading, err := RSMI_dev_temp_metric_get(uint32(idx))
		if err != nil {
			reading = 0
		}
		temps[idx] = reading
	}
	return temps, nil
}
// GetPowerAvg returns the average power of every device, keyed by device
// index. Values are passed through unchanged from RSMI_dev_power_ave_get,
// whose comment documents microwatts.
// NOTE(review): the previous comment here said milliwatts — confirm the unit.
// A device whose power cannot be read is reported as 0.
// It returns ErrNotInit when the library is not initialized and
// ErrNotGetDevNum when GetDevNumber has not been called yet.
func (r *rocmlib) GetPowerAvg() (map[int]uint64, error) {
	count := r.carNum
	switch {
	case r.status.Load() != 1:
		return nil, ErrNotInit
	case count == -1:
		return nil, ErrNotGetDevNum
	}
	powers := make(map[int]uint64)
	for idx := 0; idx < count; idx++ {
		pwr, err := RSMI_dev_power_ave_get(uint32(idx))
		if err != nil {
			pwr = 0
		}
		powers[idx] = pwr
	}
	return powers, nil
}
// GetPowerCap returns the power cap of every device, keyed by device index.
// Values are passed through unchanged from RSMI_dev_power_cap_get, whose
// comment documents microwatts.
// NOTE(review): the previous comment here said milliwatts — confirm the unit.
// A device whose cap cannot be read is reported as 0.
// It returns ErrNotInit when the library is not initialized and
// ErrNotGetDevNum when GetDevNumber has not been called yet.
func (r *rocmlib) GetPowerCap() (map[int]uint64, error) {
	count := r.carNum
	switch {
	case r.status.Load() != 1:
		return nil, ErrNotInit
	case count == -1:
		return nil, ErrNotGetDevNum
	}
	caps := make(map[int]uint64)
	for idx := 0; idx < count; idx++ {
		limit, err := RSMI_dev_power_cap_get(uint32(idx))
		if err != nil {
			limit = 0
		}
		caps[idx] = limit
	}
	return caps, nil
}
// GetPCIBusId returns the PCI address string of every device, keyed by device
// index. Each id from RSMI_dev_pci_id_get is formatted by utils.PCIBus with
// function number 0; a device whose id cannot be read is reported as
// "unknow".
// It returns ErrNotInit when the library is not initialized and
// ErrNotGetDevNum when GetDevNumber has not been called yet.
func (r *rocmlib) GetPCIBusId() (map[int]string, error) {
	count := r.carNum
	switch {
	case r.status.Load() != 1:
		return nil, ErrNotInit
	case count == -1:
		return nil, ErrNotGetDevNum
	}
	addrs := make(map[int]string)
	for idx := 0; idx < count; idx++ {
		id, err := RSMI_dev_pci_id_get(uint32(idx))
		if err != nil {
			addrs[idx] = "unknow"
			continue
		}
		addrs[idx] = utils.PCIBus(id, 0)
	}
	return addrs, nil
}
// GetMemTotal returns the total VRAM of every device as reported by
// RSMI_dev_memory_total_get, keyed by device index. A device whose value
// cannot be read is reported as 0.
// It returns ErrNotInit when the library is not initialized and
// ErrNotGetDevNum when GetDevNumber has not been called yet.
func (r *rocmlib) GetMemTotal() (map[int]uint64, error) {
	count := r.carNum
	switch {
	case r.status.Load() != 1:
		return nil, ErrNotInit
	case count == -1:
		return nil, ErrNotGetDevNum
	}
	totals := make(map[int]uint64)
	for idx := 0; idx < count; idx++ {
		total, err := RSMI_dev_memory_total_get(uint32(idx))
		if err != nil {
			total = 0
		}
		totals[idx] = total
	}
	return totals, nil
}
// GetMemUsed returns the used VRAM of every device as reported by
// RSMI_dev_memory_usage_get, keyed by device index. A device whose value
// cannot be read is reported as 0.
// It returns ErrNotInit when the library is not initialized and
// ErrNotGetDevNum when GetDevNumber has not been called yet.
func (r *rocmlib) GetMemUsed() (map[int]uint64, error) {
	count := r.carNum
	switch {
	case r.status.Load() != 1:
		return nil, ErrNotInit
	case count == -1:
		return nil, ErrNotGetDevNum
	}
	usage := make(map[int]uint64)
	for idx := 0; idx < count; idx++ {
		used, err := RSMI_dev_memory_usage_get(uint32(idx))
		if err != nil {
			used = 0
		}
		usage[idx] = used
	}
	return usage, nil
}
// GetBusyPercent returns the busy percentage of every device as reported by
// RSMI_dev_busy_percent_get, keyed by device index. A device whose value
// cannot be read is reported as 0.
// It returns ErrNotInit when the library is not initialized and
// ErrNotGetDevNum when GetDevNumber has not been called yet.
func (r *rocmlib) GetBusyPercent() (map[int]uint32, error) {
	count := r.carNum
	switch {
	case r.status.Load() != 1:
		return nil, ErrNotInit
	case count == -1:
		return nil, ErrNotGetDevNum
	}
	busy := make(map[int]uint32)
	for idx := 0; idx < count; idx++ {
		pct, err := RSMI_dev_busy_percent_get(uint32(idx))
		if err != nil {
			pct = 0
		}
		busy[idx] = pct
	}
	return busy, nil
}
// GetSystemDriverVersion returns the driver version string from
// RSMI_version_str_get, or ErrNotInit when the library is not initialized.
func (r *rocmlib) GetSystemDriverVersion() (string, error) {
	if r.status.Load() == 1 {
		return RSMI_version_str_get()
	}
	return "", ErrNotInit
}
// GetProcessInfo returns every GPU process together with the indices of the
// devices it uses.
//
// It returns ErrNotInit when the library is not initialized, ErrNotGetDevNum
// when GetDevNumber has not been called yet, and an empty slice when there
// are no devices. If listing the processes fails, that error is returned with
// whatever RSMI_compute_process_info_get produced; if resolving the devices of
// any process fails, the result is nil with that error.
func (r *rocmlib) GetProcessInfo() ([]RSMIProcessInfo, error) {
	count := r.carNum
	if r.status.Load() != 1 {
		return nil, ErrNotInit
	}
	if count == -1 {
		return nil, ErrNotGetDevNum
	}
	if count == 0 {
		return make([]RSMIProcessInfo, 0), nil
	}

	procs, err := RSMI_compute_process_info_get()
	if err != nil {
		return procs, err
	}
	for i := range procs {
		gpus, err := RSMI_compute_process_gpus_get(procs[i].Pid)
		if err != nil {
			return nil, err
		}
		procs[i].UsedGPUIndex = gpus
	}
	return procs, nil
}
This source diff could not be displayed because it is too large. You can view the blob instead.
package utils
import (
"fmt"
"strings"
)
// PCIBus formats id as a PCI address string "DDDD:BB:dd.F", e.g.
// 0000:01:00.0 means domain 0000, bus 01, device 00, function 0.
//
// Only the low 32 bits of id are used: bits 31-16 are printed as the domain,
// bits 15-8 as the bus and bits 7-0 as the device, all in upper-case hex.
// funcId is appended in hex as the function number.
//
// The fields are extracted by bit masking. The previous string-slicing
// version took the bus and device from the wrong offsets when id had more
// than 8 hex digits.
func PCIBus(id uint64, funcId uint) string {
	parts := []string{
		fmt.Sprintf("%04X", (id>>16)&0xFFFF),
		fmt.Sprintf("%02X", (id>>8)&0xFF),
		fmt.Sprintf("%02X", id&0xFF),
	}
	return strings.Join(parts, ":") + "." + fmt.Sprintf("%X", funcId)
}
package utils
import (
"strings"
"sync/atomic"
"testing"
)
......@@ -63,11 +63,16 @@ func TestHumanReadStr(t *testing.T) {
t.Log(ms.HumanReadStr(1))
}
func TestFo(t *testing.T) {
sb := strings.Builder{}
sb.WriteString("hello world\n")
sb.WriteString("time is come\n")
t.Log(sb.String())
sb.WriteString("okoko")
t.Log(sb.String())
// TestPCIBusId checks the PCI address formatting for ids of one to eight hex
// digits. The nine-digit case is only logged.
func TestPCIBusId(t *testing.T) {
	cases := []struct {
		id   uint64
		want string
	}{
		{0x1, "0000:00:01.1"},
		{0x12, "0000:00:12.1"},
		{0x123, "0000:01:23.1"},
		{0x1234, "0000:12:34.1"},
		{0x12345, "0001:23:45.1"},
		{0x123456, "0012:34:56.1"},
		{0x1234567, "0123:45:67.1"},
		{0x12345678, "1234:56:78.1"},
	}
	for _, c := range cases {
		got := PCIBus(c.id, 1)
		t.Log(got)
		if got != c.want {
			t.Errorf("PCIBus(%#x, 1) = %q, want %q", c.id, got, c.want)
		}
	}
	t.Log(PCIBus(0x123456789, 1))

	// Unrelated to PCIBus; kept from the original test.
	i := atomic.Int32{}
	t.Log(i.Load())
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment