// Package lib provides cgo bindings for the rsmi_* functions declared in rocm_smi.h.
package lib

/*
#cgo CFLAGS: -I.
#cgo LDFLAGS: -L/opt/hyhal/lib -lamd_smi

#include <stdlib.h>
#include <string.h>
#include "rocm_smi.h"
*/
import "C"
import (
	"errors"
	"fmt"
	"unsafe"
)

// RSMIResult is the Go-side numeric status code; ToRSMIResult converts a
// C.rsmi_status_t to this type before looking it up in RocmErrMap.
type RSMIResult uint32

// Known status codes. RSMIInitalizationError is an alias of RSMIInitError.
// The last two values are large sentinel codes used for allocation failure
// and for unknown errors.
const (
	RSMISuccess            RSMIResult = 0
	RSMIInvalidArgs        RSMIResult = 1
	RSMINotSupported       RSMIResult = 2
	RSMIFileError          RSMIResult = 3
	RSMIPermission         RSMIResult = 4
	RSMIOutOfResource      RSMIResult = 5
	RSMIInternalException  RSMIResult = 6
	RSMIInputOutOfBounds   RSMIResult = 7
	RSMIInitError          RSMIResult = 8
	RSMIInitalizationError RSMIResult = RSMIInitError
	RSMINotYetImplemented  RSMIResult = 9
	RSMINotFound           RSMIResult = 10
	RSMIInsufficientSize   RSMIResult = 11
	RSMIInterrupt          RSMIResult = 12
	RSMIUnexpectedSize     RSMIResult = 13
	RSMINoData             RSMIResult = 14
	RSMIUnexpectedData     RSMIResult = 15
	RSMIBusy               RSMIResult = 16
	RSMIRefcountOverflow   RSMIResult = 17
	RSMISettingUnavaliable RSMIResult = 18
	RSMIAmdGPURestartErr   RSMIResult = 19

	RSMIMallocError RSMIResult = 0xFFFFFFFE // malloc failed
	RSMIUnknowError RSMIResult = 0xFFFFFFFF
)

var (
	// PerfNameMap maps a numeric performance level to a display name.
	// NOTE(review): presumably keyed by the value returned from
	// RSMI_dev_perf_level_get — confirm against rsmi_dev_perf_level_t.
	PerfNameMap = map[int32]string{
		0:   "auto",
		1:   "low",
		2:   "high",
		3:   "manual",
		4:   "stable std",
		5:   "stable peak",
		6:   "stable min mclk",
		7:   "stable min sclk",
		8:   "determinism",
		256: "unknow",
	}

	// Sentinel errors, one per RSMIResult code (see RocmErrMap below).
	// ErrInitalizationError is the same error value as ErrInitError.
	ErrInvalidArgs        = errors.New("rocm invalid args")
	ErrNotSupported       = errors.New("rocm not supported")
	ErrFileError          = errors.New("rocm file error")
	ErrPermission         = errors.New("rocm permission error")
	ErrOutOfResource      = errors.New("rocm out of resource")
	ErrInternalException  = errors.New("rocm internal exception")
	ErrInputOutOfBounds   = errors.New("rocm input out of bounds ")
	ErrInitError          = errors.New("rocm init error")
	ErrInitalizationError = ErrInitError
	ErrNotYetImplemented  = errors.New("rocm not yet implemented")
	ErrNotFound           = errors.New("rocm not found")
	ErrInsufficientSize   = errors.New("rocm insufficient size")
	ErrInterrupt          = errors.New("rocm interrupt")
	ErrUnexpectedSize     = errors.New("rocm unexpected size")
	ErrNoData             = errors.New("rocm no data")
	ErrUnexpectedData     = errors.New("rocm unexpected data")
	ErrBusy               = errors.New("rocm busy")
	ErrRefcountOverflow   = errors.New("rocm refcount overflow")
	ErrSettingUnavaliable = errors.New("rocm setting unavaliable")
	ErrAmdGPURestartErr   = errors.New("rocm amd gpu restart error")
	ErrMallocError        = errors.New("alloc momery error")
	ErrUnknowError        = errors.New("rocm unknow error")

	// RocmErrMap translates a status code into its sentinel error.
	// RSMISuccess maps to nil, so a successful call yields a nil error.
	RocmErrMap = map[RSMIResult]error{
		RSMISuccess:            nil,
		RSMIInvalidArgs:        ErrInvalidArgs,
		RSMINotSupported:       ErrNotSupported,
		RSMIFileError:          ErrFileError,
		RSMIPermission:         ErrPermission,
		RSMIOutOfResource:      ErrOutOfResource,
		RSMIInternalException:  ErrInternalException,
		RSMIInputOutOfBounds:   ErrInputOutOfBounds,
		RSMIInitError:          ErrInitError,
		RSMINotYetImplemented:  ErrNotYetImplemented,
		RSMINotFound:           ErrNotFound,
		RSMIInsufficientSize:   ErrInsufficientSize,
		RSMIInterrupt:          ErrInterrupt,
		RSMIUnexpectedSize:     ErrUnexpectedSize,
		RSMINoData:             ErrNoData,
		RSMIUnexpectedData:     ErrUnexpectedData,
		RSMIBusy:               ErrBusy,
		RSMIRefcountOverflow:   ErrRefcountOverflow,
		RSMISettingUnavaliable: ErrSettingUnavaliable,
		RSMIAmdGPURestartErr:   ErrAmdGPURestartErr,
		RSMIMallocError:        ErrMallocError,
		RSMIUnknowError:        ErrUnknowError,
	}
)

// ToRSMIResult converts a C rsmi_status_t into the corresponding Go error by
// looking it up in RocmErrMap. A status that has no entry in the map yields
// ErrUnknowError.
func ToRSMIResult(c C.rsmi_status_t) error {
	if err, known := RocmErrMap[RSMIResult(c)]; known {
		return err
	}
	return ErrUnknowError
}

// RSMIProcessInfo corresponds to the C struct rsmi_process_info_t.
type RSMIProcessInfo struct {
	Pid                   uint32 // Process ID
	ProcessAddressSpaceId uint32 // PASID: (Process Address Space ID)
	VarmUsage             uint64 // VRAM usage
	SdmaUsage             uint64 // SDMA usage in microseconds
	CuOccupancy           uint32 // Compute Unit usage in percent
	UsedGPUIndex          []uint32 // not filled in by FromC
}

// FromC copies the scalar fields of a C rsmi_process_info_t into pi.
// UsedGPUIndex is left as it was.
func (pi *RSMIProcessInfo) FromC(c C.rsmi_process_info_t) {
	pi.Pid, pi.ProcessAddressSpaceId = uint32(c.process_id), uint32(c.pasid)
	pi.VarmUsage, pi.SdmaUsage = uint64(c.vram_usage), uint64(c.sdma_usage)
	pi.CuOccupancy = uint32(c.cu_occupancy)
}

// RSMIProcessInfoV2 is the Go view of a C rsmi_process_info_v2_t value; its
// FromC method performs the conversion.
type RSMIProcessInfoV2 struct {
	Pid           uint32
	VramUsageSize uint64          // VRAM usage size in MiB
	VramUsageRate float32         // VRAM usage rate as a percentage
	UsedGPUs      int             // Used gpu number
	GPUUsage      map[int]float32 // GPU usage rate as a percentage
}

// FromC fills pi2 from a C rsmi_process_info_v2_t value.
//
// GPUUsage is replaced by a fresh map: for every position k of c.gpuIndex the
// entry keyed by that GPU index is set to c.gpuUsageRate[k].
//
// NOTE(review): the loop walks the whole gpuIndex array instead of only the
// first usedGpus entries. If unused slots hold zeros or uninitialized data,
// they are inserted into (or overwrite entries of) GPUUsage — confirm against
// the struct definition in rocm_smi.h.
func (pi2 *RSMIProcessInfoV2) FromC(c C.rsmi_process_info_v2_t) {
	pi2.Pid = uint32(c.processId)
	pi2.VramUsageSize = uint64(c.vramUsageSize)
	pi2.VramUsageRate = float32(c.vramUsageRate)
	pi2.UsedGPUs = int(c.usedGpus)
	pi2.GPUUsage = make(map[int]float32)
	for k, v := range c.gpuIndex {
		// v is the GPU index stored at slot k; the usage rate sits at the same slot.
		pi2.GPUUsage[int(v)] = float32(c.gpuUsageRate[k])
	}
}

// RSMI_init 初始化rsmi
func RSMI_init() error {
	return ToRSMIResult(C.rsmi_init(0))
}

// RSMI_shut_down 关闭rsmi
func RSMI_shut_down() error {
	return ToRSMIResult(C.rsmi_shut_down())
}

// RSMI_num_monitor_devices returns the number of devices reported by
// rsmi_num_monitor_devices together with the translated status.
func RSMI_num_monitor_devices() (uint32, error) {
	var count C.uint
	status := C.rsmi_num_monitor_devices(&count)
	return uint32(count), ToRSMIResult(status)
}

// RSMI_version_get returns the major, minor and patch numbers of the RSMI
// version currently in use, plus the translated status of rsmi_version_get.
// If the temporary C buffer cannot be allocated it returns ErrMallocError.
func RSMI_version_get() (uint32, uint32, uint32, error) {
	ver := (*C.rsmi_version_t)(C.malloc(C.sizeof_rsmi_version_t))
	if unsafe.Pointer(ver) == C.NULL {
		return 0, 0, 0, ErrMallocError
	}
	// The buffer lives in C memory, so it has to be released explicitly.
	defer C.free(unsafe.Pointer(ver))

	status := C.rsmi_version_get(ver)
	major, minor, patch := uint32(ver.major), uint32(ver.minor), uint32(ver.patch)
	return major, minor, patch, ToRSMIResult(status)
}

// RSMI_version_str_get returns the driver version of the current system, as
// written by rsmi_version_str_get for the RSMI_SW_COMP_DRIVER component.
//
// The C call writes into a fixed 128-byte buffer. The result is cut at the
// first NUL byte so callers do not receive the buffer's zero padding.
// The error is the translated status; the string is only meaningful when the
// error is nil.
func RSMI_version_str_get() (string, error) {
	const bufLen = 128
	buff := make([]byte, bufLen)
	res := C.rsmi_version_str_get(C.RSMI_SW_COMP_DRIVER, (*C.char)(unsafe.Pointer(&buff[0])), bufLen)
	// Find the C string terminator; if none is present the whole buffer is used.
	n := 0
	for n < len(buff) && buff[n] != 0 {
		n++
	}
	return string(buff[:n]), ToRSMIResult(res)
}

// RSMI_dev_vbios_version_get returns the VBIOS version of the device with the
// given index, as written by rsmi_dev_vbios_version_get.
//
// The C call writes into a fixed 128-byte buffer. The result is cut at the
// first NUL byte so callers do not receive the buffer's zero padding.
// The error is the translated status; the string is only meaningful when the
// error is nil.
func RSMI_dev_vbios_version_get(deviceIndex uint32) (string, error) {
	const bufLen = 128
	buff := make([]byte, bufLen)
	res := C.rsmi_dev_vbios_version_get(C.uint(deviceIndex), (*C.char)(unsafe.Pointer(&buff[0])), bufLen)
	// Find the C string terminator; if none is present the whole buffer is used.
	n := 0
	for n < len(buff) && buff[n] != 0 {
		n++
	}
	return string(buff[:n]), ToRSMIResult(res)
}

// RSMI_dev_name_get returns the device name for the given device index. Only
// the leading letters are returned, without digits, e.g. "BW".
//
// The C call writes into a fixed 128-byte buffer. The result is cut at the
// first NUL byte so callers do not receive the buffer's zero padding.
// The error is the translated status; the string is only meaningful when the
// error is nil.
func RSMI_dev_name_get(deviceIndex uint32) (string, error) {
	const bufLen = 128
	buff := make([]byte, bufLen)
	res := C.rsmi_dev_name_get(C.uint(deviceIndex), (*C.char)(unsafe.Pointer(&buff[0])), bufLen)
	// Find the C string terminator; if none is present the whole buffer is used.
	n := 0
	for n < len(buff) && buff[n] != 0 {
		n++
	}
	return string(buff[:n]), ToRSMIResult(res)
}

// RSMI_dev_id_get returns the device id reported by rsmi_dev_id_get for the
// given device index, together with the translated status.
func RSMI_dev_id_get(deviceIndex uint32) (uint16, error) {
	var devID C.ushort
	status := C.rsmi_dev_id_get(C.uint(deviceIndex), &devID)
	return uint16(devID), ToRSMIResult(status)
}

// RSMI_dev_sku_get returns the SKU number reported by rsmi_dev_sku_get for
// the given device index, together with the translated status.
func RSMI_dev_sku_get(deviceIndex uint32) (uint16, error) {
	var skuID C.ushort
	status := C.rsmi_dev_sku_get(C.uint(deviceIndex), &skuID)
	return uint16(skuID), ToRSMIResult(status)
}

// RSMI_dev_vendor_id_get returns the vendor id reported by
// rsmi_dev_vendor_id_get for the given device index, together with the
// translated status.
func RSMI_dev_vendor_id_get(deviceIndex uint32) (uint16, error) {
	var vendorID C.ushort
	status := C.rsmi_dev_vendor_id_get(C.uint(deviceIndex), &vendorID)
	return uint16(vendorID), ToRSMIResult(status)
}

// RSMI_dev_brand_get returns the brand string written by rsmi_dev_brand_get
// for the given device index.
//
// The C call writes into a fixed 128-byte buffer. The result is cut at the
// first NUL byte so callers do not receive the buffer's zero padding.
// The error is the translated status; the string is only meaningful when the
// error is nil.
func RSMI_dev_brand_get(deviceIndex uint32) (string, error) {
	const bufLen = 128
	buff := make([]byte, bufLen)
	res := C.rsmi_dev_brand_get(C.uint(deviceIndex), (*C.char)(unsafe.Pointer(&buff[0])), bufLen)
	// Find the C string terminator; if none is present the whole buffer is used.
	n := 0
	for n < len(buff) && buff[n] != 0 {
		n++
	}
	return string(buff[:n]), ToRSMIResult(res)
}

// RSMI_dev_serial_number_get returns the serial number string written by
// rsmi_dev_serial_number_get for the given device index.
//
// The C call writes into a fixed 128-byte buffer. The result is cut at the
// first NUL byte so callers do not receive the buffer's zero padding.
// The error is the translated status; the string is only meaningful when the
// error is nil.
func RSMI_dev_serial_number_get(deviceIndex uint32) (string, error) {
	const bufLen = 128
	buff := make([]byte, bufLen)
	res := C.rsmi_dev_serial_number_get(C.uint(deviceIndex), (*C.char)(unsafe.Pointer(&buff[0])), bufLen)
	// Find the C string terminator; if none is present the whole buffer is used.
	n := 0
	for n < len(buff) && buff[n] != 0 {
		n++
	}
	return string(buff[:n]), ToRSMIResult(res)
}

// RSMI_dev_subsystem_name_get returns the full device name for the given
// device index, as written by rsmi_dev_subsystem_name_get.
//
// The C call writes into a fixed 128-byte buffer. The result is cut at the
// first NUL byte so callers do not receive the buffer's zero padding.
// The error is the translated status; the string is only meaningful when the
// error is nil.
func RSMI_dev_subsystem_name_get(deviceIndex uint32) (string, error) {
	const bufLen = 128
	buff := make([]byte, bufLen)
	res := C.rsmi_dev_subsystem_name_get(C.uint(deviceIndex), (*C.char)(unsafe.Pointer(&buff[0])), bufLen)
	// Find the C string terminator; if none is present the whole buffer is used.
	n := 0
	for n < len(buff) && buff[n] != 0 {
		n++
	}
	return string(buff[:n]), ToRSMIResult(res)
}

// RSMI_dev_perf_level_get returns the performance level of the given device
// as a plain int32, together with the translated status of
// rsmi_dev_perf_level_get.
func RSMI_dev_perf_level_get(deviceIndex uint32) (int32, error) {
	var perf C.rsmi_dev_perf_level_t
	status := C.rsmi_dev_perf_level_get(C.uint(deviceIndex), &perf)
	return int32(perf), ToRSMIResult(status)
}

// RSMI_compute_process_info_get returns information about all processes that
// are using a GPU.
//
// A C buffer with room for 128 entries is handed to
// rsmi_compute_process_info_get; the entry count the call reports back
// determines how many entries are converted. On a non-success status the
// result is nil and the translated error is returned. If the buffer cannot be
// allocated, ErrMallocError is returned.
func RSMI_compute_process_info_get() ([]RSMIProcessInfo, error) {
	const capacity = 128
	procs := (*C.rsmi_process_info_t)(C.malloc(C.sizeof_rsmi_process_info_t * capacity))
	if unsafe.Pointer(procs) == C.NULL {
		return nil, ErrMallocError
	}
	// The buffer lives in C memory, so it has to be released explicitly.
	defer C.free(unsafe.Pointer(procs))

	// In: buffer capacity. Out: number of entries reported by the library.
	count := C.uint(capacity)
	status := C.rsmi_compute_process_info_get(procs, &count)
	if status != C.RSMI_STATUS_SUCCESS {
		return nil, ToRSMIResult(status)
	}

	infos := make([]RSMIProcessInfo, int(count))
	for i, entry := range unsafe.Slice(procs, int(count)) {
		infos[i].FromC(entry)
	}
	return infos, ToRSMIResult(status)
}

// RSMI_compute_process_info_by_pid_get_v2 returns detailed information about
// the process with the given pid.
//
// Note: not every version of the shared library supports this function, and
// calling it may crash the process.
//
// Returns:
//   - info: the converted process information, or nil on any failure.
//   - res: ErrMallocError if the C buffer cannot be allocated, the translated
//     rsmi status if the C call does not succeed, ErrUnknowError if a Go panic
//     is recovered, otherwise nil.
func RSMI_compute_process_info_by_pid_get_v2(pid uint32) (info *RSMIProcessInfoV2, res error) {
	ps2 := (*C.rsmi_process_info_v2_t)(C.malloc(C.sizeof_rsmi_process_info_v2_t))
	if unsafe.Pointer(ps2) == C.NULL {
		return nil, ErrMallocError
	}
	// The buffer lives in C memory, so it has to be released explicitly.
	defer C.free(unsafe.Pointer(ps2))
	// Registered after the free so that it runs first. recover only intercepts
	// Go panics, so this is a best-effort guard.
	defer func() {
		if r := recover(); r != nil {
			info = nil
			res = ErrUnknowError
		}
	}()

	status := C.rsmi_compute_process_info_by_pid_get_v2(C.uint(pid), ps2)
	// Check the status of the C call itself. The previous code tested the
	// named return value, which is always nil at this point, so failures were
	// never reported and an unfilled struct was decoded.
	if err := ToRSMIResult(status); err != nil {
		return nil, err
	}

	result := &RSMIProcessInfoV2{}
	result.FromC(*ps2)
	return result, nil
}

// RSMI_dev_fan_rpms_get returns the value reported by rsmi_dev_fan_rpms_get
// for the given device; the second argument of the C call is fixed at 0.
// On a non-success status it returns 0 and the translated error.
func RSMI_dev_fan_rpms_get(devIndex uint32) (int64, error) {
	var speed C.long
	status := C.rsmi_dev_fan_rpms_get(C.uint(devIndex), 0, &speed)
	if err := ToRSMIResult(status); err != nil {
		return 0, err
	}
	return int64(speed), nil
}

// RSMI_dev_temp_metric_get returns the current core temperature of the given
// device (RSMI_TEMP_TYPE_CORE, RSMI_TEMP_CURRENT). Divide the result by 1000
// to obtain degrees Celsius.
func RSMI_dev_temp_metric_get(devIndex uint32) (int64, error) {
	var reading C.long
	status := C.rsmi_dev_temp_metric_get(C.uint(devIndex), C.RSMI_TEMP_TYPE_CORE, C.RSMI_TEMP_CURRENT, &reading)
	return int64(reading), ToRSMIResult(status)
}

// RSMI_dev_power_ave_get returns the average power consumption of the given
// device in microwatts; the second argument of the C call is fixed at 0.
func RSMI_dev_power_ave_get(devIndex uint32) (uint64, error) {
	var avg C.ulong
	status := C.rsmi_dev_power_ave_get(C.uint(devIndex), 0, &avg)
	return uint64(avg), ToRSMIResult(status)
}

// RSMI_dev_power_cap_get returns the power cap of the given device in
// microwatts; the second argument of the C call is fixed at 0.
func RSMI_dev_power_cap_get(devIndex uint32) (uint64, error) {
	var limit C.ulong
	status := C.rsmi_dev_power_cap_get(C.uint(devIndex), 0, &limit)
	return uint64(limit), ToRSMIResult(status)
}

// RSMI_dev_pci_id_get returns the PCI id of the given device. For a device at
// 0000:49:00.0 the returned value is the 49:00 part.
func RSMI_dev_pci_id_get(devIndex uint32) (uint64, error) {
	var bdf C.ulong
	status := C.rsmi_dev_pci_id_get(C.uint(devIndex), &bdf)
	return uint64(bdf), ToRSMIResult(status)
}

// RSMI_dev_memory_total_get returns the value reported by
// rsmi_dev_memory_total_get for memory type RSMI_MEM_TYPE_VRAM on the given
// device, together with the translated status.
func RSMI_dev_memory_total_get(devIndex uint32) (uint64, error) {
	var total C.ulong
	status := C.rsmi_dev_memory_total_get(C.uint(devIndex), C.RSMI_MEM_TYPE_VRAM, &total)
	return uint64(total), ToRSMIResult(status)
}

// RSMI_dev_memory_usage_get returns the value reported by
// rsmi_dev_memory_usage_get for memory type RSMI_MEM_TYPE_VRAM on the given
// device, together with the translated status.
func RSMI_dev_memory_usage_get(devIndex uint32) (uint64, error) {
	var used C.ulong
	status := C.rsmi_dev_memory_usage_get(C.uint(devIndex), C.RSMI_MEM_TYPE_VRAM, &used)
	return uint64(used), ToRSMIResult(status)
}

// nvmlDeviceGetMigMode
// dmi/dmi_mig.h

// RSMI_dev_busy_percent_get returns the busy percentage of the given device,
// together with the translated status of rsmi_dev_busy_percent_get.
func RSMI_dev_busy_percent_get(devIndex uint32) (uint32, error) {
	var busy C.uint
	status := C.rsmi_dev_busy_percent_get(C.uint(devIndex), &busy)
	return uint32(busy), ToRSMIResult(status)
}

// RSMI_ecc_enable reports whether the ECC status of the given device is
// RSMI_RAS_ERR_STATE_ENABLED.
//
// It first reads a value via rsmi_dev_ecc_enabled_get, then passes that value,
// converted to rsmi_gpu_block_t, to rsmi_dev_ecc_status_get and compares the
// resulting state with RSMI_RAS_ERR_STATE_ENABLED. If the first call fails it
// returns false and the translated error; otherwise the error is the
// translated status of the second call.
func RSMI_ecc_enable(devIndex uint32) (bool, error) {
	var blocks C.ulong = 0
	res := C.rsmi_dev_ecc_enabled_get(C.uint(devIndex), (*C.ulong)(unsafe.Pointer(&blocks)))
	if ToRSMIResult(res) != nil {
		return false, ToRSMIResult(res)
	}
	// NOTE(review): this prints the raw value to stdout on every call; it looks
	// like leftover debug output — confirm whether it should stay.
	fmt.Printf("%X\n", blocks)
	// NOTE(review): blocks looks like a bitmask of enabled blocks; if more than
	// one bit can be set it may not be a valid single rsmi_gpu_block_t —
	// confirm against rocm_smi.h.
	ss := C.rsmi_gpu_block_t(blocks)
	var stat C.rsmi_ras_err_state_t = 0
	res = C.rsmi_dev_ecc_status_get(C.uint(devIndex), ss, (*C.rsmi_ras_err_state_t)(unsafe.Pointer(&stat)))
	return uint32(stat) == uint32(C.RSMI_RAS_ERR_STATE_ENABLED), ToRSMIResult(res)
}

// RSMI_compute_process_gpus_get returns the device indices reported by
// rsmi_compute_process_gpus_get for the process with the given pid.
//
// A 32-entry buffer is passed to the C call; the count it reports back
// determines how many indices are returned. On a non-success status the result
// is nil and the translated error is returned. With a zero count the result is
// an empty, non-nil slice.
func RSMI_compute_process_gpus_get(pid uint32) ([]uint32, error) {
	var (
		indices [32]C.uint
		count   C.uint = 32 // in: buffer capacity; out: number of indices reported
	)
	status := C.rsmi_compute_process_gpus_get(C.uint(pid), &indices[0], &count)
	if err := ToRSMIResult(status); err != nil {
		return nil, err
	}

	gpus := make([]uint32, int(count))
	for i := range gpus {
		gpus[i] = uint32(indices[i])
	}
	return gpus, nil
}