Commit 020bd8c6 authored by liming6's avatar liming6
Browse files

feature 添加更多smi相关的api

parent 66a86495
......@@ -2,6 +2,15 @@
# It is not intended for manual editing.
version = 4
[[package]]
name = "anyhow"
version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
[[package]]
name = "rust_hysmi"
version = "0.1.0"
dependencies = [
"anyhow",
]
......@@ -4,3 +4,8 @@ version = "0.1.0"
edition = "2024"
[dependencies]
anyhow = "1.0.102"
[target.x86_64-unknown-linux-musl]
linker = "musl-gcc"
/// Build script entry point: prints the cargo directives that select the
/// native SMI libraries to link and the directory they are searched in.
fn main() {
// Link dynamically against `rocm_smi64`.
println!("cargo:rustc-link-lib=dylib=rocm_smi64");
//println!("cargo:rustc-link-lib=dylib=rocm_smi64");
// Link dynamically against `amd_smi`.
println!("cargo:rustc-link-lib=dylib=amd_smi");
// Add /opt/hyhal/lib to the native library search path.
println!("cargo:rustc-link-search=native=/opt/hyhal/lib");
// For musl targets, additional link arguments may be required.
// `TARGET` is read from the environment; an unset variable yields an empty
// string, which simply skips the musl branch.
let target = std::env::var("TARGET").unwrap_or_default();
if target.contains("musl") {
// Pass an rpath pointing at /opt/hyhal/lib to the linker.
println!("cargo:rustc-link-arg=-Wl,-rpath,/opt/hyhal/lib");
}
}
\ No newline at end of file
use std::ffi::*;
use rust_hysmi::smi::rocm_smi;
// Example driving the raw `rocm_smi` bindings directly: initialize, print the
// library version and the device count, then shut down.
fn main() {
unsafe {
// Initialize the library with flags 0 and abort if the status is non-zero.
let stat = rocm_smi::rsmi_init(0);
assert_eq!(stat, 0);
// 256-byte zeroed buffer whose address seeds the `build` pointer below.
let mut build = vec![0u8; 256];
let mut version:rocm_smi::rsmi_version_t = rocm_smi::rsmi_version_t{
major: 0,
minor: 0,
patch: 0,
build: build.as_mut_ptr() as *const c_char,
};
let stat = rocm_smi::rsmi_version_get(&mut version);
assert_eq!(stat, 0);
// `version.build` is read as a NUL-terminated C string after the call.
println!("ROCM SMI Version: {}.{}.{}-{}", version.major, version.minor, version.patch, CStr::from_ptr(version.build).to_string_lossy());
let mut num_devices = 0;
// NOTE(review): the status returned by this call is discarded, so a failure
// would print the initial value 0.
rocm_smi::rsmi_num_monitor_devices(&mut num_devices);
println!("Number of devices: {}", num_devices);
// Release the library and abort if the status is non-zero.
let stat = rocm_smi::rsmi_shut_down();
assert_eq!(stat, 0);
}
use rust_hysmi::smi;
use anyhow::Result;
/// Example: prints the compute-unit count of every device through the safe
/// `ROCmSmi` wrapper.
///
/// # Errors
/// Returns the error from `ROCmSmi::new` or `get_cu_number` if either fails.
fn main() -> Result<()> {
    let smi = smi::ROCmSmi::new()?;
    // Run the query first and keep its `Result`, so the library is shut down
    // even when the query failed; only afterwards propagate the error.
    let cu_numbers = smi.get_cu_number();
    smi.shut_down();
    println!("cu num: {:?}", cu_numbers?);
    Ok(())
}
use std::{sync::atomic::{AtomicBool, AtomicU32, Ordering::SeqCst}};
use anyhow::{Result,anyhow};
use crate::smi::rocm_smi::rsmi_process_info_t;
pub mod rocm_smi;
/// Process-wide flag recording whether the ROCm SMI library has been initialized.
///
/// `ROCmSmi::new` flips it from `false` to `true`, `ROCmSmi::shut_down` flips
/// it back, and the query methods read it before calling into the library.
pub static ROCM_IS_INITED: AtomicBool = AtomicBool::new(false);
/// ROCm SMI (System Management Interface) wrapper.
///
/// Obtained from `ROCmSmi::new` and consumed by `ROCmSmi::shut_down`.
/// The derived `Default` builds a value with a device count of 0 without
/// calling into the library.
#[derive(Debug, Default)]
pub struct ROCmSmi {
/// Device count last reported by `rsmi_num_monitor_devices`; refreshed by
/// `get_device_num` and reset to 0 by `shut_down`.
pub device_num: AtomicU32,
}
#[derive(Debug, Default)]
pub struct DCUProcessInfo {
pub process_id: u32,
pub pasid: u32,
pub vram_usage: u64,
pub sdma_usage: u64,
pub cu_occupancy: u32,
pub used_dcu_index: Vec<u32>
}
impl DCUProcessInfo {
pub fn new(info: &rsmi_process_info_t) -> Self {
Self { process_id: info.process_id,
pasid: info.pasid,
vram_usage: info.vram_usage,
sdma_usage: info.sdma_usage,
cu_occupancy: info.cu_occupancy,
used_dcu_index: vec![]
}
}
}
impl ROCmSmi {
pub fn new() -> Result<Self, anyhow::Error> {
unsafe {
if let Ok(o) = ROCM_IS_INITED.compare_exchange(false, true,SeqCst, SeqCst) {
if !o {
let result = rocm_smi::rsmi_init(0);
if result != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow::anyhow!("rocm_smi::rsmi_init failed"));
}
}
}
let mut device_num = 0;
let result = rocm_smi::rsmi_num_monitor_devices(&mut device_num);
if result != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow::anyhow!("rocm_smi::rsmi_num_monitor_devices failed"));
}
if device_num == 0 {
rocm_smi::rsmi_shut_down();
return Err(anyhow::anyhow!("No ROCm devices found"));
}
return Ok(ROCmSmi {
device_num: AtomicU32::new(device_num),
});
}
}
pub fn shut_down(self) {
if let Ok(o) = ROCM_IS_INITED.compare_exchange(true, false, SeqCst, SeqCst) {
if o {
unsafe {
self.device_num.store(0, SeqCst);
rocm_smi::rsmi_shut_down();
}
}
}
}
pub fn get_device_num(&self) -> anyhow::Result<u32> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow::anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let mut device_num = 0;
let result = rocm_smi::rsmi_num_monitor_devices(&mut device_num);
if result != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow::anyhow!("rocm_smi::get_device_num failed"));
}
self.device_num.store(device_num, SeqCst);
return Ok(device_num);
}
}
/// 获取设备名称
pub fn get_device_name(&self) -> anyhow::Result<Vec<String>> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow::anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut device_names = Vec::with_capacity(num as usize);
let mut name = [0u8; 256];
for i in 0..num {
let result = rocm_smi::rsmi_dev_subsystem_name_get(i, name.as_mut_ptr() as *mut i8, 256);
if result != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("error get_device_name"));
}
device_names.push(String::from_utf8_lossy(&name).trim_end_matches('\0').to_string());
name.fill(0);
}
return Ok(device_names);
}
}
/// 获取总显存数,单位是字节
pub fn get_vmem_total(&self) -> Result<Vec<u64>>{
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow::anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut mems = Vec::with_capacity(num as usize);
let mut mem: u64 = 0;
for i in 0..num {
let r = rocm_smi::rsmi_dev_memory_total_get(i, 0,&mut mem);
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("error get_vmem_total"));
}
mems.push(mem);
}
return Ok(mems);
}
}
/// 获取已用的显存,单位是字节
pub fn get_vmem_used(&self) -> Result<Vec<u64>>{
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow::anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut mems = Vec::with_capacity(num as usize);
let mut mem: u64 = 0;
for i in 0..num {
let r = rocm_smi::rsmi_dev_memory_usage_get(i, 0,&mut mem);
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("error get_vmem_used"));
}
mems.push(mem);
}
return Ok(mems);
}
}
/// 获取DCU忙碌比
pub fn get_busy_percent(&self) -> Result<Vec<u32>> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut result = Vec::with_capacity(num as usize);
let mut busy:u32 = 0;
for i in 0..num {
let r = rocm_smi::rsmi_dev_busy_percent_get(i, &mut busy);
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("rocm_smi::rsmi_dev_power_ave_get failed"));
}
result.push(busy);
}
Ok(result)
}
}
/// 获取设备功耗墙,除以100000,的单位是瓦
pub fn get_power_ave(&self) -> Result<Vec<u64>> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut result = Vec::with_capacity(num as usize);
let mut pw:u64 = 0;
for i in 0..num {
let r = rocm_smi::rsmi_dev_power_cap_get(i, 0, &mut pw);
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("rocm_smi::rsmi_dev_power_ave_get failed"));
}
result.push(pw);
}
Ok(result)
}
}
/// 获取设备平均功率,除以100000,的单位是瓦
pub fn get_power_cap(&self) -> Result<Vec<u64>> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut result = Vec::with_capacity(num as usize);
let mut pw:u64 = 0;
for i in 0..num {
let r = rocm_smi::rsmi_dev_power_ave_get(i, 0, &mut pw);
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("rocm_smi::rsmi_dev_power_ave_get failed"));
}
result.push(pw);
}
Ok(result)
}
}
/// 获取vbios版本
pub fn get_vbios_version(&self) -> Result<Vec<String>> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut result = Vec::with_capacity(num as usize);
let mut buff = vec![0u8; 256];
for i in 0..num {
if rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS != rocm_smi::rsmi_dev_vbios_version_get(i, buff.as_mut_ptr() as *mut i8, 256) {
return Err(anyhow!("error get_vbios_version"));
}
result.push(String::from_utf8_lossy(&buff).trim_end_matches('\0').to_owned());
}
Ok(result)
}
}
/// 获取驱动版本
pub fn get_driver_version(&self) -> Result<String> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let mut buff = vec![0u8; 256];
if rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS != rocm_smi::rsmi_version_str_get(rocm_smi::rsmi_sw_component_t_RSMI_SW_COMP_DRIVER, buff.as_mut_ptr() as *mut i8, 256) {
return Err(anyhow!("error get_vbios_version"));
}
Ok(String::from_utf8_lossy(&buff).trim_end_matches('\0').to_owned())
}
}
/// 获取温度
pub fn get_device_temperature(&self) -> Result<Vec<i64>> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut temp = 0;
let mut result = Vec::with_capacity(num as usize);
for i in 0..num {
let r = rocm_smi::rsmi_dev_temp_metric_get(i, rocm_smi::rsmi_temperature_type_t_RSMI_TEMP_TYPE_CORE, rocm_smi::rsmi_temperature_metric_t_RSMI_TEMP_CURRENT, &mut temp);
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("error rsmi_dev_temp_metric_get"));
}
result.push(temp);
}
Ok(result)
}
}
/// 获取使用dcu的进程信息
pub fn get_use_dcu_process(&self) -> Result<Vec<DCUProcessInfo>> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let mut procs = vec![rsmi_process_info_t::default();128];
let mut num = 128;
let r = rocm_smi::rsmi_compute_process_info_get(procs.as_mut_ptr(), &mut num);
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("error rsmi_compute_process_info_get"));
}
procs.truncate(num as usize);
procs.iter_mut().for_each(|x| {
rocm_smi::rsmi_compute_process_info_by_pid_get(x.process_id, x);
});
Ok(procs.iter().map(|x| {
let mut num = self.device_num.load(SeqCst);
let mut dev = vec![0u32; num as usize];
let r = rocm_smi::rsmi_compute_process_gpus_get(x.process_id, dev.as_mut_ptr(), &mut num);
if r == rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
dev.truncate(num as usize);
}
let mut res = DCUProcessInfo::new(x);
res.used_dcu_index = dev;
res
}).collect())
}
}
/// 获取设备PCI总线ID
pub fn get_pci_bus(&self) -> Result<Vec<u64>> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut result = Vec::with_capacity(num as usize);
for i in 0..num {
let mut bdfid = 0;
let r = rocm_smi::rsmi_dev_pci_id_get(i, &mut bdfid);
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("error rsmi_dev_pci_id_get"));
}
result.push(bdfid);
}
return Ok(result);
}
}
/// 获取设备时钟频率
pub fn get_dev_clock(&self,clk: rocm_smi::rsmi_clk_type_t) -> Result<Vec<rocm_smi::rsmi_frequencies_t>> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut result = Vec::with_capacity(num as usize);
for i in 0..num {
let mut freq = rocm_smi::rsmi_frequencies_t{
current: 0,
frequency: [0; 33],
has_deep_sleep: false,
num_supported: 0,
};
let r = rocm_smi::rsmi_dev_gpu_clk_freq_get(i, clk, &mut freq);
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("error get_dev_clock"));
}
result.push(freq);
}
Ok(result)
}
}
/// 获取驱动状态,0:没有找到 1:正常 2:加载中 3:没有加载 4:未知
pub fn get_driver_status(&self) -> Result<u32> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let mut stat = 0;
let r = rocm_smi::rsmi_driver_status(&mut stat);
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("error get_driver_status"));
}
Ok(stat)
}
}
/// 获取序列号
pub fn get_device_serial_number(&self) -> Result<Vec<String>> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut result = Vec::with_capacity(num as usize);
let mut buff = vec![0u8; 128];
for i in 0..num {
let r = rocm_smi::rsmi_dev_serial_number_get(i, buff.as_mut_ptr() as *mut i8, 128 );
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("error get_device_num"));
}
result.push(String::from_utf8_lossy(&buff).trim_end_matches('\0').to_owned());
}
Ok(result)
}
}
/*
rocm_smi::rsmi_dev_pcie_slot_type_get(dv_ind, type_)
*/
pub fn get_cu_number(&self) -> Result<Vec<i32>> {
if !ROCM_IS_INITED.load(SeqCst) {
return Err(anyhow!("ROCm SMI is not initialized"));
}
unsafe {
let num = self.get_device_num()?;
let mut result = Vec::with_capacity(num as usize);
for i in 0..num {
let mut cu_num = 0;
let r = rocm_smi::rsmi_dev_cu_num_get(i, &mut cu_num);
if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS {
return Err(anyhow!("error get_cu_number"));
}
result.push(cu_num);
}
Ok(result)
}
}
}
#[cfg(test)]
mod test {
    use super::*;

    /// Smoke test against real hardware: runs every query of the wrapper once,
    /// prints what it returned, and shuts the library down at the end. Any
    /// failing query aborts the test with its error.
    #[test]
    fn test_rocm_smi() -> anyhow::Result<()> {
        let smi = ROCmSmi::new()?;

        let cached_count = smi.device_num.load(SeqCst);
        println!("ROCm devices found: {}", cached_count);

        let power_ave = smi.get_power_ave()?;
        println!("ROCm devices pw ave: {:?}", power_ave);

        let power_cap = smi.get_power_cap()?;
        println!("ROCm devices pw cap: {:?}", power_cap);

        let names = smi.get_device_name()?;
        println!("ROCm device names: {:?}", names);

        let busy = smi.get_busy_percent()?;
        println!("ROCm device busy: {:?}", busy);

        let vram_total = smi.get_vmem_total()?;
        println!("ROCm device vram total: {:?}", vram_total);

        let vram_used = smi.get_vmem_used()?;
        println!("ROCm device vram usage: {:?}", vram_used);

        let vbios = smi.get_vbios_version()?;
        println!("ROCm device vbios version: {:?}", vbios);

        let driver_version = smi.get_driver_version()?;
        println!("ROCm driver version: {:?}", driver_version);

        let temperatures = smi.get_device_temperature()?;
        println!("ROCm dcu temperature: {:?}", temperatures);

        let processes = smi.get_use_dcu_process()?;
        println!("ROCm dcu process: {:?}", processes);

        let pci_ids = smi.get_pci_bus()?;
        println!("ROCm dcu pci: {:?}", pci_ids);

        let sys_clock = smi.get_dev_clock(rocm_smi::rsmi_clk_type_t_RSMI_CLK_TYPE_SYS)?;
        println!("ROCm sock clock: {:?}", sys_clock);

        let driver_status = smi.get_driver_status()?;
        println!("ROCm driver status: {:?}", driver_status);

        let serials = smi.get_device_serial_number()?;
        println!("ROCm device serial number: {:?}", serials);

        let cu_numbers = smi.get_cu_number()?;
        println!("ROCm device cu number: {:?}", cu_numbers);

        smi.shut_down();
        Ok(())
    }
}
\ No newline at end of file
......@@ -1582,7 +1582,7 @@ const _: () = {
};
#[doc = " @brief This structure contains information specific to a process."]
#[repr(C)]
#[derive(Debug, Copy, Clone)]
#[derive(Debug, Copy, Clone, Default)]
pub struct rsmi_process_info_t {
#[doc = "!< Process ID"]
pub process_id: u32,
......@@ -2686,3 +2686,37 @@ unsafe extern "C" {
unsafe extern "C" {
/// Foreign declaration of `rsmi_dev_cu_num_get`: takes a device index and an
/// out-pointer to a C `int`, and returns an `rsmi_status_t`.
///
/// NOTE(review): presumably stores the compute-unit count of device `dv_ind`
/// in `*cu_cnt` — confirm against the library header.
pub fn rsmi_dev_cu_num_get(dv_ind: u32, cu_cnt: *mut ::std::os::raw::c_int) -> rsmi_status_t;
}
#[cfg(test)]
mod test {
    use super::*;
    use std::ffi::{c_char, CStr};

    /// Smoke test of the raw bindings against the real library: initialize,
    /// read the library version and the device count, then shut down. Every
    /// status code is asserted so a failing call cannot pass silently.
    #[test]
    fn test_rocm_smi() {
        // SAFETY: the library is initialized before any other call, and every
        // pointer handed to it refers to a live local of this function.
        unsafe {
            let stat = rsmi_init(0);
            assert_eq!(stat, 0);

            // Zeroed buffer whose address seeds the `build` pointer.
            let mut build = vec![0u8; 256];
            let mut version = rsmi_version_t {
                major: 0,
                minor: 0,
                patch: 0,
                build: build.as_mut_ptr() as *const c_char,
            };
            let stat = rsmi_version_get(&mut version);
            assert_eq!(stat, 0);
            // `CStr::from_ptr` requires a non-null pointer; fail loudly rather
            // than dereference a null `build`.
            assert!(!version.build.is_null(), "rsmi_version_get returned a null build string");
            println!("ROCM SMI Version: {}.{}.{}-{}", version.major, version.minor, version.patch, CStr::from_ptr(version.build).to_string_lossy());

            let mut num_devices = 0;
            // Previously the status was discarded, so a failed query printed 0.
            let stat = rsmi_num_monitor_devices(&mut num_devices);
            assert_eq!(stat, 0);
            println!("Number of devices: {}", num_devices);

            let stat = rsmi_shut_down();
            assert_eq!(stat, 0);
        }
    }
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment