use std::{sync::atomic::{AtomicBool, AtomicU32, Ordering::SeqCst}}; use anyhow::{Result,anyhow}; use crate::smi::rocm_smi::rsmi_process_info_t; pub mod rocm_smi; /// rocm库是否被初始化 pub static ROCM_IS_INITED: AtomicBool = AtomicBool::new(false); /// ROCm SMI (System Management Interface) wrapper #[derive(Debug, Default)] pub struct ROCmSmi { pub device_num: AtomicU32, } #[derive(Debug, Default)] pub struct DCUProcessInfo { pub process_id: u32, pub pasid: u32, pub vram_usage: u64, pub sdma_usage: u64, pub cu_occupancy: u32, pub used_dcu_index: Vec } impl DCUProcessInfo { pub fn new(info: &rsmi_process_info_t) -> Self { Self { process_id: info.process_id, pasid: info.pasid, vram_usage: info.vram_usage, sdma_usage: info.sdma_usage, cu_occupancy: info.cu_occupancy, used_dcu_index: vec![] } } } impl ROCmSmi { pub fn new() -> Result { unsafe { if let Ok(o) = ROCM_IS_INITED.compare_exchange(false, true,SeqCst, SeqCst) { if !o { let result = rocm_smi::rsmi_init(0); if result != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow::anyhow!("rocm_smi::rsmi_init failed")); } } } let mut device_num = 0; let result = rocm_smi::rsmi_num_monitor_devices(&mut device_num); if result != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow::anyhow!("rocm_smi::rsmi_num_monitor_devices failed")); } if device_num == 0 { rocm_smi::rsmi_shut_down(); return Err(anyhow::anyhow!("No ROCm devices found")); } return Ok(ROCmSmi { device_num: AtomicU32::new(device_num), }); } } pub fn shut_down(self) { if let Ok(o) = ROCM_IS_INITED.compare_exchange(true, false, SeqCst, SeqCst) { if o { unsafe { self.device_num.store(0, SeqCst); rocm_smi::rsmi_shut_down(); } } } } pub fn get_device_num(&self) -> anyhow::Result { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow::anyhow!("ROCm SMI is not initialized")); } unsafe { let mut device_num = 0; let result = rocm_smi::rsmi_num_monitor_devices(&mut device_num); if result != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow::anyhow!("rocm_smi::get_device_num failed")); } self.device_num.store(device_num, SeqCst); return Ok(device_num); } } /// 获取设备名称 pub fn get_device_name(&self) -> anyhow::Result> { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow::anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut device_names = Vec::with_capacity(num as usize); let mut name = [0u8; 256]; for i in 0..num { let result = rocm_smi::rsmi_dev_subsystem_name_get(i, name.as_mut_ptr() as *mut i8, 256); if result != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("error get_device_name")); } device_names.push(String::from_utf8_lossy(&name).trim_end_matches('\0').to_string()); name.fill(0); } return Ok(device_names); } } /// 获取总显存数,单位是字节 pub fn get_vmem_total(&self) -> Result>{ if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow::anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut mems = Vec::with_capacity(num as usize); let mut mem: u64 = 0; for i in 0..num { let r = rocm_smi::rsmi_dev_memory_total_get(i, 0,&mut mem); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("error get_vmem_total")); } mems.push(mem); } return Ok(mems); } } /// 获取已用的显存,单位是字节 pub fn get_vmem_used(&self) -> Result>{ if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow::anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut mems = Vec::with_capacity(num as usize); let mut mem: u64 = 0; for i in 0..num { let r = rocm_smi::rsmi_dev_memory_usage_get(i, 0,&mut mem); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("error get_vmem_used")); } mems.push(mem); } return Ok(mems); } } /// 获取DCU忙碌比 pub fn get_busy_percent(&self) -> Result> { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut result = Vec::with_capacity(num as usize); let mut busy:u32 = 0; for i in 0..num { let r = rocm_smi::rsmi_dev_busy_percent_get(i, &mut busy); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("rocm_smi::rsmi_dev_power_ave_get failed")); } result.push(busy); } Ok(result) } } /// 获取设备功耗墙,除以100000,的单位是瓦 pub fn get_power_ave(&self) -> Result> { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut result = Vec::with_capacity(num as usize); let mut pw:u64 = 0; for i in 0..num { let r = rocm_smi::rsmi_dev_power_cap_get(i, 0, &mut pw); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("rocm_smi::rsmi_dev_power_ave_get failed")); } result.push(pw); } Ok(result) } } /// 获取设备平均功率,除以100000,的单位是瓦 pub fn get_power_cap(&self) -> Result> { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut result = Vec::with_capacity(num as usize); let mut pw:u64 = 0; for i in 0..num { let r = rocm_smi::rsmi_dev_power_ave_get(i, 0, &mut pw); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("rocm_smi::rsmi_dev_power_ave_get failed")); } result.push(pw); } Ok(result) } } /// 获取vbios版本 pub fn get_vbios_version(&self) -> Result> { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut result = Vec::with_capacity(num as usize); let mut buff = vec![0u8; 256]; for i in 0..num { if rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS != rocm_smi::rsmi_dev_vbios_version_get(i, buff.as_mut_ptr() as *mut i8, 256) { return Err(anyhow!("error get_vbios_version")); } result.push(String::from_utf8_lossy(&buff).trim_end_matches('\0').to_owned()); } Ok(result) } } /// 获取驱动版本 pub fn get_driver_version(&self) -> Result { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let mut buff = vec![0u8; 256]; if rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS != rocm_smi::rsmi_version_str_get(rocm_smi::rsmi_sw_component_t_RSMI_SW_COMP_DRIVER, buff.as_mut_ptr() as *mut i8, 256) { return Err(anyhow!("error get_vbios_version")); } Ok(String::from_utf8_lossy(&buff).trim_end_matches('\0').to_owned()) } } /// 获取温度 pub fn get_device_temperature(&self) -> Result> { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut temp = 0; let mut result = Vec::with_capacity(num as usize); for i in 0..num { let r = rocm_smi::rsmi_dev_temp_metric_get(i, rocm_smi::rsmi_temperature_type_t_RSMI_TEMP_TYPE_CORE, rocm_smi::rsmi_temperature_metric_t_RSMI_TEMP_CURRENT, &mut temp); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("error rsmi_dev_temp_metric_get")); } result.push(temp); } Ok(result) } } /// 获取使用dcu的进程信息 pub fn get_use_dcu_process(&self) -> Result> { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let mut procs = vec![rsmi_process_info_t::default();128]; let mut num = 128; let r = rocm_smi::rsmi_compute_process_info_get(procs.as_mut_ptr(), &mut num); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("error rsmi_compute_process_info_get")); } procs.truncate(num as usize); procs.iter_mut().for_each(|x| { rocm_smi::rsmi_compute_process_info_by_pid_get(x.process_id, x); }); Ok(procs.iter().map(|x| { let mut num = self.device_num.load(SeqCst); let mut dev = vec![0u32; num as usize]; let r = rocm_smi::rsmi_compute_process_gpus_get(x.process_id, dev.as_mut_ptr(), &mut num); if r == rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { dev.truncate(num as usize); } let mut res = DCUProcessInfo::new(x); res.used_dcu_index = dev; res }).collect()) } } /// 获取设备PCI总线ID pub fn get_pci_bus(&self) -> Result> { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut result = Vec::with_capacity(num as usize); for i in 0..num { let mut bdfid = 0; let r = rocm_smi::rsmi_dev_pci_id_get(i, &mut bdfid); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("error rsmi_dev_pci_id_get")); } result.push(bdfid); } return Ok(result); } } /// 获取设备时钟频率 pub fn get_dev_clock(&self,clk: rocm_smi::rsmi_clk_type_t) -> Result> { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut result = Vec::with_capacity(num as usize); for i in 0..num { let mut freq = rocm_smi::rsmi_frequencies_t{ current: 0, frequency: [0; 33], has_deep_sleep: false, num_supported: 0, }; let r = rocm_smi::rsmi_dev_gpu_clk_freq_get(i, clk, &mut freq); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("error get_dev_clock")); } result.push(freq); } Ok(result) } } /// 获取驱动状态,0:没有找到 1:正常 2:加载中 3:没有加载 4:未知 pub fn get_driver_status(&self) -> Result { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let mut stat = 0; let r = rocm_smi::rsmi_driver_status(&mut stat); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("error get_driver_status")); } Ok(stat) } } /// 获取序列号 pub fn get_device_serial_number(&self) -> Result> { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut result = Vec::with_capacity(num as usize); let mut buff = vec![0u8; 128]; for i in 0..num { let r = rocm_smi::rsmi_dev_serial_number_get(i, buff.as_mut_ptr() as *mut i8, 128 ); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("error get_device_num")); } result.push(String::from_utf8_lossy(&buff).trim_end_matches('\0').to_owned()); } Ok(result) } } /* rocm_smi::rsmi_dev_pcie_slot_type_get(dv_ind, type_) */ pub fn get_cu_number(&self) -> Result> { if !ROCM_IS_INITED.load(SeqCst) { return Err(anyhow!("ROCm SMI is not initialized")); } unsafe { let num = self.get_device_num()?; let mut result = Vec::with_capacity(num as usize); for i in 0..num { let mut cu_num = 0; let r = rocm_smi::rsmi_dev_cu_num_get(i, &mut cu_num); if r != rocm_smi::rsmi_status_t_RSMI_STATUS_SUCCESS { return Err(anyhow!("error get_cu_number")); } result.push(cu_num); } Ok(result) } } } #[cfg(test)] mod test { use super::*; #[test] fn test_rocm_smi() -> anyhow::Result<()> { let smi = ROCmSmi::new()?; println!("ROCm devices found: {}", smi.device_num.load(SeqCst)); println!("ROCm devices pw ave: {:?}", smi.get_power_ave()?); println!("ROCm devices pw cap: {:?}", smi.get_power_cap()?); println!("ROCm device names: {:?}", smi.get_device_name()?); println!("ROCm device busy: {:?}", smi.get_busy_percent()?); println!("ROCm device vram total: {:?}", smi.get_vmem_total()?); println!("ROCm device vram usage: {:?}", smi.get_vmem_used()?); println!("ROCm device vbios version: {:?}", smi.get_vbios_version()?); println!("ROCm driver version: {:?}", smi.get_driver_version()?); println!("ROCm dcu temperature: {:?}", smi.get_device_temperature()?); println!("ROCm dcu process: {:?}", smi.get_use_dcu_process()?); println!("ROCm dcu pci: {:?}", smi.get_pci_bus()?); println!("ROCm sock clock: {:?}",smi.get_dev_clock(rocm_smi::rsmi_clk_type_t_RSMI_CLK_TYPE_SYS)?); println!("ROCm driver status: {:?}", smi.get_driver_status()?); println!("ROCm device serial number: {:?}", smi.get_device_serial_number()?); println!("ROCm device cu number: {:?}", smi.get_cu_number()?); smi.shut_down(); Ok(()) } }