Unverified Commit c635f755 authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Monitor - Upgrade pyrsmi to amdsmi python library. (#601)

**Description**
Upgrade to the amdsmi Python library, since pyrsmi is going to be retired,
as suggested by AMD:

AMD SMI Python Library:
https://github.com/ROCm/amdsmi/tree/develop/py-interface
pyrsmi: https://github.com/RadeonOpenCompute/pyrsmi
parent 6e50f022
...@@ -144,6 +144,10 @@ RUN cd /opt/ && \ ...@@ -144,6 +144,10 @@ RUN cd /opt/ && \
.. && \ .. && \
make -j${NUM_MAKE_JOBS} make -j${NUM_MAKE_JOBS}
# Install AMD SMI Python Library
RUN cd /opt/rocm/share/amd_smi && \
python3 -m pip install --user .
ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics': if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics':
import py3nvml.py3nvml as nvml import py3nvml.py3nvml as nvml
elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics': elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics':
from pyrsmi import rocml import amdsmi as rocml
class DeviceManager: class DeviceManager:
...@@ -150,7 +150,7 @@ def get_device_compute_capability(self): ...@@ -150,7 +150,7 @@ def get_device_compute_capability(self):
try: try:
cap = nvml.nvmlDeviceGetCudaComputeCapability(self._device_handlers[0]) cap = nvml.nvmlDeviceGetCudaComputeCapability(self._device_handlers[0])
except Exception as err: except Exception as err:
logger.error('Get device compute capability failed: {}'.format(str(err))) logger.warning('Get device compute capability failed: {}'.format(str(err)))
return None return None
return cap return cap
...@@ -166,7 +166,7 @@ def get_device_utilization(self, idx): ...@@ -166,7 +166,7 @@ def get_device_utilization(self, idx):
try: try:
util = nvml.nvmlDeviceGetUtilizationRates(self._device_handlers[idx]) util = nvml.nvmlDeviceGetUtilizationRates(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device utilization failed: {}'.format(str(err))) logger.warning('Get device utilization failed: {}'.format(str(err)))
return None return None
return util.gpu return util.gpu
...@@ -182,7 +182,7 @@ def get_device_temperature(self, idx): ...@@ -182,7 +182,7 @@ def get_device_temperature(self, idx):
try: try:
temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU) temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU)
except Exception as err: except Exception as err:
logger.error('Get device temperature failed: {}'.format(str(err))) logger.warning('Get device temperature failed: {}'.format(str(err)))
temp = None temp = None
return temp return temp
...@@ -198,7 +198,7 @@ def get_device_power(self, idx): ...@@ -198,7 +198,7 @@ def get_device_power(self, idx):
try: try:
power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx]) power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device power failed: {}'.format(str(err))) logger.warning('Get device power failed: {}'.format(str(err)))
return None return None
return int(int(power) / 1000) return int(int(power) / 1000)
...@@ -214,7 +214,7 @@ def get_device_power_limit(self, idx): ...@@ -214,7 +214,7 @@ def get_device_power_limit(self, idx):
try: try:
powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx]) powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device power limitation failed: {}'.format(str(err))) logger.warning('Get device power limitation failed: {}'.format(str(err)))
return None return None
return int(int(powerlimit) / 1000) return int(int(powerlimit) / 1000)
...@@ -231,7 +231,7 @@ def get_device_memory(self, idx): ...@@ -231,7 +231,7 @@ def get_device_memory(self, idx):
try: try:
mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx]) mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx])
except Exception as err: except Exception as err:
logger.error('Get device memory failed: {}'.format(str(err))) logger.warning('Get device memory failed: {}'.format(str(err)))
return None, None return None, None
return mem.used, mem.total return mem.used, mem.total
...@@ -304,7 +304,7 @@ def get_device_ecc_error(self, idx): ...@@ -304,7 +304,7 @@ def get_device_ecc_error(self, idx):
except nvml.NVMLError: except nvml.NVMLError:
pass pass
except Exception as err: except Exception as err:
logger.error('Get device ECC information failed: {}'.format(str(err))) logger.warning('Get device ECC information failed: {}'.format(str(err)))
return None, None return None, None
try: try:
...@@ -316,7 +316,7 @@ def get_device_ecc_error(self, idx): ...@@ -316,7 +316,7 @@ def get_device_ecc_error(self, idx):
except nvml.NVMLError: except nvml.NVMLError:
pass pass
except Exception as err: except Exception as err:
logger.error('Get device ECC information failed: {}'.format(str(err))) logger.warning('Get device ECC information failed: {}'.format(str(err)))
return None, None return None, None
return corrected_ecc, uncorrected_ecc return corrected_ecc, uncorrected_ecc
...@@ -326,12 +326,13 @@ class AmdDeviceManager(DeviceManager): ...@@ -326,12 +326,13 @@ class AmdDeviceManager(DeviceManager):
"""Device management module for AMD.""" """Device management module for AMD."""
def __init__(self):
    """Initialize the AMD SMI library and cache the processor handles.

    The handles returned by the library are stored in ``self._device_handlers``
    and are later indexed by device id in the query methods of this class.
    """
    rocml.amdsmi_init()
    # NOTE(review): get_device_count() reads self._device_handlers, so the handles are
    # assigned before the base constructor runs -- presumably the base constructor
    # queries the device count; confirm against DeviceManager.__init__.
    handles = rocml.amdsmi_get_processor_handles()
    self._device_handlers = handles
    super().__init__()
def __del__(self):
    """Destructor: shut down the AMD SMI library.

    A failing shutdown (for example when initialization never succeeded) is logged
    as a warning instead of escaping from the destructor, where Python would only
    print an "Exception ignored in" traceback.
    """
    try:
        rocml.amdsmi_shut_down()
    except Exception as err:
        logger.warning('Shut down AMD SMI library failed: {}'.format(str(err)))
def get_device_count(self):
    """Get the number of device.

    Return:
        count (int): count of device, i.e. the number of cached processor handles.
    """
    handlers = self._device_handlers
    return len(handlers)
def get_device_utilization(self, idx):
    """Get the utilization of device.

    Args:
        idx (int): device index, used to select the cached processor handle.

    Return:
        util (int): the utilization of device, None means failed to get the data.
    """
    try:
        engine_usage = rocml.amdsmi_get_gpu_activity(self._device_handlers[idx])
        # Read the field inside the try block so that a missing key is reported as a
        # failed read (None) instead of an uncaught KeyError.
        util = engine_usage['gfx_activity']
    except Exception as err:
        logger.warning('Get device utilization failed: {}'.format(str(err)))
        return None
    return util
def get_device_temperature(self, idx):
    """Get the temperature of device, unit: celsius.

    Args:
        idx (int): device index, used to select the cached processor handle.

    Return:
        temp (int): the temperature of device, None means failed to get the data.
    """
    # Bind the result up front: the library/parameter exception handler below does not
    # assign it, so without this the final return would raise UnboundLocalError.
    temp = None
    try:
        temp = rocml.amdsmi_get_temp_metric(
            self._device_handlers[idx], rocml.AmdSmiTemperatureType.EDGE, rocml.AmdSmiTemperatureMetric.CURRENT
        )
    except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException):
        # Kept silent on purpose, as in the original handler; the caller receives None.
        pass
    except Exception as err:
        logger.warning('Get device temperature failed: {}'.format(str(err)))
    return temp
def get_device_power(self, idx):
    """Get the realtime power of device, unit: watt.

    Args:
        idx (int): device index, used to select the cached processor handle.

    Return:
        power (int): the realtime power of device, None means failed to get the data.
    """
    try:
        power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
        # Convert inside the try block: a missing or non-numeric field then yields None
        # instead of an uncaught KeyError/ValueError.
        # NOTE(review): some amdsmi releases reportedly return 'N/A' for unsupported
        # metrics -- confirm against the installed library version.
        power = int(power_measure['average_socket_power'])
    except Exception as err:
        logger.warning('Get device power failed: {}'.format(str(err)))
        return None
    return power
def get_device_power_limit(self, idx):
    """Get the power management limit of device, unit: watt.

    Args:
        idx (int): device index, used to select the cached processor handle.

    Return:
        powerlimit (int): the power management limit of device, None means failed to get the data.
    """
    try:
        power_measure = rocml.amdsmi_get_power_info(self._device_handlers[idx])
        # Convert inside the try block: a missing or non-numeric field then yields None
        # instead of an uncaught KeyError/ValueError.
        powerlimit = int(power_measure['power_limit'])
    except Exception as err:
        logger.warning('Get device power limit failed: {}'.format(str(err)))
        return None
    return powerlimit
def get_device_memory(self, idx):
    """Get the memory information of device, unit: byte.

    Args:
        idx (int): device index, used to select the cached processor handle.

    Return:
        used (int): the used device memory in bytes, None means failed to get the data.
        total (int): the total device memory in bytes, None means failed to get the data.
    """
    try:
        handle = self._device_handlers[idx]
        vram = rocml.AmdSmiMemoryType.VRAM
        used = rocml.amdsmi_get_gpu_memory_usage(handle, vram)
        total = rocml.amdsmi_get_gpu_memory_total(handle, vram)
    except Exception as err:
        logger.warning('Get device memory failed: {}'.format(str(err)))
        return None, None
    return used, total
def get_device_ecc_error(self, idx):
    """Get the ECC error counts of device, summed over all GPU blocks.

    Args:
        idx (int): device index, used to select the cached processor handle.

    Return:
        corrected_ecc (int) : the count of single bit ecc error.
        uncorrected_ecc (int): the count of double bit ecc error.
    """
    corrected_total, uncorrected_total = 0, 0
    for gpu_block in rocml.AmdSmiGpuBlock:
        try:
            counts = rocml.amdsmi_get_gpu_ecc_count(self._device_handlers[idx], gpu_block)
            corrected_total = corrected_total + counts['correctable_count']
            uncorrected_total = uncorrected_total + counts['uncorrectable_count']
        except (rocml.AmdSmiLibraryException, rocml.AmdSmiParameterException):
            # Library/parameter errors for a block are skipped silently; the block
            # simply contributes nothing to the totals.
            continue
        except Exception as err:
            logger.info('Get device ECC information failed: {}'.format(str(err)))
    return corrected_total, uncorrected_total
device_manager: Optional[DeviceManager] = DeviceManager() device_manager: Optional[DeviceManager] = DeviceManager()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment