Unverified Commit 028819b3 authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Monitor - Add support for AMD GPU. (#580)

**Description**
Add AMD support in monitor.

**Major Revision**
- Add library pyrsmi to collect metrics.
- Currently collects device_utilization, device_power, device_used_memory,
and device_total_memory.
parent 1ad1c21c
...@@ -183,7 +183,7 @@ def run(self): ...@@ -183,7 +183,7 @@ def run(self):
**x, **x,
'develop': x['dev'] + x['test'], 'develop': x['dev'] + x['test'],
'cpuworker': x['torch'], 'cpuworker': x['torch'],
'amdworker': x['torch'] + x['ort'], 'amdworker': x['torch'] + x['ort'] + x['amd'],
'nvworker': x['torch'] + x['ort'] + x['nvidia'], 'nvworker': x['torch'] + x['ort'] + x['nvidia'],
} }
)( )(
...@@ -217,6 +217,7 @@ def run(self): ...@@ -217,6 +217,7 @@ def run(self):
'onnxruntime-gpu; python_version>="3.10"', 'onnxruntime-gpu; python_version>="3.10"',
], ],
'nvidia': ['py3nvml>=0.2.6'], 'nvidia': ['py3nvml>=0.2.6'],
'amd': ['pyrsmi>=1.0.1'],
} }
), ),
include_package_data=True, include_package_data=True,
......
...@@ -3,24 +3,138 @@ ...@@ -3,24 +3,138 @@
"""Device Managerment Library Utility.""" """Device Managerment Library Utility."""
import py3nvml.py3nvml as nvml from typing import Optional
from superbench.common.utils import logger from superbench.common.utils import logger
from superbench.common.utils import process from superbench.common.utils import process
from superbench.common.devices import GPU
gpu = GPU()
if gpu.vendor == 'nvidia' or gpu.vendor == 'nvidia-graphics':
import py3nvml.py3nvml as nvml
elif gpu.vendor == 'amd' or gpu.vendor == 'amd-graphics':
from pyrsmi import rocml
class DeviceManager:
    """Vendor-neutral device management base module.

    Every query implemented here returns a placeholder (0, None, or a tuple of
    None), so this class can serve as a fallback manager. Vendor-specific
    subclasses override the queries they are able to answer.
    """
    def __init__(self):
        """Constructor.

        Stores the result of get_device_count() so that a subclass overriding
        that method gets its own count cached here.
        """
        count = self.get_device_count()
        self._device_count = count

    def get_device_count(self):
        """Report how many devices are available.

        Return:
            count (int): number of devices; the base implementation reports 0.
        """
        return 0

    def get_device_compute_capability(self):
        """Report the compute capability of the device.

        Return:
            cap (float): the compute capability, None means the data is unavailable.
        """
        return None

    def get_device_utilization(self, idx):
        """Report the utilization of one device.

        Args:
            idx (int): device index.

        Return:
            util (int): the device utilization, None means the data is unavailable.
        """
        return None

    def get_device_temperature(self, idx):
        """Report the temperature of one device, unit: celsius.

        Args:
            idx (int): device index.

        Return:
            temp (int): the device temperature, None means the data is unavailable.
        """
        return None

    def get_device_power(self, idx):
        """Report the realtime power draw of one device, unit: watt.

        Args:
            idx (int): device index.

        Return:
            power (int): the realtime power, None means the data is unavailable.
        """
        return None

    def get_device_power_limit(self, idx):
        """Report the power management limit of one device, unit: watt.

        Args:
            idx (int): device index.

        Return:
            limit (int): the power management limit, None means the data is unavailable.
        """
        return None

    def get_device_memory(self, idx):
        """Report the memory usage of one device, unit: byte.

        Args:
            idx (int): device index.

        Return:
            used (int): used device memory in bytes, None means the data is unavailable.
            total (int): total device memory in bytes, None means the data is unavailable.
        """
        return (None, None)

    def get_device_row_remapped_info(self, idx):
        """Report the row remapping information of one device.

        Args:
            idx (int): device index.

        Return:
            remapped_metrics (dict): the row remapping information, None means the data is unavailable.
        """
        return None

    def get_device_ecc_error(self, idx):
        """Report the ECC error counters of one device.

        Args:
            idx (int): device index.

        Return:
            corrected_ecc (int): count of single bit ecc errors, None means the data is unavailable.
            uncorrected_ecc (int): count of double bit ecc errors, None means the data is unavailable.
        """
        return (None, None)
class NvidiaDeviceManager(DeviceManager):
"""Device management module for Nvidia."""
def __init__(self):
"""Constructor."""
nvml.nvmlInit()
super().__init__()
self._device_handlers = list() self._device_handlers = list()
for i in range(self._device_count): for i in range(self._device_count):
self._device_handlers.append(nvml.nvmlDeviceGetHandleByIndex(i)) self._device_handlers.append(nvml.nvmlDeviceGetHandleByIndex(i))
def __del__(self):
"""Destructor."""
nvml.nvmlShutdown()
def get_device_count(self): def get_device_count(self):
"""Get the compute capability of device. """Get the number of device.
Return: Return:
count (int): count of device. count (int): count of device.
...@@ -79,7 +193,7 @@ def get_device_power(self, idx): ...@@ -79,7 +193,7 @@ def get_device_power(self, idx):
idx (int): device index. idx (int): device index.
Return: Return:
temp (float): the realtime power of device, None means failed to get the data. temp (int): the realtime power of device, None means failed to get the data.
""" """
try: try:
power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx]) power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx])
...@@ -95,7 +209,7 @@ def get_device_power_limit(self, idx): ...@@ -95,7 +209,7 @@ def get_device_power_limit(self, idx):
idx (int): device index. idx (int): device index.
Return: Return:
temp (float): the power management limit of device, None means failed to get the data. temp (int): the power management limit of device, None means failed to get the data.
""" """
try: try:
powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx]) powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx])
...@@ -111,8 +225,8 @@ def get_device_memory(self, idx): ...@@ -111,8 +225,8 @@ def get_device_memory(self, idx):
idx (int): device index. idx (int): device index.
Return: Return:
used (float): the used device memory, None means failed to get the data. used (int): the used device memory in bytes, None means failed to get the data.
total (float): the total device memory, None means failed to get the data. total (int): the total device memory in bytes, None means failed to get the data.
""" """
try: try:
mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx]) mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx])
...@@ -208,4 +322,115 @@ def get_device_ecc_error(self, idx): ...@@ -208,4 +322,115 @@ def get_device_ecc_error(self, idx):
return corrected_ecc, uncorrected_ecc return corrected_ecc, uncorrected_ecc
class AmdDeviceManager(DeviceManager):
    """Device management module for AMD.

    Metrics are read through the ``rocml`` module of pyrsmi. Only utilization,
    power and memory are collected; the remaining queries return None because
    no rocml API is used for them here.
    """
    def __init__(self):
        """Constructor."""
        # The SMI library must be initialized before the base constructor runs,
        # because the base constructor calls get_device_count().
        rocml.smi_initialize()
        super().__init__()

    def __del__(self):
        """Destructor: shut the SMI library down on a best-effort basis."""
        try:
            rocml.smi_shutdown()
        except Exception:
            # __del__ also runs when smi_initialize() failed in the constructor
            # and during interpreter teardown, when module globals may already
            # be gone. Nothing useful can be done with an error at that point,
            # so it is dropped instead of surfacing as "Exception ignored".
            pass

    @staticmethod
    def _is_failure(value):
        """Tell whether a rocml reading has to be treated as a failed query.

        NOTE(review): pyrsmi appears to signal a failed query with a negative
        number instead of raising - confirm against the installed version.
        Utilization, power and memory sizes are never legitimately negative,
        so rejecting such values is safe either way.

        Args:
            value: raw value returned by a rocml query.

        Return:
            failed (bool): True if the value is None or a negative number.
        """
        if value is None:
            return True
        return isinstance(value, (int, float)) and value < 0

    def get_device_count(self):
        """Get the number of device.

        Return:
            count (int): count of device.
        """
        return rocml.smi_get_device_count()

    def get_device_utilization(self, idx):
        """Get the utilization of device.

        Args:
            idx (int): device index.

        Return:
            util (int): the utilization of device, None means failed to get the data.
        """
        try:
            util = rocml.smi_get_device_utilization(idx)
        except Exception as err:
            logger.error('Get device utilization failed: {}'.format(str(err)))
            return None
        if self._is_failure(util):
            logger.error('Get device utilization failed: invalid value {}'.format(util))
            return None
        return util

    def get_device_temperature(self, idx):
        """Get the temperature of device, unit: celsius.

        Args:
            idx (int): device index.

        Return:
            temp (int): the temperature of device, None means failed to get the data.
        """
        # Currently no API provided in rocml.
        return None

    def get_device_power(self, idx):
        """Get the realtime power of device, unit: watt.

        Args:
            idx (int): device index.

        Return:
            power (int): the realtime power of device, None means failed to get the data.
        """
        try:
            power = rocml.smi_get_device_average_power(idx)
        except Exception as err:
            logger.error('Get device power failed: {}'.format(str(err)))
            return None
        if self._is_failure(power):
            logger.error('Get device power failed: invalid value {}'.format(power))
            return None
        # NOTE(review): the division by 1000 assumes rocml reports the power in
        # milliwatts - confirm against the pyrsmi version in use.
        return int(int(power) / 1000)

    def get_device_power_limit(self, idx):
        """Get the power management limit of device, unit: watt.

        Args:
            idx (int): device index.

        Return:
            limit (int): the power management limit of device, None means failed to get the data.
        """
        # Currently no API provided in rocml.
        return None

    def get_device_memory(self, idx):
        """Get the memory information of device, unit: byte.

        Args:
            idx (int): device index.

        Return:
            used (int): the used device memory in bytes, None means failed to get the data.
            total (int): the total device memory in bytes, None means failed to get the data.
        """
        try:
            mem_used = rocml.smi_get_device_memory_used(idx)
            mem_total = rocml.smi_get_device_memory_total(idx)
        except Exception as err:
            logger.error('Get device memory failed: {}'.format(str(err)))
            return None, None
        # Mirror the exception path: if either reading is unusable, report both
        # as missing so callers never see a half-valid pair.
        if self._is_failure(mem_used) or self._is_failure(mem_total):
            logger.error('Get device memory failed: invalid value {}/{}'.format(mem_used, mem_total))
            return None, None
        return mem_used, mem_total

    def get_device_ecc_error(self, idx):
        """Get the ecc error information of device.

        Args:
            idx (int): device index.

        Return:
            corrected_ecc (int) : the count of single bit ecc error, None means failed to get the data.
            uncorrected_ecc (int): the count of double bit ecc error, None means failed to get the data.
        """
        # Currently no API provided in rocml.
        return None, None
# Start from the vendor-neutral manager, whose queries only return
# placeholders, and replace it with a vendor-specific manager when the
# detected GPU vendor is a supported one.
device_manager: Optional[DeviceManager] = DeviceManager()
if gpu.vendor in ('nvidia', 'nvidia-graphics'):
    device_manager = NvidiaDeviceManager()
elif gpu.vendor in ('amd', 'amd-graphics'):
    device_manager = AmdDeviceManager()
...@@ -218,14 +218,14 @@ def exec(self): ...@@ -218,14 +218,14 @@ def exec(self):
monitor = None monitor = None
if self.__get_rank_id() == 0 and self._sb_monitor_config and self._sb_monitor_config.enable: if self.__get_rank_id() == 0 and self._sb_monitor_config and self._sb_monitor_config.enable:
if self.__get_platform() == Platform.CUDA: if self.__get_platform() is not Platform.CPU:
monitor = Monitor( monitor = Monitor(
None, int(self._sb_monitor_config.sample_duration or 10), None, int(self._sb_monitor_config.sample_duration or 10),
int(self._sb_monitor_config.sample_interval or 1), self.__get_monitor_path(benchmark_name) int(self._sb_monitor_config.sample_interval or 1), self.__get_monitor_path(benchmark_name)
) )
monitor.start() monitor.start()
else: else:
logger.warning('Monitor can not support ROCM/CPU platform.') logger.warning('Monitor can not support CPU platform.')
benchmark_real_name = benchmark_name.split(':')[0] benchmark_real_name = benchmark_name.split(':')[0]
for framework in benchmark_config.frameworks or [Framework.NONE.value]: for framework in benchmark_config.frameworks or [Framework.NONE.value]:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment