Unverified Commit cc70f9c1 authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Benchmarks: Add Feature - Extend the device manager utility to support more functions. (#239)

**Description**
Rename the `nvidia_helper` utility to the `device_manager` module and add support for more functions:
```
device_manager.get_device_count()
device_manager.get_device_utilization(idx)
device_manager.get_device_temperature(idx)
device_manager.get_device_power_limit(idx)
device_manager.get_device_memory(idx)
device_manager.get_device_row_remapped_info(idx)
device_manager.get_device_ecc_error(idx)
```
parent 8a00c8a0
......@@ -6,7 +6,7 @@
import os
from superbench.common.utils import logger
from superbench.common.utils import nv_helper
from superbench.common.utils import device_manager as dm
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import GemmFlopsBenchmark
......@@ -64,7 +64,7 @@ def _preprocess(self):
True if _preprocess() succeed.
"""
# Reset kernels according to compute capability.
capability = nv_helper.get_device_compute_capability()
capability = dm.device_manager.get_device_compute_capability()
if capability not in self.__kernel_map:
# After preprocess() self._result.return_code can be generated
super()._preprocess()
......
......@@ -8,7 +8,7 @@
from superbench.common.utils.lazy_import import LazyImport
from superbench.common.utils.process import run_command
nv_helper = LazyImport('superbench.common.utils.nvidia_helper')
device_manager = LazyImport('superbench.common.utils.device_manager')
__all__ = [
'LazyImport',
......@@ -17,7 +17,7 @@
'get_sb_config',
'logger',
'network',
'nv_helper',
'device_manager',
'rotate_dir',
'run_command',
]
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Device Managerment Library Utility."""
import py3nvml.py3nvml as nvml
from superbench.common.utils import logger
from superbench.common.utils import process
class DeviceManager:
    """Device management module.

    Wraps the NVML bindings (``nvml``) for per-device queries and parses the
    output of ``nvidia-smi -q`` for row remapping information. Query methods
    log an error and return None (or a tuple of None) instead of raising when
    the underlying call fails.
    """
    def __init__(self):
        """Constructor.

        Initializes NVML and caches one device handle per device index.
        Errors raised by NVML during initialization are not caught here.
        """
        nvml.nvmlInit()
        self._device_count = self.get_device_count()
        self._device_handlers = [nvml.nvmlDeviceGetHandleByIndex(i) for i in range(self._device_count)]

    def get_device_count(self):
        """Get the count of devices.

        Return:
            count (int): count of device.
        """
        return nvml.nvmlDeviceGetCount()

    def get_device_compute_capability(self):
        """Get the compute capability of device.

        The first device (index 0) is used for the query.

        Return:
            cap (float): the compute capability of device, None means failed to get the data.
        """
        try:
            # Indexing inside the try so that "no device" (IndexError) is reported as None as well.
            cap = nvml.nvmlDeviceGetCudaComputeCapability(self._device_handlers[0])
        except Exception as err:
            logger.error('Get device compute capability failed: {}'.format(str(err)))
            return None
        return cap

    def get_device_utilization(self, idx):
        """Get the utilization of device.

        Args:
            idx (int): device index.

        Return:
            util (int): the utilization of device (the ``gpu`` field of the NVML utilization rates),
                None means failed to get the data.
        """
        try:
            util = nvml.nvmlDeviceGetUtilizationRates(self._device_handlers[idx])
        except Exception as err:
            logger.error('Get device utilization failed: {}'.format(str(err)))
            return None
        return util.gpu

    def get_device_temperature(self, idx):
        """Get the temperature of device, unit: celsius.

        Args:
            idx (int): device index.

        Return:
            temp (int): the temperature of device, None means failed to get the data.
        """
        try:
            temp = nvml.nvmlDeviceGetTemperature(self._device_handlers[idx], nvml.NVML_TEMPERATURE_GPU)
        except Exception as err:
            logger.error('Get device temperature failed: {}'.format(str(err)))
            return None
        return temp

    def get_device_power_limit(self, idx):
        """Get the power management limit of device, unit: watt.

        Args:
            idx (int): device index.

        Return:
            powerlimit (int): the power management limit of device, None means failed to get the data.
        """
        try:
            powerlimit = nvml.nvmlDeviceGetPowerManagementLimit(self._device_handlers[idx])
        except Exception as err:
            logger.error('Get device power limitation failed: {}'.format(str(err)))
            return None
        # The raw NVML value is scaled down by 1000 and truncated to a whole number.
        return int(powerlimit) // 1000

    def get_device_memory(self, idx):
        """Get the memory information of device, unit: byte.

        Args:
            idx (int): device index.

        Return:
            used (int): the used device memory, None means failed to get the data.
            total (int): the total device memory, None means failed to get the data.
        """
        try:
            mem = nvml.nvmlDeviceGetMemoryInfo(self._device_handlers[idx])
        except Exception as err:
            logger.error('Get device memory failed: {}'.format(str(err)))
            return None, None
        return mem.used, mem.total

    @staticmethod
    def _parse_row_remapped_info(stdout):
        """Extract the integer row remapping metrics from 'nvidia-smi -q' output.

        Only the text between the 'Remapped Rows' header and the next
        'Temperature' header is inspected. Lines without ':' and lines whose
        value is not an integer (e.g. 'Pending : No') are skipped.

        Args:
            stdout (str): the standard output of 'nvidia-smi -i idx -q'.

        Return:
            remapped_metrics (dict): metric name ('gpu_remap_' prefixed) to int value,
                None if the section could not be located.
        """
        begin = stdout.find('Remapped Rows')
        end = stdout.find('Temperature', begin)
        if begin == -1 or end == -1:
            return None

        remapped_metrics = dict()
        for line in stdout[begin:end].split('\n'):
            if ':' not in line:
                continue
            key_value = line.split(':')
            key = 'gpu_remap_' + key_value[0].lower().strip().replace(' ', '_')
            value = key_value[1].replace('bank(s)', '').strip()
            try:
                remapped_metrics[key] = int(value)
            except ValueError:
                # Non-numeric entries such as 'No' are not reported as metrics.
                continue
        return remapped_metrics

    def get_device_row_remapped_info(self, idx):
        """Get the row remapped information of device.

        The command 'nvidia-smi -i idx -q' contains the following output:
            Remapped Rows
                Correctable Error                 : 0
                Uncorrectable Error               : 0
                Pending                           : No
                Remapping Failure Occurred        : No
                Bank Remap Availability Histogram
                    Max                           : 640 bank(s)
                    High                          : 0 bank(s)
                    Partial                       : 0 bank(s)
                    Low                           : 0 bank(s)
                    None                          : 0 bank(s)
            Temperature
                GPU Current Temp                  : 36 C

        Args:
            idx (int): device index.

        Return:
            remapped_metrics (dict): the row remapped information, None means failed to get the data.
        """
        output = process.run_command('nvidia-smi -i {} -q'.format(idx))
        if output.returncode != 0:
            return None
        return self._parse_row_remapped_info(output.stdout)

    def get_device_ecc_error(self, idx):
        """Get the ecc error information of device.

        The volatile ECC counters are summed over every memory location.
        Locations for which NVML raises an NVMLError are skipped.

        Args:
            idx (int): device index.

        Return:
            corrected_ecc (int) : the count of single bit ecc error, None means failed to get the data.
            uncorrected_ecc (int): the count of double bit ecc error, None means failed to get the data.
        """
        error_types = (nvml.NVML_MEMORY_ERROR_TYPE_CORRECTED, nvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED)
        # totals[0] accumulates corrected errors, totals[1] uncorrected errors.
        totals = [0, 0]
        for location_idx in range(nvml.NVML_MEMORY_LOCATION_COUNT):
            for slot, error_type in enumerate(error_types):
                try:
                    totals[slot] += nvml.nvmlDeviceGetMemoryErrorCounter(
                        self._device_handlers[idx], error_type, nvml.NVML_VOLATILE_ECC, location_idx
                    )
                except nvml.NVMLError:
                    # Best effort: this counter is unavailable for the location, keep summing the rest.
                    continue
                except Exception as err:
                    logger.error('Get device ECC information failed: {}'.format(str(err)))
                    return None, None
        return totals[0], totals[1]
# Shared module-level instance. Creating it runs DeviceManager.__init__, which
# calls nvml.nvmlInit() and queries device handles when this module is imported.
device_manager = DeviceManager()
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Nvidia Utility."""
import py3nvml.py3nvml as nvml
def get_device_compute_capability():
    """Query the compute capability of the first device through NVML.

    NVML is initialized on every call before the device count is read.

    Return:
        cap (float): the compute capability of device 0, None means no device found.
    """
    nvml.nvmlInit()
    if nvml.nvmlDeviceGetCount() == 0:
        # Nothing to query when NVML reports no devices.
        return None
    first_device = nvml.nvmlDeviceGetHandleByIndex(0)
    return nvml.nvmlDeviceGetCudaComputeCapability(first_device)
......@@ -8,7 +8,7 @@
from pathlib import Path
from tests.helper import decorator
from superbench.common.utils import nv_helper
from superbench.common.utils import device_manager as dm
from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform, BenchmarkType
......@@ -42,7 +42,7 @@ def test_flops_performance_cuda(self):
)
ret = benchmark._preprocess()
if nv_helper.get_device_compute_capability() not in [7.0, 8.0]:
if dm.device_manager.get_device_compute_capability() not in [7.0, 8.0]:
assert (ret is False)
assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
else:
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for nvidia_helper module."""
import numbers
from unittest import mock
from tests.helper import decorator
from superbench.common.utils import device_manager as dm
@decorator.cuda_test
@mock.patch('superbench.common.utils.process.run_command')
def test_nvidia_helper_utils(mock_run_command):
    """Check the return types of the device manager queries and the row remapping parser.

    The NVML-backed getters are exercised against device 0; the nvidia-smi
    call used for row remapping is replaced by the patched run_command.
    """
    manager = dm.device_manager

    # Scalar queries must all yield numeric values.
    assert isinstance(manager.get_device_count(), numbers.Number)
    assert isinstance(manager.get_device_compute_capability(), numbers.Number)
    assert isinstance(manager.get_device_utilization(0), numbers.Number)
    assert isinstance(manager.get_device_temperature(0), numbers.Number)
    assert isinstance(manager.get_device_power_limit(0), numbers.Number)

    # Pair-valued queries must yield two numeric values.
    used_mem, total_mem = manager.get_device_memory(0)
    assert isinstance(used_mem, numbers.Number)
    assert isinstance(total_mem, numbers.Number)
    corrected_ecc, uncorrected_ecc = manager.get_device_ecc_error(0)
    assert isinstance(corrected_ecc, numbers.Number)
    assert isinstance(uncorrected_ecc, numbers.Number)

    # Feed a canned nvidia-smi report through the patched command runner.
    fake_result = mock_run_command.return_value
    fake_result.returncode = 0
    fake_result.stdout = """
Remapped Rows
Correctable Error : 0
Uncorrectable Error : 0
Pending : No
Remapping Failure Occurred : No
Bank Remap Availability Histogram
Max : 640 bank(s)
High : 0 bank(s)
Partial : 0 bank(s)
Low : 0 bank(s)
None : 0 bank(s)
Temperature
GPU Current Temp : 36 C
"""
    expected = {
        'gpu_remap_correctable_error': 0,
        'gpu_remap_uncorrectable_error': 0,
        'gpu_remap_max': 640,
        'gpu_remap_high': 0,
        'gpu_remap_partial': 0,
        'gpu_remap_low': 0,
        'gpu_remap_none': 0
    }
    assert manager.get_device_row_remapped_info(0) == expected
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment