Unverified commit 10380709, authored by guoshzhao, committed by GitHub
Browse files

Monitor - Collect realtime GPU power when benchmarking. (#507)

**Description**
Collect realtime GPU power when benchmarking.
parent 9f18dea3
......@@ -72,6 +72,22 @@ def get_device_temperature(self, idx):
temp = None
return temp
def get_device_power(self, idx):
    """Get the realtime power of device, unit: watt.

    Args:
        idx (int): device index.

    Return:
        power (int): the realtime power of device in whole watts (the
            fractional part is discarded), None means failed to get the data.
    """
    try:
        # NVML reports the current power draw in milliwatts.
        power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx])
    except Exception as err:
        logger.error('Get device power failed: {}'.format(str(err)))
        return None
    # Integer division converts milliwatts to whole watts without a
    # round trip through float.
    return int(power) // 1000
def get_device_power_limit(self, idx):
"""Get the power management limit of device, unit: watt.
......
......@@ -194,6 +194,7 @@ def __sample_gpu_metrics(self, record):
for i in range(device_count):
record.gpu_usage.append(dm.device_manager.get_device_utilization(i))
record.gpu_temperature.append(dm.device_manager.get_device_temperature(i))
record.gpu_power.append(dm.device_manager.get_device_power(i))
record.gpu_power_limit.append(dm.device_manager.get_device_power_limit(i))
mem_used, mem_total = dm.device_manager.get_device_memory(i)
record.gpu_mem_used.append(mem_used)
......
......@@ -14,6 +14,7 @@ class MonitorRecord:
"""Record class to save all monitoring data."""
reduce_ops = {
'gpu_temperature': ReduceType.MAX,
'gpu_power': ReduceType.MAX,
'gpu_power_limit': ReduceType.MIN,
'gpu_corrected_ecc': ReduceType.LAST,
'gpu_uncorrected_ecc': ReduceType.LAST,
......@@ -28,6 +29,7 @@ def __init__(self):
self.__mem_total = None
self.__gpu_usage = list()
self.__gpu_temperature = list()
self.__gpu_power = list()
self.__gpu_power_limit = list()
self.__gpu_mem_used = list()
self.__gpu_mem_total = list()
......@@ -112,6 +114,20 @@ def gpu_temperature(self, gpu_temperature):
"""
self.__gpu_temperature = gpu_temperature
@property
def gpu_power(self):
    """Return the stored list of realtime GPU power readings."""
    return self.__gpu_power

@gpu_power.setter
def gpu_power(self, gpu_power):
    """Replace the stored realtime GPU power readings, unit: Watt.

    Args:
        gpu_power (list): realtime power readings, one entry per GPU.
    """
    self.__gpu_power = gpu_power
@property
def gpu_power_limit(self):
"""Decoration function to access __gpu_power_limit."""
......
......@@ -387,8 +387,9 @@ def __merge_monitor_metrics(self, node_path):
metrics_dict[metric].append(value)
for metric, values in metrics_dict.items():
prefix = metric.split(':')[0]
for pattern, reduce_type in MonitorRecord.reduce_ops.items():
if pattern in metric:
if pattern == prefix:
reduce_func = Reducer.get_reduce_func(reduce_type)
metric_name = 'monitor/{}'.format(metric)
metrics_summary[metric_name] = reduce_func(values)
......
......@@ -44,8 +44,8 @@ def test_monitor(self):
monitor._Monitor__sample_gpu_metrics(record)
gpu_list_metrics = [
record.gpu_usage, record.gpu_temperature, record.gpu_power_limit, record.gpu_mem_used, record.gpu_mem_total,
record.gpu_corrected_ecc, record.gpu_uncorrected_ecc
record.gpu_usage, record.gpu_temperature, record.gpu_power, record.gpu_power_limit, record.gpu_mem_used,
record.gpu_mem_total, record.gpu_corrected_ecc, record.gpu_uncorrected_ecc
]
for metric in gpu_list_metrics:
assert (metric)
......
......@@ -17,6 +17,7 @@ def test_monitor_record():
mr.mem_total = 1024
mr.gpu_usage = [90, 80, 86, 72, 79, 81, 94, 85]
mr.gpu_temperature = [62, 75, 69, 63, 72, 77, 80, 71]
mr.gpu_power = [257, 290, 280, 262, 291, 284, 281, 273]
mr.gpu_power_limit = [400, 400, 400, 350, 400, 400, 400, 400]
mr.gpu_mem_used = [2550, 2680, 2543, 2588, 2612, 2603, 2515, 2593]
mr.gpu_mem_total = [16777216, 16777216, 16777216, 16777216, 16777216, 16777216, 16777216, 16777216]
......@@ -59,6 +60,14 @@ def test_monitor_record():
'gpu_temperature:5': 77,
'gpu_temperature:6': 80,
'gpu_temperature:7': 71,
'gpu_power:0': 257,
'gpu_power:1': 290,
'gpu_power:2': 280,
'gpu_power:3': 262,
'gpu_power:4': 291,
'gpu_power:5': 284,
'gpu_power:6': 281,
'gpu_power:7': 273,
'gpu_power_limit:0': 400,
'gpu_power_limit:1': 400,
'gpu_power_limit:2': 400,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.