Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
10380709
Unverified
Commit
10380709
authored
Apr 07, 2023
by
guoshzhao
Committed by
GitHub
Apr 07, 2023
Browse files
Monitor - Collect realtime GPU power when benchmarking. (#507)
**Description** Collect realtime GPU power when benchmarking.
parent
9f18dea3
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
46 additions
and
3 deletions
+46
-3
superbench/common/utils/device_manager.py
superbench/common/utils/device_manager.py
+16
-0
superbench/monitor/monitor.py
superbench/monitor/monitor.py
+1
-0
superbench/monitor/record.py
superbench/monitor/record.py
+16
-0
superbench/runner/runner.py
superbench/runner/runner.py
+2
-1
tests/monitor/test_monitor.py
tests/monitor/test_monitor.py
+2
-2
tests/monitor/test_monitor_record.py
tests/monitor/test_monitor_record.py
+9
-0
No files found.
superbench/common/utils/device_manager.py
View file @
10380709
...
...
@@ -72,6 +72,22 @@ def get_device_temperature(self, idx):
temp
=
None
return
temp
def get_device_power(self, idx):
    """Get the realtime power of device, unit: watt.

    Args:
        idx (int): device index.

    Return:
        power (int): the realtime power of device in whole watts,
            None means failed to get the data.
    """
    try:
        power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx])
    except Exception as err:
        # Best effort: a failed query is logged and reported as None.
        logger.error('Get device power failed: {}'.format(str(err)))
        return None
    # The raw reading is divided by 1000 to yield watts (NVML documents this
    # value as milliwatts); integer division avoids a float round-trip.
    return int(power) // 1000
def
get_device_power_limit
(
self
,
idx
):
"""Get the power management limit of device, unit: watt.
...
...
superbench/monitor/monitor.py
View file @
10380709
...
...
@@ -194,6 +194,7 @@ def __sample_gpu_metrics(self, record):
for
i
in
range
(
device_count
):
record
.
gpu_usage
.
append
(
dm
.
device_manager
.
get_device_utilization
(
i
))
record
.
gpu_temperature
.
append
(
dm
.
device_manager
.
get_device_temperature
(
i
))
record
.
gpu_power
.
append
(
dm
.
device_manager
.
get_device_power
(
i
))
record
.
gpu_power_limit
.
append
(
dm
.
device_manager
.
get_device_power_limit
(
i
))
mem_used
,
mem_total
=
dm
.
device_manager
.
get_device_memory
(
i
)
record
.
gpu_mem_used
.
append
(
mem_used
)
...
...
superbench/monitor/record.py
View file @
10380709
...
...
@@ -14,6 +14,7 @@ class MonitorRecord:
"""Record class to save all monitoring data."""
reduce_ops
=
{
'gpu_temperature'
:
ReduceType
.
MAX
,
'gpu_power'
:
ReduceType
.
MAX
,
'gpu_power_limit'
:
ReduceType
.
MIN
,
'gpu_corrected_ecc'
:
ReduceType
.
LAST
,
'gpu_uncorrected_ecc'
:
ReduceType
.
LAST
,
...
...
@@ -28,6 +29,7 @@ def __init__(self):
self
.
__mem_total
=
None
self
.
__gpu_usage
=
list
()
self
.
__gpu_temperature
=
list
()
self
.
__gpu_power
=
list
()
self
.
__gpu_power_limit
=
list
()
self
.
__gpu_mem_used
=
list
()
self
.
__gpu_mem_total
=
list
()
...
...
@@ -112,6 +114,20 @@ def gpu_temperature(self, gpu_temperature):
"""
self
.
__gpu_temperature
=
gpu_temperature
@property
def gpu_power(self):
    """Property accessor that returns the value stored in __gpu_power."""
    return self.__gpu_power

@gpu_power.setter
def gpu_power(self, gpu_power):
    """Set the gpu realtime power, unit: Watt.

    Args:
        gpu_power(list): list of gpu realtime power.
    """
    self.__gpu_power = gpu_power
@
property
def
gpu_power_limit
(
self
):
"""Decoration function to access __gpu_power_limit."""
...
...
superbench/runner/runner.py
View file @
10380709
...
...
@@ -387,8 +387,9 @@ def __merge_monitor_metrics(self, node_path):
metrics_dict
[
metric
].
append
(
value
)
for
metric
,
values
in
metrics_dict
.
items
():
prefix
=
metric
.
split
(
':'
)[
0
]
for
pattern
,
reduce_type
in
MonitorRecord
.
reduce_ops
.
items
():
if
pattern
in
metric
:
if
pattern
==
prefix
:
reduce_func
=
Reducer
.
get_reduce_func
(
reduce_type
)
metric_name
=
'monitor/{}'
.
format
(
metric
)
metrics_summary
[
metric_name
]
=
reduce_func
(
values
)
...
...
tests/monitor/test_monitor.py
View file @
10380709
...
...
@@ -44,8 +44,8 @@ def test_monitor(self):
monitor
.
_Monitor__sample_gpu_metrics
(
record
)
gpu_list_metrics
=
[
record
.
gpu_usage
,
record
.
gpu_temperature
,
record
.
gpu_power_limit
,
record
.
gpu_mem_used
,
record
.
gpu_mem_total
,
record
.
gpu_corrected_ecc
,
record
.
gpu_uncorrected_ecc
record
.
gpu_usage
,
record
.
gpu_temperature
,
record
.
gpu_power
,
record
.
gpu_power_limit
,
record
.
gpu_mem_used
,
record
.
gpu_mem_total
,
record
.
gpu_corrected_ecc
,
record
.
gpu_uncorrected_ecc
]
for
metric
in
gpu_list_metrics
:
assert
(
metric
)
...
...
tests/monitor/test_monitor_record.py
View file @
10380709
...
...
@@ -17,6 +17,7 @@ def test_monitor_record():
mr
.
mem_total
=
1024
mr
.
gpu_usage
=
[
90
,
80
,
86
,
72
,
79
,
81
,
94
,
85
]
mr
.
gpu_temperature
=
[
62
,
75
,
69
,
63
,
72
,
77
,
80
,
71
]
mr
.
gpu_power
=
[
257
,
290
,
280
,
262
,
291
,
284
,
281
,
273
]
mr
.
gpu_power_limit
=
[
400
,
400
,
400
,
350
,
400
,
400
,
400
,
400
]
mr
.
gpu_mem_used
=
[
2550
,
2680
,
2543
,
2588
,
2612
,
2603
,
2515
,
2593
]
mr
.
gpu_mem_total
=
[
16777216
,
16777216
,
16777216
,
16777216
,
16777216
,
16777216
,
16777216
,
16777216
]
...
...
@@ -59,6 +60,14 @@ def test_monitor_record():
'gpu_temperature:5'
:
77
,
'gpu_temperature:6'
:
80
,
'gpu_temperature:7'
:
71
,
'gpu_power:0'
:
257
,
'gpu_power:1'
:
290
,
'gpu_power:2'
:
280
,
'gpu_power:3'
:
262
,
'gpu_power:4'
:
291
,
'gpu_power:5'
:
284
,
'gpu_power:6'
:
281
,
'gpu_power:7'
:
273
,
'gpu_power_limit:0'
:
400
,
'gpu_power_limit:1'
:
400
,
'gpu_power_limit:2'
:
400
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment