Unverified Commit 6e357fb9 authored by guoshzhao, committed by GitHub

Monitor: Integration - Integrate monitor into Superbench (#259)

**Description**
Integrate monitor into Superbench.

**Major Revision**
- Initialize, start, and stop the monitor in the SB executor.
- Parse the monitor data in the SB runner and merge it into the benchmark results.
- Specify a ReduceType for each monitor metric, such as MAX, MIN, and LAST.
- Add monitor configs to the config files.
parent afea9913
......@@ -56,6 +56,10 @@ Here is an overview of SuperBench configuration structure:
version: string
superbench:
enable: string | [ string ]
monitor:
enable: bool
sample_duration: int
sample_interval: int
var:
${var_name}: dict
benchmarks:
......@@ -69,6 +73,10 @@ superbench:
version: v0.3
superbench:
enable: benchmark_1
monitor:
enable: false
sample_duration: 10
sample_interval: 1
var:
var_1: value
benchmarks:
......@@ -98,6 +106,22 @@ If not specified, will use [`${benchmark_name}.enable`](#enable) in each benchma
* value from: benchmark names defined in `superbench.benchmarks`
* default value: `null`
### `superbench.monitor`
Enable the monitor to collect system metrics periodically. Currently only the CUDA platform is supported. There are three settings:
#### `enable`
Whether to enable the monitor module.
#### `sample_duration`
The time window, in seconds, over which averaged metrics such as CPU usage and NIC bandwidth are calculated.
#### `sample_interval`
Collect one sample every `sample_interval` seconds.
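For reference, here is a minimal sketch, assuming an OmegaConf-style configuration shaped like the YAML above (the values are placeholders), of how these settings can be read back; the SB executor reads `superbench.monitor` from the merged configuration in the same way:

```python
from omegaconf import OmegaConf

# Placeholder configuration mirroring the YAML structure documented above.
config = OmegaConf.create({
    'superbench': {
        'monitor': {'enable': True, 'sample_duration': 10, 'sample_interval': 1}
    }
})

monitor_config = config.superbench.monitor
print(monitor_config.enable, monitor_config.sample_duration, monitor_config.sample_interval)
```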
### `superbench.var`
User-defined variables to be used in the configuration.
......
......@@ -15,6 +15,7 @@ class ReduceType(Enum):
MAX = 'max'
MIN = 'min'
SUM = 'sum'
LAST = 'last'
class Reducer:
......@@ -52,8 +53,23 @@ def get_reduce_func(cls, reduce_type):
return None
@staticmethod
def last(array):
"""Get the last item from the input sequence.
Args:
array (List): The input sequence.
Return:
The last item of the input sequence.
"""
if not isinstance(array, list) or len(array) == 0:
raise ValueError('last() arg is not a list or is an empty sequence')
return array[-1]
Reducer.add_reduce_func(ReduceType.MAX)(max)
Reducer.add_reduce_func(ReduceType.MIN)(min)
Reducer.add_reduce_func(ReduceType.SUM)(sum)
Reducer.add_reduce_func(ReduceType.AVG)(mean)
Reducer.add_reduce_func(ReduceType.LAST)(Reducer.last)
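A quick usage sketch of the new `LAST` reducer next to an existing one; the sample values are made up:

```python
from superbench.benchmarks import ReduceType, Reducer

# Successive readings of a cumulative counter, e.g. gpu_corrected_ecc on one GPU.
samples = [0, 0, 12]
print(Reducer.get_reduce_func(ReduceType.LAST)(samples))  # 12, the most recent value
print(Reducer.get_reduce_func(ReduceType.MAX)(samples))   # 12
```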
......@@ -2,6 +2,10 @@
version: v0.3
superbench:
enable: null
monitor:
enable: false
sample_duration: 1
sample_interval: 10
var:
default_local_mode: &default_local_mode
enable: true
......
......@@ -2,6 +2,10 @@
version: v0.3
superbench:
enable: null
monitor:
enable: false
sample_duration: 1
sample_interval: 10
var:
default_local_mode: &default_local_mode
enable: true
......
......@@ -12,6 +12,7 @@
from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
from superbench.common.utils import SuperBenchLogger, logger, rotate_dir
from superbench.common.devices import GPU
from superbench.monitor import Monitor
class SuperBenchExecutor():
......@@ -32,6 +33,7 @@ def __init__(self, sb_config, sb_output_dir):
logger.debug('Executor writes to: %s.', str(self._output_path))
self.__validate_sb_config()
self._sb_monitor_config = self._sb_config.superbench.monitor
self._sb_benchmarks = self._sb_config.superbench.benchmarks
self._sb_enabled = self.__get_enabled_benchmarks()
logger.debug('Executor will execute: %s', self._sb_enabled)
......@@ -131,17 +133,28 @@ def __exec_benchmark(self, context, log_suffix):
logger.error('Executor failed in %s.', log_suffix)
return None
def __get_rank_id(self):
"""Get rank ID for current process.
Return:
int: Rank ID.
"""
for rank_env in ['PROC_RANK', 'LOCAL_RANK']:
if os.getenv(rank_env):
return int(os.getenv(rank_env))
return 0
def __get_benchmark_dir(self, benchmark_name):
"""Get output directory for benchmark's current rank.
Args:
benchmark_name (str): Benchmark name.
Return:
Path: output directory.
"""
benchmark_output_dir = self._output_path / 'benchmarks' / benchmark_name
for rank_env in ['PROC_RANK', 'LOCAL_RANK']:
if os.getenv(rank_env):
return benchmark_output_dir / 'rank{}'.format(os.getenv(rank_env))
return benchmark_output_dir / 'rank0'
return self._output_path / 'benchmarks' / benchmark_name / ('rank' + str(self.__get_rank_id()))
def __create_benchmark_dir(self, benchmark_name):
"""Create output directory for benchmark.
......@@ -166,6 +179,17 @@ def __write_benchmark_results(self, benchmark_name, benchmark_results):
with (self.__get_benchmark_dir(benchmark_name) / 'results.json').open(mode='w') as f:
json.dump(benchmark_results, f, indent=2)
def __get_monitor_path(self, benchmark_name):
"""Get the output file path for the monitor.
Args:
benchmark_name (str): Benchmark name.
Return:
str: monitor output file path.
"""
return str(self.__get_benchmark_dir(benchmark_name) / 'monitor.jsonl')
def exec(self):
"""Run the SuperBench benchmarks locally."""
for benchmark_name in self._sb_benchmarks:
......@@ -174,6 +198,18 @@ def exec(self):
benchmark_config = self._sb_benchmarks[benchmark_name]
benchmark_results = list()
self.__create_benchmark_dir(benchmark_name)
monitor = None
if self.__get_rank_id() == 0 and self._sb_monitor_config and self._sb_monitor_config.enable:
if self.__get_platform() == Platform.CUDA:
monitor = Monitor(
None, int(self._sb_monitor_config.sample_duration or 10),
int(self._sb_monitor_config.sample_interval or 1), self.__get_monitor_path(benchmark_name)
)
monitor.start()
else:
logger.warning('Monitor is not supported on the ROCm/CPU platform.')
for framework in benchmark_config.frameworks or [Framework.NONE.value]:
if benchmark_name.endswith('_models'):
for model in benchmark_config.models:
......@@ -199,4 +235,6 @@ def exec(self):
result = self.__exec_benchmark(context, log_suffix)
benchmark_results.append(result)
if monitor:
monitor.stop()
self.__write_benchmark_results(benchmark_name, benchmark_results)
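Taken on its own, the monitor lifecycle wired into `exec()` above reduces to the sketch below; the output path is a placeholder and the numbers mirror the executor's fallback values:

```python
from superbench.monitor import Monitor

# container_name=None monitors the current environment rather than a container;
# sample_duration=10 seconds, sample_interval=1 second.
monitor = Monitor(None, 10, 1, '/tmp/monitor.jsonl')
monitor.start()
# ... run the benchmark here ...
monitor.stop()
```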
......@@ -16,19 +16,19 @@
class Monitor(multiprocessing.Process):
"""The monitor class to collect system metrics periodically."""
def __init__(self, container_name, sample_duration, sample_freq, output_file):
def __init__(self, container_name, sample_duration, sample_interval, output_file):
"""Constructor.
Args:
container_name (str): name of the container to monitor; None means the current environment.
sample_duration (int): calculate the average metric over sample_duration seconds.
sample_freq (int): do sampling every sample_freq seconds.
sample_interval (int): do sampling every sample_interval seconds.
output_file (str): output file in jsonline format.
"""
multiprocessing.Process.__init__(self)
self.__container_name = container_name
self.__sample_duration = sample_duration
self.__sample_freq = sample_freq
self.__sample_interval = sample_interval
self.__output_file = output_file
self.__scheduler = sched.scheduler(time.time, time.sleep)
......@@ -120,7 +120,7 @@ def stop(self):
def __sample(self):
"""Method sampling system metrics."""
if self.__running.value == 1:
self.__scheduler.enter(self.__sample_freq, 1, self.__sample, ())
self.__scheduler.enter(self.__sample_interval, 1, self.__sample, ())
# Sampling
record = MonitorRecord()
self.__sample_host_metrics(record)
......
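The re-entering call above is what makes the sampling periodic. A standalone sketch of the same pattern with plain `sched`, using a countdown in place of the monitor's running flag and real metric collection:

```python
import sched
import time

sample_interval = 1  # seconds between samples
samples_left = 3     # stand-in for the monitor's running flag

def sample(scheduler):
    global samples_left
    if samples_left > 0:
        # Re-schedule this function before doing the work, exactly like __sample.
        scheduler.enter(sample_interval, 1, sample, (scheduler,))
        samples_left -= 1
        print('sampled at', time.time())

scheduler = sched.scheduler(time.time, time.sleep)
scheduler.enter(sample_interval, 1, sample, (scheduler,))
scheduler.run()  # returns once no further events are queued
```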
......@@ -7,9 +7,19 @@
import numbers
from datetime import datetime
from superbench.benchmarks import ReduceType
class MonitorRecord:
"""Record class to save all monitoring data."""
reduce_ops = {
'gpu_temperature': ReduceType.MAX,
'gpu_power_limit': ReduceType.MIN,
'gpu_corrected_ecc': ReduceType.LAST,
'gpu_uncorrected_ecc': ReduceType.LAST,
'gpu_remap': ReduceType.LAST,
}
def __init__(self):
"""Constructor."""
self.__time = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
......
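How the runner consumes this mapping (see `__merge_monitor_metrics` below): monitor metric names carry a per-device suffix such as `:0`, so the `reduce_ops` keys are matched as substrings of the full metric name. A small sketch with made-up values:

```python
from superbench.benchmarks import Reducer
from superbench.monitor import MonitorRecord

metric = 'gpu_temperature:0'
values = [26, 27, 50]  # readings collected over one benchmark run
for pattern, reduce_type in MonitorRecord.reduce_ops.items():
    if pattern in metric:
        # gpu_temperature reduces with MAX, so this prints 50.
        print(Reducer.get_reduce_func(reduce_type)(values))
        break
```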
......@@ -9,6 +9,7 @@
from pprint import pformat
from collections import defaultdict
import jsonlines
from natsort import natsorted
from joblib import Parallel, delayed
from omegaconf import ListConfig, OmegaConf
......@@ -16,6 +17,7 @@
from superbench.common.utils import SuperBenchLogger, logger
from superbench.runner.ansible import AnsibleClient
from superbench.benchmarks import ReduceType, Reducer
from superbench.monitor import MonitorRecord
class SuperBenchRunner():
......@@ -255,13 +257,15 @@ def __create_single_node_summary(self, node_path): # pragma: no cover # noqa:
results_summary[benchmark_name][metric].append(result['result'][metric])
results_summary = self.__merge_all_metrics(results_summary, reduce_ops)
results_summary = self.__merge_benchmark_metrics(results_summary, reduce_ops)
monitor_summary = self.__merge_monitor_metrics(node_path)
results_summary = {**results_summary, **monitor_summary}
with (node_path / 'results-summary.json').open(mode='w') as f:
json.dump(results_summary, f, indent=2)
return results_summary
def __merge_all_metrics(self, results_summary, reduce_ops):
def __merge_benchmark_metrics(self, results_summary, reduce_ops):
"""Merge metrics of all benchmarks in one node.
Args:
......@@ -301,6 +305,42 @@ def __merge_all_metrics(self, results_summary, reduce_ops):
return metrics_summary
def __merge_monitor_metrics(self, node_path):
"""Merge and summarize monitor metrics of one node.
Args:
node_path (Path): The Path instance of node directory.
Returns:
dict: Flattened result with metric as key.
"""
metrics_summary = dict()
all_samples = list()
file_list = list(node_path.glob('**/monitor.jsonl'))
for results_file in file_list:
try:
with jsonlines.open(results_file) as reader:
all_samples.extend(reader)
except BaseException as e:
logger.error('Invalid Jsonline file: {}, error message: {}'.format(results_file, str(e)))
continue
all_samples = sorted(all_samples, key=lambda k: k.get('time', '0'))
metrics_dict = dict()
for sample in all_samples:
for metric, value in sample.items():
if metric not in metrics_dict:
metrics_dict[metric] = list()
metrics_dict[metric].append(value)
for metric, values in metrics_dict.items():
for pattern, reduce_type in MonitorRecord.reduce_ops.items():
if pattern in metric:
reduce_func = Reducer.get_reduce_func(reduce_type)
metrics_summary[metric] = reduce_func(values)
break
return metrics_summary
def _run_proc(self, benchmark_name, mode, vars):
"""Run the process.
......
{"time": "2021-12-06 04:19:35", "cpu_usage": 356.05527547060314, "gpu_usage:0": 56, "gpu_usage:1": 56, "gpu_usage:2": 51, "gpu_usage:3": 50, "gpu_usage:4": 50, "gpu_usage:5": 48, "gpu_usage:6": 50, "gpu_usage:7": 51, "gpu_temperature:0": 26, "gpu_temperature:1": 26, "gpu_temperature:2": 24, "gpu_temperature:3": 25, "gpu_temperature:4": 25, "gpu_temperature:5": 25, "gpu_temperature:6": 22, "gpu_temperature:7": 25, "gpu_power_limit:0": 250, "gpu_power_limit:1": 250, "gpu_power_limit:2": 250, "gpu_power_limit:3": 250, "gpu_power_limit:4": 250, "gpu_power_limit:5": 250, "gpu_power_limit:6": 250, "gpu_power_limit:7": 250, "gpu_mem_used:0": 267517952, "gpu_mem_used:1": 267517952, "gpu_mem_used:2": 267517952, "gpu_mem_used:3": 267517952, "gpu_mem_used:4": 267517952, "gpu_mem_used:5": 267517952, "gpu_mem_used:6": 267517952, "gpu_mem_used:7": 267517952, "gpu_mem_total:0": 17071734784, "gpu_mem_total:1": 17071734784, "gpu_mem_total:2": 17071734784, "gpu_mem_total:3": 17071734784, "gpu_mem_total:4": 17071734784, "gpu_mem_total:5": 17071734784, "gpu_mem_total:6": 17071734784, "gpu_mem_total:7": 17071734784, "gpu_corrected_ecc:0": 0, "gpu_corrected_ecc:1": 0, "gpu_corrected_ecc:2": 0, "gpu_corrected_ecc:3": 0, "gpu_corrected_ecc:4": 0, "gpu_corrected_ecc:5": 0, "gpu_corrected_ecc:6": 0, "gpu_corrected_ecc:7": 0, "gpu_uncorrected_ecc:0": 0, "gpu_uncorrected_ecc:1": 0, "gpu_uncorrected_ecc:2": 0, "gpu_uncorrected_ecc:3": 0, "gpu_uncorrected_ecc:4": 0, "gpu_uncorrected_ecc:5": 0, "gpu_uncorrected_ecc:6": 0, "gpu_uncorrected_ecc:7": 0, "ib0_receive_bw": 7.093990777192219e-05, "lo_receive_bw": 0.0007516388346318046, "veth867b62a_receive_bw": 0.0, "ib2_receive_bw": 0.0, "eth2_receive_bw": 7.990274558181559e-05, "eth1_receive_bw": 0.0, "docker0_receive_bw": 0.0, "ib1_receive_bw": 3.546995388596109e-05, "eth0_receive_bw": 3.680484462360479e-05, "ib0_transmit_bw": 0.0, "lo_transmit_bw": 0.0007516388346318046, "veth867b62a_transmit_bw": 0.0, "ib2_transmit_bw": 0.0, "eth2_transmit_bw": 7.761436146014067e-05, "eth1_transmit_bw": 0.0, "docker0_transmit_bw": 0.0, "ib1_transmit_bw": 3.1655980349836244e-05, "eth0_transmit_bw": 0.0}
{"time": "2021-12-06 04:19:46", "cpu_usage": 666.9910554561717, "gpu_usage:0": 56, "gpu_usage:1": 57, "gpu_usage:2": 54, "gpu_usage:3": 49, "gpu_usage:4": 47, "gpu_usage:5": 47, "gpu_usage:6": 51, "gpu_usage:7": 49, "gpu_temperature:0": 27, "gpu_temperature:1": 27, "gpu_temperature:2": 24, "gpu_temperature:3": 26, "gpu_temperature:4": 25, "gpu_temperature:5": 25, "gpu_temperature:6": 22, "gpu_temperature:7": 25, "gpu_power_limit:0": 250, "gpu_power_limit:1": 250, "gpu_power_limit:2": 250, "gpu_power_limit:3": 250, "gpu_power_limit:4": 250, "gpu_power_limit:5": 250, "gpu_power_limit:6": 250, "gpu_power_limit:7": 250, "gpu_mem_used:0": 267517952, "gpu_mem_used:1": 267517952, "gpu_mem_used:2": 267517952, "gpu_mem_used:3": 267517952, "gpu_mem_used:4": 267517952, "gpu_mem_used:5": 267517952, "gpu_mem_used:6": 267517952, "gpu_mem_used:7": 267517952, "gpu_mem_total:0": 17071734784, "gpu_mem_total:1": 17071734784, "gpu_mem_total:2": 17071734784, "gpu_mem_total:3": 17071734784, "gpu_mem_total:4": 17071734784, "gpu_mem_total:5": 17071734784, "gpu_mem_total:6": 17071734784, "gpu_mem_total:7": 17071734784, "gpu_corrected_ecc:0": 0, "gpu_corrected_ecc:1": 0, "gpu_corrected_ecc:2": 0, "gpu_corrected_ecc:3": 0, "gpu_corrected_ecc:4": 0, "gpu_corrected_ecc:5": 0, "gpu_corrected_ecc:6": 0, "gpu_corrected_ecc:7": 0, "gpu_uncorrected_ecc:0": 0, "gpu_uncorrected_ecc:1": 0, "gpu_uncorrected_ecc:2": 0, "gpu_uncorrected_ecc:3": 0, "gpu_uncorrected_ecc:4": 0, "gpu_uncorrected_ecc:5": 0, "gpu_uncorrected_ecc:6": 0, "gpu_uncorrected_ecc:7": 0, "ib0_receive_bw": 8.24993374331503e-05, "lo_receive_bw": 0.0007248700444905781, "veth867b62a_receive_bw": 0.0, "ib2_receive_bw": 0.0, "eth2_receive_bw": 7.125808822170487e-05, "eth1_receive_bw": 0.0, "docker0_receive_bw": 0.0, "ib1_receive_bw": 4.706082296994947e-05, "eth0_receive_bw": 1.752872758394879e-05, "ib0_transmit_bw": 0.0, "lo_transmit_bw": 0.0007248700444905781, "veth867b62a_transmit_bw": 0.0, "ib2_transmit_bw": 0.0, "eth2_transmit_bw": 6.0207368657911064e-05, "eth1_transmit_bw": 0.0, "docker0_transmit_bw": 0.0, "ib1_transmit_bw": 3.162792151016847e-05, "eth0_transmit_bw": 0.0}
{"time": "2021-12-06 04:19:57", "cpu_usage": 656.3735831515716, "gpu_usage:0": 55, "gpu_usage:1": 52, "gpu_usage:2": 49, "gpu_usage:3": 41, "gpu_usage:4": 21, "gpu_usage:5": 23, "gpu_usage:6": 0, "gpu_usage:7": 0, "gpu_temperature:0": 50, "gpu_temperature:1": 27, "gpu_temperature:2": 24, "gpu_temperature:3": 26, "gpu_temperature:4": 25, "gpu_temperature:5": 25, "gpu_temperature:6": 23, "gpu_temperature:7": 26, "gpu_power_limit:0": 250, "gpu_power_limit:1": 250, "gpu_power_limit:2": 250, "gpu_power_limit:3": 250, "gpu_power_limit:4": 250, "gpu_power_limit:5": 250, "gpu_power_limit:6": 250, "gpu_power_limit:7": 250, "gpu_mem_used:0": 267517952, "gpu_mem_used:1": 267517952, "gpu_mem_used:2": 267517952, "gpu_mem_used:3": 267517952, "gpu_mem_used:4": 267517952, "gpu_mem_used:5": 267517952, "gpu_mem_used:6": 267517952, "gpu_mem_used:7": 267517952, "gpu_mem_total:0": 17071734784, "gpu_mem_total:1": 17071734784, "gpu_mem_total:2": 17071734784, "gpu_mem_total:3": 17071734784, "gpu_mem_total:4": 17071734784, "gpu_mem_total:5": 17071734784, "gpu_mem_total:6": 17071734784, "gpu_mem_total:7": 17071734784, "gpu_corrected_ecc:0": 0, "gpu_corrected_ecc:1": 0, "gpu_corrected_ecc:2": 0, "gpu_corrected_ecc:3": 0, "gpu_corrected_ecc:4": 0, "gpu_corrected_ecc:5": 0, "gpu_corrected_ecc:6": 0, "gpu_corrected_ecc:7": 0, "gpu_uncorrected_ecc:0": 0, "gpu_uncorrected_ecc:1": 0, "gpu_uncorrected_ecc:2": 0, "gpu_uncorrected_ecc:3": 0, "gpu_uncorrected_ecc:4": 0, "gpu_uncorrected_ecc:5": 0, "gpu_uncorrected_ecc:6": 0, "gpu_uncorrected_ecc:7": 0, "ib0_receive_bw": 3.5438732219366325e-05, "lo_receive_bw": 0.0007402122294206353, "veth867b62a_receive_bw": 0.0, "ib2_receive_bw": 0.0, "eth2_receive_bw": 7.907028962923131e-05, "eth1_receive_bw": 0.0, "docker0_receive_bw": 0.0, "ib1_receive_bw": 3.5438732219366325e-05, "eth0_receive_bw": 7.73555122637781e-05, "ib0_transmit_bw": 0.0, "lo_transmit_bw": 0.0007402122294206353, "veth867b62a_transmit_bw": 0.0, "ib2_transmit_bw": 0.0, "eth2_transmit_bw": 8.097559781306822e-05, "eth1_transmit_bw": 0.0, "docker0_transmit_bw": 0.0, "ib1_transmit_bw": 0.0, "eth0_transmit_bw": 0.0}
{"time": "2021-12-06 04:20:08", "cpu_usage": 542.4075897252304, "gpu_usage:0": 30, "gpu_usage:1": 28, "gpu_usage:2": 26, "gpu_usage:3": 27, "gpu_usage:4": 24, "gpu_usage:5": 25, "gpu_usage:6": 27, "gpu_usage:7": 26, "gpu_temperature:0": 27, "gpu_temperature:1": 27, "gpu_temperature:2": 24, "gpu_temperature:3": 25, "gpu_temperature:4": 25, "gpu_temperature:5": 25, "gpu_temperature:6": 22, "gpu_temperature:7": 25, "gpu_power_limit:0": 250, "gpu_power_limit:1": 200, "gpu_power_limit:2": 250, "gpu_power_limit:3": 250, "gpu_power_limit:4": 250, "gpu_power_limit:5": 250, "gpu_power_limit:6": 250, "gpu_power_limit:7": 250, "gpu_mem_used:0": 267517952, "gpu_mem_used:1": 267517952, "gpu_mem_used:2": 267517952, "gpu_mem_used:3": 267517952, "gpu_mem_used:4": 267517952, "gpu_mem_used:5": 267517952, "gpu_mem_used:6": 267517952, "gpu_mem_used:7": 267517952, "gpu_mem_total:0": 17071734784, "gpu_mem_total:1": 17071734784, "gpu_mem_total:2": 17071734784, "gpu_mem_total:3": 17071734784, "gpu_mem_total:4": 17071734784, "gpu_mem_total:5": 17071734784, "gpu_mem_total:6": 17071734784, "gpu_mem_total:7": 17071734784, "gpu_corrected_ecc:0": 0, "gpu_corrected_ecc:1": 0, "gpu_corrected_ecc:2": 0, "gpu_corrected_ecc:3": 0, "gpu_corrected_ecc:4": 0, "gpu_corrected_ecc:5": 0, "gpu_corrected_ecc:6": 0, "gpu_corrected_ecc:7": 0, "gpu_uncorrected_ecc:0": 0, "gpu_uncorrected_ecc:1": 0, "gpu_uncorrected_ecc:2": 0, "gpu_uncorrected_ecc:3": 0, "gpu_uncorrected_ecc:4": 0, "gpu_uncorrected_ecc:5": 0, "gpu_uncorrected_ecc:6": 0, "gpu_uncorrected_ecc:7": 0, "ib0_receive_bw": 7.094350644144869e-05, "lo_receive_bw": 0.0007255499200443321, "veth867b62a_receive_bw": 0.0, "ib2_receive_bw": 0.0, "eth2_receive_bw": 6.255233901289024e-05, "eth1_receive_bw": 0.0, "docker0_receive_bw": 0.0, "ib1_receive_bw": 3.5471753220724344e-05, "eth0_receive_bw": 3.6806711675267735e-05, "ib0_transmit_bw": 0.0, "lo_transmit_bw": 0.0007255499200443321, "veth867b62a_transmit_bw": 0.0, "ib2_transmit_bw": 0.0, "eth2_transmit_bw": 6.026383880510157e-05, "eth1_transmit_bw": 0.0, "docker0_transmit_bw": 0.0, "ib1_transmit_bw": 3.165758620774323e-05, "eth0_transmit_bw": 0.0}
{"time": "2021-12-06 04:20:19", "cpu_usage": 565.2927013332376, "gpu_usage:0": 0, "gpu_usage:1": 0, "gpu_usage:2": 0, "gpu_usage:3": 0, "gpu_usage:4": 14, "gpu_usage:5": 13, "gpu_usage:6": 0, "gpu_usage:7": 0, "gpu_temperature:0": 27, "gpu_temperature:1": 27, "gpu_temperature:2": 24, "gpu_temperature:3": 25, "gpu_temperature:4": 25, "gpu_temperature:5": 25, "gpu_temperature:6": 22, "gpu_temperature:7": 25, "gpu_power_limit:0": 250, "gpu_power_limit:1": 250, "gpu_power_limit:2": 250, "gpu_power_limit:3": 250, "gpu_power_limit:4": 250, "gpu_power_limit:5": 250, "gpu_power_limit:6": 250, "gpu_power_limit:7": 250, "gpu_mem_used:0": 0, "gpu_mem_used:1": 0, "gpu_mem_used:2": 0, "gpu_mem_used:3": 0, "gpu_mem_used:4": 0, "gpu_mem_used:5": 0, "gpu_mem_used:6": 0, "gpu_mem_used:7": 0, "gpu_mem_total:0": 17071734784, "gpu_mem_total:1": 17071734784, "gpu_mem_total:2": 17071734784, "gpu_mem_total:3": 17071734784, "gpu_mem_total:4": 17071734784, "gpu_mem_total:5": 17071734784, "gpu_mem_total:6": 17071734784, "gpu_mem_total:7": 17071734784, "gpu_corrected_ecc:0": 12, "gpu_corrected_ecc:1": 0, "gpu_corrected_ecc:2": 0, "gpu_corrected_ecc:3": 0, "gpu_corrected_ecc:4": 0, "gpu_corrected_ecc:5": 0, "gpu_corrected_ecc:6": 0, "gpu_corrected_ecc:7": 0, "gpu_uncorrected_ecc:0": 0, "gpu_uncorrected_ecc:1": 0, "gpu_uncorrected_ecc:2": 0, "gpu_uncorrected_ecc:3": 0, "gpu_uncorrected_ecc:4": 0, "gpu_uncorrected_ecc:5": 0, "gpu_uncorrected_ecc:6": 0, "gpu_uncorrected_ecc:7": 0, "ib0_receive_bw": 7.094668432959388e-05, "lo_receive_bw": 0.0007254870623381052, "veth867b62a_receive_bw": 0.0, "ib2_receive_bw": 0.0, "eth2_receive_bw": 0.00018194391626460366, "eth1_receive_bw": 0.0, "docker0_receive_bw": 0.0, "ib1_receive_bw": 3.547334216479694e-05, "eth0_receive_bw": 0.0, "ib0_transmit_bw": 0.0, "lo_transmit_bw": 0.0007254870623381052, "veth867b62a_transmit_bw": 0.0, "ib2_transmit_bw": 0.0, "eth2_transmit_bw": 0.0006913487384268221, "eth1_transmit_bw": 0.0, "docker0_transmit_bw": 0.0, "ib1_transmit_bw": 3.165900429761447e-05, "eth0_transmit_bw": 0.0}
......@@ -185,8 +185,8 @@ def test_run_default_benchmarks(self, mock_ansible_client_run):
mock_ansible_client_run.return_value = 0
self.runner.run()
def test_merge_all_metrics(self):
"""Test __merge_all_metrics."""
def test_merge_benchmark_metrics(self):
"""Test __merge_benchmark_metrics."""
result_summary = json.loads(
'{"kernel-launch": {"overhead_event": [[0.00583], [0.00545], [0.00581], [0.00572], [0.00559], [0.00591], '
'[0.00562], [0.00586]], "overhead_wall": [[0.01018], [0.01039], [0.01067], [0.01079], [0.00978], '
......@@ -238,4 +238,43 @@ def test_merge_all_metrics(self):
'"pytorch-sharding-matmul/0/allreduce": 10.87, "pytorch-sharding-matmul/1/allreduce": 10.69, '
'"pytorch-sharding-matmul/0/allgather": 10.56, "pytorch-sharding-matmul/1/allgather": 10.16}'
)
self.assertEqual(self.runner._SuperBenchRunner__merge_all_metrics(result_summary, reduce_ops), expected)
self.assertEqual(self.runner._SuperBenchRunner__merge_benchmark_metrics(result_summary, reduce_ops), expected)
def test_merge_monitor_metrics(self):
"""Test __merge_monitor_metrics."""
path = Path('tests/data/monitor/')
expected = {
'gpu_temperature:0': 50,
'gpu_temperature:1': 27,
'gpu_temperature:2': 24,
'gpu_temperature:3': 26,
'gpu_temperature:4': 25,
'gpu_temperature:5': 25,
'gpu_temperature:6': 23,
'gpu_temperature:7': 26,
'gpu_power_limit:0': 250,
'gpu_power_limit:1': 200,
'gpu_power_limit:2': 250,
'gpu_power_limit:3': 250,
'gpu_power_limit:4': 250,
'gpu_power_limit:5': 250,
'gpu_power_limit:6': 250,
'gpu_power_limit:7': 250,
'gpu_corrected_ecc:0': 12,
'gpu_corrected_ecc:1': 0,
'gpu_corrected_ecc:2': 0,
'gpu_corrected_ecc:3': 0,
'gpu_corrected_ecc:4': 0,
'gpu_corrected_ecc:5': 0,
'gpu_corrected_ecc:6': 0,
'gpu_corrected_ecc:7': 0,
'gpu_uncorrected_ecc:0': 0,
'gpu_uncorrected_ecc:1': 0,
'gpu_uncorrected_ecc:2': 0,
'gpu_uncorrected_ecc:3': 0,
'gpu_uncorrected_ecc:4': 0,
'gpu_uncorrected_ecc:5': 0,
'gpu_uncorrected_ecc:6': 0,
'gpu_uncorrected_ecc:7': 0
}
self.assertEqual(self.runner._SuperBenchRunner__merge_monitor_metrics(path), expected)