Unverified Commit 6e357fb9 authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Monitor: Integration - Integrate monitor into Superbench (#259)

**Description**
Integrate monitor into Superbench.

**Major Revision**
- Initialize, start and stop monitor in SB executor.
- Parse the monitor data in SB runner and merge into benchmark results.
- Specify ReduceType for monitor metrics, such as MAX, MIN and LAST.
- Add monitor configs into config file.
parent afea9913
...@@ -56,6 +56,10 @@ Here is an overview of SuperBench configuration structure: ...@@ -56,6 +56,10 @@ Here is an overview of SuperBench configuration structure:
version: string version: string
superbench: superbench:
enable: string | [ string ] enable: string | [ string ]
monitor:
enable: bool
sample_duration: int
sample_interval: int
var: var:
${var_name}: dict ${var_name}: dict
benchmarks: benchmarks:
...@@ -69,6 +73,10 @@ superbench: ...@@ -69,6 +73,10 @@ superbench:
version: v0.3 version: v0.3
superbench: superbench:
enable: benchmark_1 enable: benchmark_1
monitor:
enable: false
sample_duration: 10
sample_interval: 1
var: var:
var_1: value var_1: value
benchmarks: benchmarks:
...@@ -98,6 +106,22 @@ If not specified, will use [`${benchmark_name}.enable`](#enable) in each benchma ...@@ -98,6 +106,22 @@ If not specified, will use [`${benchmark_name}.enable`](#enable) in each benchma
* value from: benchmark names defined in `superbench.benchmarks` * value from: benchmark names defined in `superbench.benchmarks`
* default value: `null` * default value: `null`
### `superbench.monitor`
Enable the monitor to collect system metrics periodically; currently only the CUDA platform is supported. There are three settings:
#### `enable`
Whether to enable the monitor module or not.
#### `sample_duration`
Calculate the average of metrics sampled over a window of sample_duration seconds, such as CPU usage and NIC bandwidth.
#### `sample_interval`
Sample the system metrics every sample_interval seconds.
### `superbench.var` ### `superbench.var`
User-defined variables to be used in the configuration. User-defined variables to be used in the configuration.
......
...@@ -15,6 +15,7 @@ class ReduceType(Enum): ...@@ -15,6 +15,7 @@ class ReduceType(Enum):
MAX = 'max' MAX = 'max'
MIN = 'min' MIN = 'min'
SUM = 'sum' SUM = 'sum'
LAST = 'last'
class Reducer: class Reducer:
...@@ -52,8 +53,23 @@ def get_reduce_func(cls, reduce_type): ...@@ -52,8 +53,23 @@ def get_reduce_func(cls, reduce_type):
return None return None
@staticmethod
def last(array):
    """Get the last item from the input sequence.

    Args:
        array (List): The input sequence.

    Return:
        The last item of the input sequence.

    Raises:
        ValueError: If the input is not a list, or is an empty list.
    """
    # Report the two failure modes separately so the error message is accurate:
    # the original single message claimed 'empty sequence' even for non-list input.
    if not isinstance(array, list):
        raise ValueError('last() arg must be a list, got {}'.format(type(array).__name__))
    if len(array) == 0:
        raise ValueError('last() arg is an empty sequence')
    return array[-1]
Reducer.add_reduce_func(ReduceType.MAX)(max) Reducer.add_reduce_func(ReduceType.MAX)(max)
Reducer.add_reduce_func(ReduceType.MIN)(min) Reducer.add_reduce_func(ReduceType.MIN)(min)
Reducer.add_reduce_func(ReduceType.SUM)(sum) Reducer.add_reduce_func(ReduceType.SUM)(sum)
Reducer.add_reduce_func(ReduceType.AVG)(mean) Reducer.add_reduce_func(ReduceType.AVG)(mean)
Reducer.add_reduce_func(ReduceType.LAST)(Reducer.last)
...@@ -2,6 +2,10 @@ ...@@ -2,6 +2,10 @@
version: v0.3 version: v0.3
superbench: superbench:
enable: null enable: null
monitor:
enable: false
sample_duration: 10
sample_interval: 1
var: var:
default_local_mode: &default_local_mode default_local_mode: &default_local_mode
enable: true enable: true
......
...@@ -2,6 +2,10 @@ ...@@ -2,6 +2,10 @@
version: v0.3 version: v0.3
superbench: superbench:
enable: null enable: null
monitor:
enable: false
sample_duration: 10
sample_interval: 1
var: var:
default_local_mode: &default_local_mode default_local_mode: &default_local_mode
enable: true enable: true
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
from superbench.benchmarks import Platform, Framework, BenchmarkRegistry from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
from superbench.common.utils import SuperBenchLogger, logger, rotate_dir from superbench.common.utils import SuperBenchLogger, logger, rotate_dir
from superbench.common.devices import GPU from superbench.common.devices import GPU
from superbench.monitor import Monitor
class SuperBenchExecutor(): class SuperBenchExecutor():
...@@ -32,6 +33,7 @@ def __init__(self, sb_config, sb_output_dir): ...@@ -32,6 +33,7 @@ def __init__(self, sb_config, sb_output_dir):
logger.debug('Executor writes to: %s.', str(self._output_path)) logger.debug('Executor writes to: %s.', str(self._output_path))
self.__validate_sb_config() self.__validate_sb_config()
self._sb_monitor_config = self._sb_config.superbench.monitor
self._sb_benchmarks = self._sb_config.superbench.benchmarks self._sb_benchmarks = self._sb_config.superbench.benchmarks
self._sb_enabled = self.__get_enabled_benchmarks() self._sb_enabled = self.__get_enabled_benchmarks()
logger.debug('Executor will execute: %s', self._sb_enabled) logger.debug('Executor will execute: %s', self._sb_enabled)
...@@ -131,17 +133,28 @@ def __exec_benchmark(self, context, log_suffix): ...@@ -131,17 +133,28 @@ def __exec_benchmark(self, context, log_suffix):
logger.error('Executor failed in %s.', log_suffix) logger.error('Executor failed in %s.', log_suffix)
return None return None
def __get_rank_id(self):
    """Get rank ID for current process.

    Checks the PROC_RANK environment variable first, then LOCAL_RANK,
    and falls back to rank 0 when neither is set (or set to empty).

    Return:
        int: Rank ID.
    """
    rank = os.getenv('PROC_RANK') or os.getenv('LOCAL_RANK')
    return int(rank) if rank else 0
def __get_benchmark_dir(self, benchmark_name): def __get_benchmark_dir(self, benchmark_name):
"""Get output directory for benchmark's current rank. """Get output directory for benchmark's current rank.
Args: Args:
benchmark_name (str): Benchmark name. benchmark_name (str): Benchmark name.
Return:
Path: output directory.
""" """
benchmark_output_dir = self._output_path / 'benchmarks' / benchmark_name return self._output_path / 'benchmarks' / benchmark_name / ('rank' + str(self.__get_rank_id()))
for rank_env in ['PROC_RANK', 'LOCAL_RANK']:
if os.getenv(rank_env):
return benchmark_output_dir / 'rank{}'.format(os.getenv(rank_env))
return benchmark_output_dir / 'rank0'
def __create_benchmark_dir(self, benchmark_name): def __create_benchmark_dir(self, benchmark_name):
"""Create output directory for benchmark. """Create output directory for benchmark.
...@@ -166,6 +179,17 @@ def __write_benchmark_results(self, benchmark_name, benchmark_results): ...@@ -166,6 +179,17 @@ def __write_benchmark_results(self, benchmark_name, benchmark_results):
with (self.__get_benchmark_dir(benchmark_name) / 'results.json').open(mode='w') as f: with (self.__get_benchmark_dir(benchmark_name) / 'results.json').open(mode='w') as f:
json.dump(benchmark_results, f, indent=2) json.dump(benchmark_results, f, indent=2)
def __get_monitor_path(self, benchmark_name):
    """Get the output file path for the monitor.

    Args:
        benchmark_name (str): Benchmark name.

    Return:
        str: monitor output file path.
    """
    # monitor.jsonl lives inside the benchmark's per-rank output directory.
    monitor_file = self.__get_benchmark_dir(benchmark_name) / 'monitor.jsonl'
    return str(monitor_file)
def exec(self): def exec(self):
"""Run the SuperBench benchmarks locally.""" """Run the SuperBench benchmarks locally."""
for benchmark_name in self._sb_benchmarks: for benchmark_name in self._sb_benchmarks:
...@@ -174,6 +198,18 @@ def exec(self): ...@@ -174,6 +198,18 @@ def exec(self):
benchmark_config = self._sb_benchmarks[benchmark_name] benchmark_config = self._sb_benchmarks[benchmark_name]
benchmark_results = list() benchmark_results = list()
self.__create_benchmark_dir(benchmark_name) self.__create_benchmark_dir(benchmark_name)
monitor = None
if self.__get_rank_id() == 0 and self._sb_monitor_config and self._sb_monitor_config.enable:
if self.__get_platform() == Platform.CUDA:
monitor = Monitor(
None, int(self._sb_monitor_config.sample_duration or 10),
int(self._sb_monitor_config.sample_interval or 1), self.__get_monitor_path(benchmark_name)
)
monitor.start()
else:
logger.warning('Monitor can not support ROCM/CPU platform.')
for framework in benchmark_config.frameworks or [Framework.NONE.value]: for framework in benchmark_config.frameworks or [Framework.NONE.value]:
if benchmark_name.endswith('_models'): if benchmark_name.endswith('_models'):
for model in benchmark_config.models: for model in benchmark_config.models:
...@@ -199,4 +235,6 @@ def exec(self): ...@@ -199,4 +235,6 @@ def exec(self):
result = self.__exec_benchmark(context, log_suffix) result = self.__exec_benchmark(context, log_suffix)
benchmark_results.append(result) benchmark_results.append(result)
if monitor:
monitor.stop()
self.__write_benchmark_results(benchmark_name, benchmark_results) self.__write_benchmark_results(benchmark_name, benchmark_results)
...@@ -16,19 +16,19 @@ ...@@ -16,19 +16,19 @@
class Monitor(multiprocessing.Process): class Monitor(multiprocessing.Process):
"""The monitor class to collect system metrics periodically.""" """The monitor class to collect system metrics periodically."""
def __init__(self, container_name, sample_duration, sample_freq, output_file): def __init__(self, container_name, sample_duration, sample_interval, output_file):
"""Constructor. """Constructor.
Args: Args:
container_name (str): container name that need to monitor, None means the current env. container_name (str): container name that need to monitor, None means the current env.
sample_duration (int): calculate the average metirc during sample_duration seconds. sample_duration (int): calculate the average metirc during sample_duration seconds.
sample_freq (int): do sampling every sample_freq seconds. sample_interval (int): do sampling every sample_interval seconds.
output_file (str): output file in jsonline format. output_file (str): output file in jsonline format.
""" """
multiprocessing.Process.__init__(self) multiprocessing.Process.__init__(self)
self.__container_name = container_name self.__container_name = container_name
self.__sample_duration = sample_duration self.__sample_duration = sample_duration
self.__sample_freq = sample_freq self.__sample_interval = sample_interval
self.__output_file = output_file self.__output_file = output_file
self.__scheduler = sched.scheduler(time.time, time.sleep) self.__scheduler = sched.scheduler(time.time, time.sleep)
...@@ -120,7 +120,7 @@ def stop(self): ...@@ -120,7 +120,7 @@ def stop(self):
def __sample(self): def __sample(self):
"""Method sampling system metrics.""" """Method sampling system metrics."""
if self.__running.value == 1: if self.__running.value == 1:
self.__scheduler.enter(self.__sample_freq, 1, self.__sample, ()) self.__scheduler.enter(self.__sample_interval, 1, self.__sample, ())
# Sampling # Sampling
record = MonitorRecord() record = MonitorRecord()
self.__sample_host_metrics(record) self.__sample_host_metrics(record)
......
...@@ -7,9 +7,19 @@ ...@@ -7,9 +7,19 @@
import numbers import numbers
from datetime import datetime from datetime import datetime
from superbench.benchmarks import ReduceType
class MonitorRecord: class MonitorRecord:
"""Record class to save all monitoring data.""" """Record class to save all monitoring data."""
reduce_ops = {
'gpu_temperature': ReduceType.MAX,
'gpu_power_limit': ReduceType.MIN,
'gpu_corrected_ecc': ReduceType.LAST,
'gpu_uncorrected_ecc': ReduceType.LAST,
'gpu_remap': ReduceType.LAST,
}
def __init__(self): def __init__(self):
"""Constructor.""" """Constructor."""
self.__time = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') self.__time = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
from pprint import pformat from pprint import pformat
from collections import defaultdict from collections import defaultdict
import jsonlines
from natsort import natsorted from natsort import natsorted
from joblib import Parallel, delayed from joblib import Parallel, delayed
from omegaconf import ListConfig, OmegaConf from omegaconf import ListConfig, OmegaConf
...@@ -16,6 +17,7 @@ ...@@ -16,6 +17,7 @@
from superbench.common.utils import SuperBenchLogger, logger from superbench.common.utils import SuperBenchLogger, logger
from superbench.runner.ansible import AnsibleClient from superbench.runner.ansible import AnsibleClient
from superbench.benchmarks import ReduceType, Reducer from superbench.benchmarks import ReduceType, Reducer
from superbench.monitor import MonitorRecord
class SuperBenchRunner(): class SuperBenchRunner():
...@@ -255,13 +257,15 @@ def __create_single_node_summary(self, node_path): # pragma: no cover # noqa: ...@@ -255,13 +257,15 @@ def __create_single_node_summary(self, node_path): # pragma: no cover # noqa:
results_summary[benchmark_name][metric].append(result['result'][metric]) results_summary[benchmark_name][metric].append(result['result'][metric])
results_summary = self.__merge_all_metrics(results_summary, reduce_ops) results_summary = self.__merge_benchmark_metrics(results_summary, reduce_ops)
monitor_summary = self.__merge_monitor_metrics(node_path)
results_summary = {**results_summary, **monitor_summary}
with (node_path / 'results-summary.json').open(mode='w') as f: with (node_path / 'results-summary.json').open(mode='w') as f:
json.dump(results_summary, f, indent=2) json.dump(results_summary, f, indent=2)
return results_summary return results_summary
def __merge_all_metrics(self, results_summary, reduce_ops): def __merge_benchmark_metrics(self, results_summary, reduce_ops):
"""Merge metrics of all benchmarks in one node. """Merge metrics of all benchmarks in one node.
Args: Args:
...@@ -301,6 +305,42 @@ def __merge_all_metrics(self, results_summary, reduce_ops): ...@@ -301,6 +305,42 @@ def __merge_all_metrics(self, results_summary, reduce_ops):
return metrics_summary return metrics_summary
def __merge_monitor_metrics(self, node_path):
    """Merge and summarize monitor metrics of one node.

    Reads every monitor.jsonl found under the node directory, sorts all
    samples chronologically, groups values per metric, and reduces each
    metric whose name matches a pattern in MonitorRecord.reduce_ops.

    Args:
        node_path (Path): The Path instance of node directory.

    Returns:
        dict: Flattened result with metric as key.
    """
    metrics_summary = dict()
    all_samples = list()
    file_list = list(node_path.glob('**/monitor.jsonl'))
    for results_file in file_list:
        try:
            with jsonlines.open(results_file) as reader:
                # Accumulate samples across ALL monitor files; plain assignment
                # would silently keep only the last file's samples.
                all_samples.extend(list(reader))
        except Exception as e:
            logger.error('Invalid Jsonline file: {}, error message: {}'.format(results_file, str(e)))
            continue
    # Chronological order matters for ReduceType.LAST metrics (e.g. ECC counters).
    all_samples = sorted(all_samples, key=lambda k: k.get('time', '0'))
    metrics_dict = dict()
    for sample in all_samples:
        for metric, value in sample.items():
            metrics_dict.setdefault(metric, list()).append(value)
    for metric, values in metrics_dict.items():
        for pattern, reduce_type in MonitorRecord.reduce_ops.items():
            if pattern in metric:
                reduce_func = Reducer.get_reduce_func(reduce_type)
                metrics_summary[metric] = reduce_func(values)
                # First matching pattern wins; stop scanning further patterns.
                break
    return metrics_summary
def _run_proc(self, benchmark_name, mode, vars): def _run_proc(self, benchmark_name, mode, vars):
"""Run the process. """Run the process.
......
{"time": "2021-12-06 04:19:35", "cpu_usage": 356.05527547060314, "gpu_usage:0": 56, "gpu_usage:1": 56, "gpu_usage:2": 51, "gpu_usage:3": 50, "gpu_usage:4": 50, "gpu_usage:5": 48, "gpu_usage:6": 50, "gpu_usage:7": 51, "gpu_temperature:0": 26, "gpu_temperature:1": 26, "gpu_temperature:2": 24, "gpu_temperature:3": 25, "gpu_temperature:4": 25, "gpu_temperature:5": 25, "gpu_temperature:6": 22, "gpu_temperature:7": 25, "gpu_power_limit:0": 250, "gpu_power_limit:1": 250, "gpu_power_limit:2": 250, "gpu_power_limit:3": 250, "gpu_power_limit:4": 250, "gpu_power_limit:5": 250, "gpu_power_limit:6": 250, "gpu_power_limit:7": 250, "gpu_mem_used:0": 267517952, "gpu_mem_used:1": 267517952, "gpu_mem_used:2": 267517952, "gpu_mem_used:3": 267517952, "gpu_mem_used:4": 267517952, "gpu_mem_used:5": 267517952, "gpu_mem_used:6": 267517952, "gpu_mem_used:7": 267517952, "gpu_mem_total:0": 17071734784, "gpu_mem_total:1": 17071734784, "gpu_mem_total:2": 17071734784, "gpu_mem_total:3": 17071734784, "gpu_mem_total:4": 17071734784, "gpu_mem_total:5": 17071734784, "gpu_mem_total:6": 17071734784, "gpu_mem_total:7": 17071734784, "gpu_corrected_ecc:0": 0, "gpu_corrected_ecc:1": 0, "gpu_corrected_ecc:2": 0, "gpu_corrected_ecc:3": 0, "gpu_corrected_ecc:4": 0, "gpu_corrected_ecc:5": 0, "gpu_corrected_ecc:6": 0, "gpu_corrected_ecc:7": 0, "gpu_uncorrected_ecc:0": 0, "gpu_uncorrected_ecc:1": 0, "gpu_uncorrected_ecc:2": 0, "gpu_uncorrected_ecc:3": 0, "gpu_uncorrected_ecc:4": 0, "gpu_uncorrected_ecc:5": 0, "gpu_uncorrected_ecc:6": 0, "gpu_uncorrected_ecc:7": 0, "ib0_receive_bw": 7.093990777192219e-05, "lo_receive_bw": 0.0007516388346318046, "veth867b62a_receive_bw": 0.0, "ib2_receive_bw": 0.0, "eth2_receive_bw": 7.990274558181559e-05, "eth1_receive_bw": 0.0, "docker0_receive_bw": 0.0, "ib1_receive_bw": 3.546995388596109e-05, "eth0_receive_bw": 3.680484462360479e-05, "ib0_transmit_bw": 0.0, "lo_transmit_bw": 0.0007516388346318046, "veth867b62a_transmit_bw": 0.0, "ib2_transmit_bw": 0.0, "eth2_transmit_bw": 
7.761436146014067e-05, "eth1_transmit_bw": 0.0, "docker0_transmit_bw": 0.0, "ib1_transmit_bw": 3.1655980349836244e-05, "eth0_transmit_bw": 0.0}
{"time": "2021-12-06 04:19:46", "cpu_usage": 666.9910554561717, "gpu_usage:0": 56, "gpu_usage:1": 57, "gpu_usage:2": 54, "gpu_usage:3": 49, "gpu_usage:4": 47, "gpu_usage:5": 47, "gpu_usage:6": 51, "gpu_usage:7": 49, "gpu_temperature:0": 27, "gpu_temperature:1": 27, "gpu_temperature:2": 24, "gpu_temperature:3": 26, "gpu_temperature:4": 25, "gpu_temperature:5": 25, "gpu_temperature:6": 22, "gpu_temperature:7": 25, "gpu_power_limit:0": 250, "gpu_power_limit:1": 250, "gpu_power_limit:2": 250, "gpu_power_limit:3": 250, "gpu_power_limit:4": 250, "gpu_power_limit:5": 250, "gpu_power_limit:6": 250, "gpu_power_limit:7": 250, "gpu_mem_used:0": 267517952, "gpu_mem_used:1": 267517952, "gpu_mem_used:2": 267517952, "gpu_mem_used:3": 267517952, "gpu_mem_used:4": 267517952, "gpu_mem_used:5": 267517952, "gpu_mem_used:6": 267517952, "gpu_mem_used:7": 267517952, "gpu_mem_total:0": 17071734784, "gpu_mem_total:1": 17071734784, "gpu_mem_total:2": 17071734784, "gpu_mem_total:3": 17071734784, "gpu_mem_total:4": 17071734784, "gpu_mem_total:5": 17071734784, "gpu_mem_total:6": 17071734784, "gpu_mem_total:7": 17071734784, "gpu_corrected_ecc:0": 0, "gpu_corrected_ecc:1": 0, "gpu_corrected_ecc:2": 0, "gpu_corrected_ecc:3": 0, "gpu_corrected_ecc:4": 0, "gpu_corrected_ecc:5": 0, "gpu_corrected_ecc:6": 0, "gpu_corrected_ecc:7": 0, "gpu_uncorrected_ecc:0": 0, "gpu_uncorrected_ecc:1": 0, "gpu_uncorrected_ecc:2": 0, "gpu_uncorrected_ecc:3": 0, "gpu_uncorrected_ecc:4": 0, "gpu_uncorrected_ecc:5": 0, "gpu_uncorrected_ecc:6": 0, "gpu_uncorrected_ecc:7": 0, "ib0_receive_bw": 8.24993374331503e-05, "lo_receive_bw": 0.0007248700444905781, "veth867b62a_receive_bw": 0.0, "ib2_receive_bw": 0.0, "eth2_receive_bw": 7.125808822170487e-05, "eth1_receive_bw": 0.0, "docker0_receive_bw": 0.0, "ib1_receive_bw": 4.706082296994947e-05, "eth0_receive_bw": 1.752872758394879e-05, "ib0_transmit_bw": 0.0, "lo_transmit_bw": 0.0007248700444905781, "veth867b62a_transmit_bw": 0.0, "ib2_transmit_bw": 0.0, "eth2_transmit_bw": 
6.0207368657911064e-05, "eth1_transmit_bw": 0.0, "docker0_transmit_bw": 0.0, "ib1_transmit_bw": 3.162792151016847e-05, "eth0_transmit_bw": 0.0}
{"time": "2021-12-06 04:19:57", "cpu_usage": 656.3735831515716, "gpu_usage:0": 55, "gpu_usage:1": 52, "gpu_usage:2": 49, "gpu_usage:3": 41, "gpu_usage:4": 21, "gpu_usage:5": 23, "gpu_usage:6": 0, "gpu_usage:7": 0, "gpu_temperature:0": 50, "gpu_temperature:1": 27, "gpu_temperature:2": 24, "gpu_temperature:3": 26, "gpu_temperature:4": 25, "gpu_temperature:5": 25, "gpu_temperature:6": 23, "gpu_temperature:7": 26, "gpu_power_limit:0": 250, "gpu_power_limit:1": 250, "gpu_power_limit:2": 250, "gpu_power_limit:3": 250, "gpu_power_limit:4": 250, "gpu_power_limit:5": 250, "gpu_power_limit:6": 250, "gpu_power_limit:7": 250, "gpu_mem_used:0": 267517952, "gpu_mem_used:1": 267517952, "gpu_mem_used:2": 267517952, "gpu_mem_used:3": 267517952, "gpu_mem_used:4": 267517952, "gpu_mem_used:5": 267517952, "gpu_mem_used:6": 267517952, "gpu_mem_used:7": 267517952, "gpu_mem_total:0": 17071734784, "gpu_mem_total:1": 17071734784, "gpu_mem_total:2": 17071734784, "gpu_mem_total:3": 17071734784, "gpu_mem_total:4": 17071734784, "gpu_mem_total:5": 17071734784, "gpu_mem_total:6": 17071734784, "gpu_mem_total:7": 17071734784, "gpu_corrected_ecc:0": 0, "gpu_corrected_ecc:1": 0, "gpu_corrected_ecc:2": 0, "gpu_corrected_ecc:3": 0, "gpu_corrected_ecc:4": 0, "gpu_corrected_ecc:5": 0, "gpu_corrected_ecc:6": 0, "gpu_corrected_ecc:7": 0, "gpu_uncorrected_ecc:0": 0, "gpu_uncorrected_ecc:1": 0, "gpu_uncorrected_ecc:2": 0, "gpu_uncorrected_ecc:3": 0, "gpu_uncorrected_ecc:4": 0, "gpu_uncorrected_ecc:5": 0, "gpu_uncorrected_ecc:6": 0, "gpu_uncorrected_ecc:7": 0, "ib0_receive_bw": 3.5438732219366325e-05, "lo_receive_bw": 0.0007402122294206353, "veth867b62a_receive_bw": 0.0, "ib2_receive_bw": 0.0, "eth2_receive_bw": 7.907028962923131e-05, "eth1_receive_bw": 0.0, "docker0_receive_bw": 0.0, "ib1_receive_bw": 3.5438732219366325e-05, "eth0_receive_bw": 7.73555122637781e-05, "ib0_transmit_bw": 0.0, "lo_transmit_bw": 0.0007402122294206353, "veth867b62a_transmit_bw": 0.0, "ib2_transmit_bw": 0.0, "eth2_transmit_bw": 
8.097559781306822e-05, "eth1_transmit_bw": 0.0, "docker0_transmit_bw": 0.0, "ib1_transmit_bw": 0.0, "eth0_transmit_bw": 0.0}
{"time": "2021-12-06 04:20:08", "cpu_usage": 542.4075897252304, "gpu_usage:0": 30, "gpu_usage:1": 28, "gpu_usage:2": 26, "gpu_usage:3": 27, "gpu_usage:4": 24, "gpu_usage:5": 25, "gpu_usage:6": 27, "gpu_usage:7": 26, "gpu_temperature:0": 27, "gpu_temperature:1": 27, "gpu_temperature:2": 24, "gpu_temperature:3": 25, "gpu_temperature:4": 25, "gpu_temperature:5": 25, "gpu_temperature:6": 22, "gpu_temperature:7": 25, "gpu_power_limit:0": 250, "gpu_power_limit:1": 200, "gpu_power_limit:2": 250, "gpu_power_limit:3": 250, "gpu_power_limit:4": 250, "gpu_power_limit:5": 250, "gpu_power_limit:6": 250, "gpu_power_limit:7": 250, "gpu_mem_used:0": 267517952, "gpu_mem_used:1": 267517952, "gpu_mem_used:2": 267517952, "gpu_mem_used:3": 267517952, "gpu_mem_used:4": 267517952, "gpu_mem_used:5": 267517952, "gpu_mem_used:6": 267517952, "gpu_mem_used:7": 267517952, "gpu_mem_total:0": 17071734784, "gpu_mem_total:1": 17071734784, "gpu_mem_total:2": 17071734784, "gpu_mem_total:3": 17071734784, "gpu_mem_total:4": 17071734784, "gpu_mem_total:5": 17071734784, "gpu_mem_total:6": 17071734784, "gpu_mem_total:7": 17071734784, "gpu_corrected_ecc:0": 0, "gpu_corrected_ecc:1": 0, "gpu_corrected_ecc:2": 0, "gpu_corrected_ecc:3": 0, "gpu_corrected_ecc:4": 0, "gpu_corrected_ecc:5": 0, "gpu_corrected_ecc:6": 0, "gpu_corrected_ecc:7": 0, "gpu_uncorrected_ecc:0": 0, "gpu_uncorrected_ecc:1": 0, "gpu_uncorrected_ecc:2": 0, "gpu_uncorrected_ecc:3": 0, "gpu_uncorrected_ecc:4": 0, "gpu_uncorrected_ecc:5": 0, "gpu_uncorrected_ecc:6": 0, "gpu_uncorrected_ecc:7": 0, "ib0_receive_bw": 7.094350644144869e-05, "lo_receive_bw": 0.0007255499200443321, "veth867b62a_receive_bw": 0.0, "ib2_receive_bw": 0.0, "eth2_receive_bw": 6.255233901289024e-05, "eth1_receive_bw": 0.0, "docker0_receive_bw": 0.0, "ib1_receive_bw": 3.5471753220724344e-05, "eth0_receive_bw": 3.6806711675267735e-05, "ib0_transmit_bw": 0.0, "lo_transmit_bw": 0.0007255499200443321, "veth867b62a_transmit_bw": 0.0, "ib2_transmit_bw": 0.0, "eth2_transmit_bw": 
6.026383880510157e-05, "eth1_transmit_bw": 0.0, "docker0_transmit_bw": 0.0, "ib1_transmit_bw": 3.165758620774323e-05, "eth0_transmit_bw": 0.0}
{"time": "2021-12-06 04:20:19", "cpu_usage": 565.2927013332376, "gpu_usage:0": 0, "gpu_usage:1": 0, "gpu_usage:2": 0, "gpu_usage:3": 0, "gpu_usage:4": 14, "gpu_usage:5": 13, "gpu_usage:6": 0, "gpu_usage:7": 0, "gpu_temperature:0": 27, "gpu_temperature:1": 27, "gpu_temperature:2": 24, "gpu_temperature:3": 25, "gpu_temperature:4": 25, "gpu_temperature:5": 25, "gpu_temperature:6": 22, "gpu_temperature:7": 25, "gpu_power_limit:0": 250, "gpu_power_limit:1": 250, "gpu_power_limit:2": 250, "gpu_power_limit:3": 250, "gpu_power_limit:4": 250, "gpu_power_limit:5": 250, "gpu_power_limit:6": 250, "gpu_power_limit:7": 250, "gpu_mem_used:0": 0, "gpu_mem_used:1": 0, "gpu_mem_used:2": 0, "gpu_mem_used:3": 0, "gpu_mem_used:4": 0, "gpu_mem_used:5": 0, "gpu_mem_used:6": 0, "gpu_mem_used:7": 0, "gpu_mem_total:0": 17071734784, "gpu_mem_total:1": 17071734784, "gpu_mem_total:2": 17071734784, "gpu_mem_total:3": 17071734784, "gpu_mem_total:4": 17071734784, "gpu_mem_total:5": 17071734784, "gpu_mem_total:6": 17071734784, "gpu_mem_total:7": 17071734784, "gpu_corrected_ecc:0": 12, "gpu_corrected_ecc:1": 0, "gpu_corrected_ecc:2": 0, "gpu_corrected_ecc:3": 0, "gpu_corrected_ecc:4": 0, "gpu_corrected_ecc:5": 0, "gpu_corrected_ecc:6": 0, "gpu_corrected_ecc:7": 0, "gpu_uncorrected_ecc:0": 0, "gpu_uncorrected_ecc:1": 0, "gpu_uncorrected_ecc:2": 0, "gpu_uncorrected_ecc:3": 0, "gpu_uncorrected_ecc:4": 0, "gpu_uncorrected_ecc:5": 0, "gpu_uncorrected_ecc:6": 0, "gpu_uncorrected_ecc:7": 0, "ib0_receive_bw": 7.094668432959388e-05, "lo_receive_bw": 0.0007254870623381052, "veth867b62a_receive_bw": 0.0, "ib2_receive_bw": 0.0, "eth2_receive_bw": 0.00018194391626460366, "eth1_receive_bw": 0.0, "docker0_receive_bw": 0.0, "ib1_receive_bw": 3.547334216479694e-05, "eth0_receive_bw": 0.0, "ib0_transmit_bw": 0.0, "lo_transmit_bw": 0.0007254870623381052, "veth867b62a_transmit_bw": 0.0, "ib2_transmit_bw": 0.0, "eth2_transmit_bw": 0.0006913487384268221, "eth1_transmit_bw": 0.0, "docker0_transmit_bw": 0.0, 
"ib1_transmit_bw": 3.165900429761447e-05, "eth0_transmit_bw": 0.0}
...@@ -185,8 +185,8 @@ def test_run_default_benchmarks(self, mock_ansible_client_run): ...@@ -185,8 +185,8 @@ def test_run_default_benchmarks(self, mock_ansible_client_run):
mock_ansible_client_run.return_value = 0 mock_ansible_client_run.return_value = 0
self.runner.run() self.runner.run()
def test_merge_all_metrics(self): def test_merge_benchmark_metrics(self):
"""Test __merge_all_metrics.""" """Test __merge_benchmark_metrics."""
result_summary = json.loads( result_summary = json.loads(
'{"kernel-launch": {"overhead_event": [[0.00583], [0.00545], [0.00581], [0.00572], [0.00559], [0.00591], ' '{"kernel-launch": {"overhead_event": [[0.00583], [0.00545], [0.00581], [0.00572], [0.00559], [0.00591], '
'[0.00562], [0.00586]], "overhead_wall": [[0.01018], [0.01039], [0.01067], [0.01079], [0.00978], ' '[0.00562], [0.00586]], "overhead_wall": [[0.01018], [0.01039], [0.01067], [0.01079], [0.00978], '
...@@ -238,4 +238,43 @@ def test_merge_all_metrics(self): ...@@ -238,4 +238,43 @@ def test_merge_all_metrics(self):
'"pytorch-sharding-matmul/0/allreduce": 10.87, "pytorch-sharding-matmul/1/allreduce": 10.69, ' '"pytorch-sharding-matmul/0/allreduce": 10.87, "pytorch-sharding-matmul/1/allreduce": 10.69, '
'"pytorch-sharding-matmul/0/allgather": 10.56, "pytorch-sharding-matmul/1/allgather": 10.16}' '"pytorch-sharding-matmul/0/allgather": 10.56, "pytorch-sharding-matmul/1/allgather": 10.16}'
) )
self.assertEqual(self.runner._SuperBenchRunner__merge_all_metrics(result_summary, reduce_ops), expected) self.assertEqual(self.runner._SuperBenchRunner__merge_benchmark_metrics(result_summary, reduce_ops), expected)
def test_merge_monitor_metrics(self):
    """Test __merge_monitor_metrics."""
    # Fixture directory containing monitor.jsonl sample files.
    path = Path('tests/data/monitor/')
    # Expected values follow MonitorRecord.reduce_ops:
    #   gpu_temperature -> MAX, gpu_power_limit -> MIN,
    #   gpu_corrected_ecc / gpu_uncorrected_ecc -> LAST (latest time-sorted sample).
    expected = {
        'gpu_temperature:0': 50,
        'gpu_temperature:1': 27,
        'gpu_temperature:2': 24,
        'gpu_temperature:3': 26,
        'gpu_temperature:4': 25,
        'gpu_temperature:5': 25,
        'gpu_temperature:6': 23,
        'gpu_temperature:7': 26,
        'gpu_power_limit:0': 250,
        'gpu_power_limit:1': 200,
        'gpu_power_limit:2': 250,
        'gpu_power_limit:3': 250,
        'gpu_power_limit:4': 250,
        'gpu_power_limit:5': 250,
        'gpu_power_limit:6': 250,
        'gpu_power_limit:7': 250,
        'gpu_corrected_ecc:0': 12,
        'gpu_corrected_ecc:1': 0,
        'gpu_corrected_ecc:2': 0,
        'gpu_corrected_ecc:3': 0,
        'gpu_corrected_ecc:4': 0,
        'gpu_corrected_ecc:5': 0,
        'gpu_corrected_ecc:6': 0,
        'gpu_corrected_ecc:7': 0,
        'gpu_uncorrected_ecc:0': 0,
        'gpu_uncorrected_ecc:1': 0,
        'gpu_uncorrected_ecc:2': 0,
        'gpu_uncorrected_ecc:3': 0,
        'gpu_uncorrected_ecc:4': 0,
        'gpu_uncorrected_ecc:5': 0,
        'gpu_uncorrected_ecc:6': 0,
        'gpu_uncorrected_ecc:7': 0
    }
    # Name-mangled access to the private method under test.
    self.assertEqual(self.runner._SuperBenchRunner__merge_monitor_metrics(path), expected)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment