Unverified Commit 7595d794 authored by guoshzhao, committed by GitHub

Runner: Add Feature - Generate summarized output files. (#157)

**Description**
Generate the summarized output files from all nodes. For each metric, apply the reduce operation specified by its `reduce_op`.

**Major Revision**
- Generate the summarized JSON file per node:
  For micro-benchmarks, the metric key format is `{benchmark_name}/[{run_count}/]{metric_name}[:rank]`.
  For model benchmarks, the format is `{benchmark_name}/{sub_benchmark_name}/[{run_count}/]{metric_name}`.
  `[]` marks an optional part. For example:
```
{
  "kernel-launch/overhead_event:0": 0.00583,
  "kernel-launch/overhead_event:1": 0.00545,
  "kernel-launch/overhead_event:2": 0.00581,
  "kernel-launch/overhead_event:3": 0.00572,
  "kernel-launch/overhead_event:4": 0.00559,
  "kernel-launch/overhead_event:5": 0.00591,
  "kernel-launch/overhead_event:6": 0.00562,
  "kernel-launch/overhead_event:7": 0.00586,
  "resnet_models/pytorch-resnet50/steptime-train-float32": 544.0827468410134,
  "resnet_models/pytorch-resnet50/throughput-train-float32": 353.7607016465773,
  "resnet_models/pytorch-resnet50/steptime-train-float16": 425.40482617914677,
  "resnet_models/pytorch-resnet50/throughput-train-float16": 454.0142363793973,
  "pytorch-sharding-matmul/0/allreduce": 10.561786651611328,
  "pytorch-sharding-matmul/1/allreduce": 10.561786651611328,
  "pytorch-sharding-matmul/0/allgather": 10.088025093078613,
  "pytorch-sharding-matmul/1/allgather": 10.088025093078613
}
```
- Generate the summarized JSONL file for all nodes; each line is one node's summary in JSON format, tagged with the node name (see the illustrative sample below).
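  For illustration, a two-node `results-summary.jsonl` could look like the following (hypothetical node names and values, following the per-node format above plus the `node` field the runner adds to each line):
  ```
  {"kernel-launch/overhead_event:0": 0.00583, "resnet_models/pytorch-resnet50/steptime-train-float32": 544.08, "node": "node-0"}
  {"kernel-launch/overhead_event:0": 0.00579, "resnet_models/pytorch-resnet50/steptime-train-float32": 551.37, "node": "node-1"}
  ```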
parent a1e5c90d
@@ -139,6 +139,7 @@ def run(self):
        'jinja2>=2.10.1',
        'joblib>=1.0.1',
        'knack>=0.7.2',
+       'natsort>=7.1.1',
        'omegaconf==2.0.6',
        'pyyaml>=5.3',
    ],
@@ -8,7 +8,7 @@
from superbench.benchmarks.return_code import ReturnCode
from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, \
    DistributedImpl, DistributedBackend, BenchmarkType, BenchmarkContext
-from superbench.benchmarks.reducer import ReduceType
+from superbench.benchmarks.reducer import ReduceType, Reducer
from superbench.common.utils import LazyImport

BenchmarkRegistry = LazyImport(
@@ -24,5 +24,5 @@
__all__ = [
    'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'DistributedImpl',
-    'DistributedBackend', 'BenchmarkContext', 'BenchmarkRegistry', 'ReduceType'
+    'DistributedBackend', 'BenchmarkContext', 'BenchmarkRegistry', 'ReduceType', 'Reducer'
]
@@ -172,7 +172,7 @@ def exec(self):
            if benchmark_name not in self._sb_enabled:
                continue
            benchmark_config = self._sb_benchmarks[benchmark_name]
-           benchmark_results = {}
+           benchmark_results = list()
            self.__create_benchmark_dir(benchmark_name)
            for framework in benchmark_config.frameworks or [Framework.NONE.value]:
                if benchmark_name.endswith('_models'):
@@ -186,10 +186,7 @@ def exec(self):
                            parameters=self.__get_arguments(benchmark_config.parameters)
                        )
                        result = self.__exec_benchmark(context, log_suffix)
-                       if framework != Framework.NONE.value:
-                           benchmark_results['{}/{}'.format(framework, model)] = result
-                       else:
-                           benchmark_results[model] = result
+                       benchmark_results.append(result)
                else:
                    log_suffix = 'micro-benchmark {}'.format(benchmark_name)
                    logger.info('Executor is going to execute %s.', log_suffix)
@@ -200,8 +197,6 @@ def exec(self):
                        parameters=self.__get_arguments(benchmark_config.parameters)
                    )
                    result = self.__exec_benchmark(context, log_suffix)
-                   if framework != Framework.NONE.value:
-                       benchmark_results[framework] = result
-                   else:
-                       benchmark_results = result
+                   benchmark_results.append(result)
            self.__write_benchmark_results(benchmark_name, benchmark_results)
@@ -3,14 +3,18 @@
"""SuperBench Runner."""

import json
import random
+from pathlib import Path
+from collections import defaultdict

+from natsort import natsorted
from joblib import Parallel, delayed
from omegaconf import ListConfig, OmegaConf

from superbench.common.utils import SuperBenchLogger, logger
from superbench.runner.ansible import AnsibleClient
+from superbench.benchmarks import ReduceType, Reducer


class SuperBenchRunner():
@@ -202,6 +206,103 @@ def fetch_results(self):    # pragma: no cover
            )
        )

+    def __create_results_summary(self):    # pragma: no cover
+        """Create the result summary file of all nodes."""
+        all_results = list()
+        for node_path in (self._output_path / 'nodes').glob('*'):
+            if not node_path.is_dir():
+                continue
+            results_summary = self.__create_single_node_summary(node_path)
+            results_summary['node'] = node_path.name
+            all_results.append(results_summary)
+        with (self._output_path / 'results-summary.jsonl').open(mode='w') as f:
+            for result in all_results:
+                json.dump(result, f)
+                f.write('\n')
+
+    def __create_single_node_summary(self, node_path):    # pragma: no cover
+        """Create the result summary file of single node.
+
+        Args:
+            node_path (Path): The Path instance of node directory.
+
+        Returns:
+            dict: Result summary of single node.
+        """
+        results_summary = dict()
+        reduce_ops = dict()
+        file_list = [Path(f) for f in natsorted([str(f) for f in node_path.glob('**/results.json')])]
+        for results_file in file_list:
+            with results_file.open() as f:
+                try:
+                    results = json.load(f)
+                except ValueError:
+                    logger.error('Invalid JSON file: {}'.format(results_file))
+                    continue
+                for result in results:
+                    benchmark_name = result['name']
+                    if results_file.parts[-3].endswith('_models'):
+                        benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name'])
+                    if benchmark_name not in results_summary:
+                        results_summary[benchmark_name] = defaultdict(list)
+                    for metric in result['result']:
+                        metric_name = '{}/{}'.format(benchmark_name, metric)
+                        if metric_name not in reduce_ops:
+                            reduce_ops[metric_name] = result['reduce_op'][metric]
+                        elif reduce_ops[metric_name] != result['reduce_op'][metric]:
+                            logger.error('Inconsistent reduce type for metric: {}'.format(metric_name))
+                            continue
+                        results_summary[benchmark_name][metric].append(result['result'][metric])
+        results_summary = self.__merge_all_metrics(results_summary, reduce_ops)
+        with (node_path / 'results-summary.json').open(mode='w') as f:
+            json.dump(results_summary, f, indent=2)
+        return results_summary
+
+    def __merge_all_metrics(self, results_summary, reduce_ops):
+        """Merge metrics of all benchmarks in one node.
+
+        Args:
+            results_summary (dict): Summarized result of one node.
+            reduce_ops (dict): The reduce type of each metric.
+
+        Returns:
+            dict: Flattened result with metric as key.
+        """
+        metrics_summary = dict()
+        for benchmark_name in results_summary:
+            for metric in results_summary[benchmark_name]:
+                metric_name = '{}/{}'.format(benchmark_name, metric)
+                if metric_name not in reduce_ops or (
+                    reduce_ops[metric_name] is not None and reduce_ops[metric_name] not in ReduceType.get_values()
+                ):
+                    logger.error('Unknown reduce type for metric: {}'.format(metric_name))
+                    continue
+                if reduce_ops[metric_name] is not None:
+                    reduce_func = Reducer.get_reduce_func(ReduceType(reduce_ops[metric_name]))
+                    values = [reduce_func(list(result)) for result in zip(*results_summary[benchmark_name][metric])]
+                    for run_count in range(len(values)):
+                        if len(values) > 1:
+                            metric_name = '{}/{}/{}'.format(benchmark_name, run_count, metric)
+                        else:
+                            metric_name = '{}/{}'.format(benchmark_name, metric)
+                        metrics_summary[metric_name] = values[run_count]
+                else:
+                    for rank in range(len(results_summary[benchmark_name][metric])):
+                        for run_count in range(len(results_summary[benchmark_name][metric][rank])):
+                            if len(results_summary[benchmark_name][metric][rank]) > 1:
+                                metric_name = '{}/{}/{}:{}'.format(benchmark_name, run_count, metric, rank)
+                            else:
+                                metric_name = '{}/{}:{}'.format(benchmark_name, metric, rank)
+                            metrics_summary[metric_name] = results_summary[benchmark_name][metric][rank][run_count]
+        return metrics_summary
+
    def _run_proc(self, benchmark_name, mode, vars):
        """Run the process.
@@ -245,3 +346,5 @@ def run(self):
            else:
                logger.warning('Unknown mode %s.', mode.name)
        self.fetch_results()
+
+        self.__create_results_summary()
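
To make the reduce step in `__merge_all_metrics` concrete, here is a minimal standalone sketch with hypothetical values; plain `max` stands in for `Reducer.get_reduce_func(ReduceType('max'))`:
```
# Collected results for one metric: one inner list per rank,
# one value per run_count within each rank.
values_per_rank = [[10.56, 10.66], [10.87, 10.32]]

reduce_func = max    # stand-in for Reducer.get_reduce_func(ReduceType('max'))

# zip(*...) regroups the values by run_count, then the reduce
# function collapses the ranks into a single value per run.
values = [reduce_func(list(run)) for run in zip(*values_per_rank)]
print(values)    # [10.87, 10.66]
```
Since there is more than one run here, the summary keys become `{benchmark_name}/{run_count}/{metric_name}`, matching the `pytorch-sharding-matmul/0/allreduce` keys in the test below.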
@@ -3,6 +3,7 @@
"""SuperBench Runner test."""

import json
import unittest
+import shutil
import tempfile
@@ -185,3 +186,58 @@ def test_run_default_benchmarks(self, mock_ansible_client_run):
        """
        mock_ansible_client_run.return_value = 0
        self.runner.run()
+
+    def test_merge_all_metrics(self):
+        """Test __merge_all_metrics."""
+        result_summary = json.loads(
+            '{"kernel-launch": {"overhead_event": [[0.00583], [0.00545], [0.00581], [0.00572], [0.00559], [0.00591], '
+            '[0.00562], [0.00586]], "overhead_wall": [[0.01018], [0.01039], [0.01067], [0.01079], [0.00978], '
+            '[0.01085], [0.01036], [0.01033]]}, "resnet_models/pytorch-resnet50": {"steptime_train_float32": '
+            '[[252.03], [250.53], [253.75], [250.61], [252.86], [252.58], [251.15], [252.83]], '
+            '"throughput_train_float32": [[764.57], [767.83], [762.19], [767.31], [763.41], [764.31], [766.43], '
+            '[763.38]], "steptime_train_float16": [[198.36], [196.85], [200.55], [198.07], [199.41], [199.20], '
+            '[199.07], [199.34]], "throughput_train_float16": [[972.64], [977.31], [969.58], [974.33], [972.87], '
+            '[972.73], [972.46], [972.46]]}, "resnet_models/pytorch-resnet101": {"steptime_train_float32": [[385.53], '
+            '[384.05], [386.98], [385.12], [385.47], [385.81], [384.90], [386.65]], "throughput_train_float32": '
+            '[[499.39], [500.69], [498.57], [499.83], [499.51], [499.27], [499.94], [498.65]], '
+            '"steptime_train_float16": [[307.49], [307.13], [310.31], [307.64], [308.68], [309.61], [307.71], '
+            '[309.95]], "throughput_train_float16": [[627.21], [627.34], [624.85], [626.76], [626.26], [625.12], '
+            '[626.92], [625.02]]}, "pytorch-sharding-matmul": {"allreduce": [[10.56, 10.66], [10.87, 10.32], '
+            '[10.56, 10.45], [10.56, 10.60], [10.56, 10.45], [10.56, 10.38], [10.56, 10.33], [10.56, 10.69]], '
+            '"allgather": [[10.08, 10.10], [10.08, 10.16], [10.08, 10.06], [10.56, 10.04], [10.08, 10.05], '
+            '[10.08, 10.09], [10.08, 10.08], [10.08, 10.06]]}}'
+        )
+        reduce_ops = json.loads(
+            '{"kernel-launch/overhead_event": null, "kernel-launch/overhead_wall": null, '
+            '"resnet_models/pytorch-resnet50/steptime_train_float32": "max", '
+            '"resnet_models/pytorch-resnet50/throughput_train_float32": "min", '
+            '"resnet_models/pytorch-resnet50/steptime_train_float16": "max", '
+            '"resnet_models/pytorch-resnet50/throughput_train_float16": "min", '
+            '"resnet_models/pytorch-resnet101/steptime_train_float32": "max", '
+            '"resnet_models/pytorch-resnet101/throughput_train_float32": "min", '
+            '"resnet_models/pytorch-resnet101/steptime_train_float16": "max", '
+            '"resnet_models/pytorch-resnet101/throughput_train_float16": "min", '
+            '"pytorch-sharding-matmul/allreduce": "max", "pytorch-sharding-matmul/allgather": "max"}'
+        )
+        expected = json.loads(
+            '{"kernel-launch/overhead_event:0": 0.00583, "kernel-launch/overhead_event:1": 0.00545, '
+            '"kernel-launch/overhead_event:2": 0.00581, "kernel-launch/overhead_event:3": 0.00572, '
+            '"kernel-launch/overhead_event:4": 0.00559, "kernel-launch/overhead_event:5": 0.00591, '
+            '"kernel-launch/overhead_event:6": 0.00562, "kernel-launch/overhead_event:7": 0.00586, '
+            '"kernel-launch/overhead_wall:0": 0.01018, "kernel-launch/overhead_wall:1": 0.01039, '
+            '"kernel-launch/overhead_wall:2": 0.01067, "kernel-launch/overhead_wall:3": 0.01079, '
+            '"kernel-launch/overhead_wall:4": 0.00978, "kernel-launch/overhead_wall:5": 0.01085, '
+            '"kernel-launch/overhead_wall:6": 0.01036, "kernel-launch/overhead_wall:7": 0.01033, '
+            '"resnet_models/pytorch-resnet50/steptime_train_float32": 253.75, '
+            '"resnet_models/pytorch-resnet50/throughput_train_float32": 762.19, '
+            '"resnet_models/pytorch-resnet50/steptime_train_float16": 200.55, '
+            '"resnet_models/pytorch-resnet50/throughput_train_float16": 969.58, '
+            '"resnet_models/pytorch-resnet101/steptime_train_float32": 386.98, '
+            '"resnet_models/pytorch-resnet101/throughput_train_float32": 498.57, '
+            '"resnet_models/pytorch-resnet101/steptime_train_float16": 310.31, '
+            '"resnet_models/pytorch-resnet101/throughput_train_float16": 624.85, '
+            '"pytorch-sharding-matmul/0/allreduce": 10.87, "pytorch-sharding-matmul/1/allreduce": 10.69, '
+            '"pytorch-sharding-matmul/0/allgather": 10.56, "pytorch-sharding-matmul/1/allgather": 10.16}'
+        )
+        self.assertEqual(self.runner._SuperBenchRunner__merge_all_metrics(result_summary, reduce_ops), expected)