Unverified Commit 7595d794 authored by guoshzhao, committed by GitHub

Runner: Add Feature - Generate summarized output files. (#157)

**Description**
Generate the summarized output files from all nodes. For each metric, apply the reduce operation specified by its `reduce_op`.

**Major Revision**
- Generate the summarized JSON file per node:
  For micro-benchmarks, the metric key format is `{benchmark_name}/[{run_count}/]{metric_name}[:rank]`.
  For model benchmarks, the format is `{benchmark_name}/{sub_benchmark_name}/[{run_count}/]{metric_name}`.
  `[]` marks an optional part. For example:
```
{
  "kernel-launch/overhead_event:0": 0.00583,
  "kernel-launch/overhead_event:1": 0.00545,
  "kernel-launch/overhead_event:2": 0.00581,
  "kernel-launch/overhead_event:3": 0.00572,
  "kernel-launch/overhead_event:4": 0.00559,
  "kernel-launch/overhead_event:5": 0.00591,
  "kernel-launch/overhead_event:6": 0.00562,
  "kernel-launch/overhead_event:7": 0.00586,
  "resnet_models/pytorch-resnet50/steptime-train-float32": 544.0827468410134,
  "resnet_models/pytorch-resnet50/throughput-train-float32": 353.7607016465773,
  "resnet_models/pytorch-resnet50/steptime-train-float16": 425.40482617914677,
  "resnet_models/pytorch-resnet50/throughput-train-float16": 454.0142363793973,
  "pytorch-sharding-matmul/0/allreduce": 10.561786651611328,
  "pytorch-sharding-matmul/1/allreduce": 10.561786651611328,
  "pytorch-sharding-matmul/0/allgather": 10.088025093078613,
  "pytorch-sharding-matmul/1/allgather": 10.088025093078613
}
```
- Generate the summarized JSONL file for all nodes; each line is one node's summary in JSON format, tagged with the node name (see the illustrative sample below).
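  For illustration, a two-node `results-summary.jsonl` could look like the following (hypothetical node names and values, following the per-node format above plus the `node` field the runner adds to each line):
  ```
  {"kernel-launch/overhead_event:0": 0.00583, "resnet_models/pytorch-resnet50/steptime-train-float32": 544.08, "node": "node-0"}
  {"kernel-launch/overhead_event:0": 0.00579, "resnet_models/pytorch-resnet50/steptime-train-float32": 551.37, "node": "node-1"}
  ```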
parent a1e5c90d
@@ -139,6 +139,7 @@ def run(self):
        'jinja2>=2.10.1',
        'joblib>=1.0.1',
        'knack>=0.7.2',
+       'natsort>=7.1.1',
        'omegaconf==2.0.6',
        'pyyaml>=5.3',
    ],
@@ -8,7 +8,7 @@
from superbench.benchmarks.return_code import ReturnCode
from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, \
    DistributedImpl, DistributedBackend, BenchmarkType, BenchmarkContext
-from superbench.benchmarks.reducer import ReduceType
+from superbench.benchmarks.reducer import ReduceType, Reducer
from superbench.common.utils import LazyImport

BenchmarkRegistry = LazyImport(
@@ -24,5 +24,5 @@
__all__ = [
    'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'DistributedImpl',
-    'DistributedBackend', 'BenchmarkContext', 'BenchmarkRegistry', 'ReduceType'
+    'DistributedBackend', 'BenchmarkContext', 'BenchmarkRegistry', 'ReduceType', 'Reducer'
]
@@ -172,7 +172,7 @@ def exec(self):
            if benchmark_name not in self._sb_enabled:
                continue
            benchmark_config = self._sb_benchmarks[benchmark_name]
-           benchmark_results = {}
+           benchmark_results = list()
            self.__create_benchmark_dir(benchmark_name)
            for framework in benchmark_config.frameworks or [Framework.NONE.value]:
                if benchmark_name.endswith('_models'):
@@ -186,10 +186,7 @@ def exec(self):
                            parameters=self.__get_arguments(benchmark_config.parameters)
                        )
                        result = self.__exec_benchmark(context, log_suffix)
-                       if framework != Framework.NONE.value:
-                           benchmark_results['{}/{}'.format(framework, model)] = result
-                       else:
-                           benchmark_results[model] = result
+                       benchmark_results.append(result)
                else:
                    log_suffix = 'micro-benchmark {}'.format(benchmark_name)
                    logger.info('Executor is going to execute %s.', log_suffix)
@@ -200,8 +197,6 @@ def exec(self):
                        parameters=self.__get_arguments(benchmark_config.parameters)
                    )
                    result = self.__exec_benchmark(context, log_suffix)
-                   if framework != Framework.NONE.value:
-                       benchmark_results[framework] = result
-                   else:
-                       benchmark_results = result
+                   benchmark_results.append(result)
            self.__write_benchmark_results(benchmark_name, benchmark_results)
@@ -3,14 +3,18 @@
"""SuperBench Runner."""

import json
import random
+from pathlib import Path
+from collections import defaultdict

+from natsort import natsorted
from joblib import Parallel, delayed
from omegaconf import ListConfig, OmegaConf

from superbench.common.utils import SuperBenchLogger, logger
from superbench.runner.ansible import AnsibleClient
+from superbench.benchmarks import ReduceType, Reducer


class SuperBenchRunner():
@@ -202,6 +206,103 @@ def fetch_results(self):    # pragma: no cover
            )
        )

+    def __create_results_summary(self):    # pragma: no cover
+        """Create the result summary file of all nodes."""
+        all_results = list()
+        for node_path in (self._output_path / 'nodes').glob('*'):
+            if not node_path.is_dir():
+                continue
+            results_summary = self.__create_single_node_summary(node_path)
+            results_summary['node'] = node_path.name
+            all_results.append(results_summary)
+        with (self._output_path / 'results-summary.jsonl').open(mode='w') as f:
+            for result in all_results:
+                json.dump(result, f)
+                f.write('\n')
+
+    def __create_single_node_summary(self, node_path):    # pragma: no cover
+        """Create the result summary file of single node.
+
+        Args:
+            node_path (Path): The Path instance of node directory.
+
+        Returns:
+            dict: Result summary of single node.
+        """
+        results_summary = dict()
+        reduce_ops = dict()
+        file_list = [Path(f) for f in natsorted([str(f) for f in node_path.glob('**/results.json')])]
+        for results_file in file_list:
+            with results_file.open() as f:
+                try:
+                    results = json.load(f)
+                except ValueError:
+                    logger.error('Invalid JSON file: {}'.format(results_file))
+                    continue
+                for result in results:
+                    benchmark_name = result['name']
+                    if results_file.parts[-3].endswith('_models'):
+                        benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name'])
+                    if benchmark_name not in results_summary:
+                        results_summary[benchmark_name] = defaultdict(list)
+                    for metric in result['result']:
+                        metric_name = '{}/{}'.format(benchmark_name, metric)
+                        if metric_name not in reduce_ops:
+                            reduce_ops[metric_name] = result['reduce_op'][metric]
+                        elif reduce_ops[metric_name] != result['reduce_op'][metric]:
+                            logger.error('Inconsistent reduce type for metric: {}'.format(metric_name))
+                            continue
+                        results_summary[benchmark_name][metric].append(result['result'][metric])
+        results_summary = self.__merge_all_metrics(results_summary, reduce_ops)
+        with (node_path / 'results-summary.json').open(mode='w') as f:
+            json.dump(results_summary, f, indent=2)
+        return results_summary
+
+    def __merge_all_metrics(self, results_summary, reduce_ops):
+        """Merge metrics of all benchmarks in one node.
+
+        Args:
+            results_summary (dict): Summarized result of one node.
+            reduce_ops (dict): The reduce type of each metric.
+
+        Returns:
+            dict: Flattened result with metric as key.
+        """
+        metrics_summary = dict()
+        for benchmark_name in results_summary:
+            for metric in results_summary[benchmark_name]:
+                metric_name = '{}/{}'.format(benchmark_name, metric)
+                if metric_name not in reduce_ops or (
+                    reduce_ops[metric_name] is not None and reduce_ops[metric_name] not in ReduceType.get_values()
+                ):
+                    logger.error('Unknown reduce type for metric: {}'.format(metric_name))
+                    continue
+                if reduce_ops[metric_name] is not None:
+                    reduce_func = Reducer.get_reduce_func(ReduceType(reduce_ops[metric_name]))
+                    values = [reduce_func(list(result)) for result in zip(*results_summary[benchmark_name][metric])]
+                    for run_count in range(len(values)):
+                        if len(values) > 1:
+                            metric_name = '{}/{}/{}'.format(benchmark_name, run_count, metric)
+                        else:
+                            metric_name = '{}/{}'.format(benchmark_name, metric)
+                        metrics_summary[metric_name] = values[run_count]
+                else:
+                    for rank in range(len(results_summary[benchmark_name][metric])):
+                        for run_count in range(len(results_summary[benchmark_name][metric][rank])):
+                            if len(results_summary[benchmark_name][metric][rank]) > 1:
+                                metric_name = '{}/{}/{}:{}'.format(benchmark_name, run_count, metric, rank)
+                            else:
+                                metric_name = '{}/{}:{}'.format(benchmark_name, metric, rank)
+                            metrics_summary[metric_name] = results_summary[benchmark_name][metric][rank][run_count]
+        return metrics_summary
+
    def _run_proc(self, benchmark_name, mode, vars):
        """Run the process.
@@ -245,3 +346,5 @@ def run(self):
            else:
                logger.warning('Unknown mode %s.', mode.name)
        self.fetch_results()
+
+        self.__create_results_summary()
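
To make the reduce step in `__merge_all_metrics` concrete, here is a minimal standalone sketch with hypothetical values; plain `max` stands in for `Reducer.get_reduce_func(ReduceType('max'))`:
```
# Collected results for one metric: one inner list per rank,
# one value per run_count within each rank.
values_per_rank = [[10.56, 10.66], [10.87, 10.32]]

reduce_func = max    # stand-in for Reducer.get_reduce_func(ReduceType('max'))

# zip(*...) regroups the values by run_count, then the reduce
# function collapses the ranks into a single value per run.
values = [reduce_func(list(run)) for run in zip(*values_per_rank)]
print(values)    # [10.87, 10.66]
```
Since there is more than one run here, the summary keys become `{benchmark_name}/{run_count}/{metric_name}`, matching the `pytorch-sharding-matmul/0/allreduce` keys in the test below.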
@@ -3,6 +3,7 @@
"""SuperBench Runner test."""

import json
import unittest
+import shutil
import tempfile
@@ -185,3 +186,58 @@ def test_run_default_benchmarks(self, mock_ansible_client_run):
        """
        mock_ansible_client_run.return_value = 0
        self.runner.run()
+
+    def test_merge_all_metrics(self):
+        """Test __merge_all_metrics."""
+        result_summary = json.loads(
+            '{"kernel-launch": {"overhead_event": [[0.00583], [0.00545], [0.00581], [0.00572], [0.00559], [0.00591], '
+            '[0.00562], [0.00586]], "overhead_wall": [[0.01018], [0.01039], [0.01067], [0.01079], [0.00978], '
+            '[0.01085], [0.01036], [0.01033]]}, "resnet_models/pytorch-resnet50": {"steptime_train_float32": '
+            '[[252.03], [250.53], [253.75], [250.61], [252.86], [252.58], [251.15], [252.83]], '
+            '"throughput_train_float32": [[764.57], [767.83], [762.19], [767.31], [763.41], [764.31], [766.43], '
+            '[763.38]], "steptime_train_float16": [[198.36], [196.85], [200.55], [198.07], [199.41], [199.20], '
+            '[199.07], [199.34]], "throughput_train_float16": [[972.64], [977.31], [969.58], [974.33], [972.87], '
+            '[972.73], [972.46], [972.46]]}, "resnet_models/pytorch-resnet101": {"steptime_train_float32": [[385.53], '
+            '[384.05], [386.98], [385.12], [385.47], [385.81], [384.90], [386.65]], "throughput_train_float32": '
+            '[[499.39], [500.69], [498.57], [499.83], [499.51], [499.27], [499.94], [498.65]], '
+            '"steptime_train_float16": [[307.49], [307.13], [310.31], [307.64], [308.68], [309.61], [307.71], '
+            '[309.95]], "throughput_train_float16": [[627.21], [627.34], [624.85], [626.76], [626.26], [625.12], '
+            '[626.92], [625.02]]}, "pytorch-sharding-matmul": {"allreduce": [[10.56, 10.66], [10.87, 10.32], '
+            '[10.56, 10.45], [10.56, 10.60], [10.56, 10.45], [10.56, 10.38], [10.56, 10.33], [10.56, 10.69]], '
+            '"allgather": [[10.08, 10.10], [10.08, 10.16], [10.08, 10.06], [10.56, 10.04], [10.08, 10.05], '
+            '[10.08, 10.09], [10.08, 10.08], [10.08, 10.06]]}}'
+        )
+        reduce_ops = json.loads(
+            '{"kernel-launch/overhead_event": null, "kernel-launch/overhead_wall": null, '
+            '"resnet_models/pytorch-resnet50/steptime_train_float32": "max", '
+            '"resnet_models/pytorch-resnet50/throughput_train_float32": "min", '
+            '"resnet_models/pytorch-resnet50/steptime_train_float16": "max", '
+            '"resnet_models/pytorch-resnet50/throughput_train_float16": "min", '
+            '"resnet_models/pytorch-resnet101/steptime_train_float32": "max", '
+            '"resnet_models/pytorch-resnet101/throughput_train_float32": "min", '
+            '"resnet_models/pytorch-resnet101/steptime_train_float16": "max", '
+            '"resnet_models/pytorch-resnet101/throughput_train_float16": "min", '
+            '"pytorch-sharding-matmul/allreduce": "max", "pytorch-sharding-matmul/allgather": "max"}'
+        )
+        expected = json.loads(
+            '{"kernel-launch/overhead_event:0": 0.00583, "kernel-launch/overhead_event:1": 0.00545, '
+            '"kernel-launch/overhead_event:2": 0.00581, "kernel-launch/overhead_event:3": 0.00572, '
+            '"kernel-launch/overhead_event:4": 0.00559, "kernel-launch/overhead_event:5": 0.00591, '
+            '"kernel-launch/overhead_event:6": 0.00562, "kernel-launch/overhead_event:7": 0.00586, '
+            '"kernel-launch/overhead_wall:0": 0.01018, "kernel-launch/overhead_wall:1": 0.01039, '
+            '"kernel-launch/overhead_wall:2": 0.01067, "kernel-launch/overhead_wall:3": 0.01079, '
+            '"kernel-launch/overhead_wall:4": 0.00978, "kernel-launch/overhead_wall:5": 0.01085, '
+            '"kernel-launch/overhead_wall:6": 0.01036, "kernel-launch/overhead_wall:7": 0.01033, '
+            '"resnet_models/pytorch-resnet50/steptime_train_float32": 253.75, '
+            '"resnet_models/pytorch-resnet50/throughput_train_float32": 762.19, '
+            '"resnet_models/pytorch-resnet50/steptime_train_float16": 200.55, '
+            '"resnet_models/pytorch-resnet50/throughput_train_float16": 969.58, '
+            '"resnet_models/pytorch-resnet101/steptime_train_float32": 386.98, '
+            '"resnet_models/pytorch-resnet101/throughput_train_float32": 498.57, '
+            '"resnet_models/pytorch-resnet101/steptime_train_float16": 310.31, '
+            '"resnet_models/pytorch-resnet101/throughput_train_float16": 624.85, '
+            '"pytorch-sharding-matmul/0/allreduce": 10.87, "pytorch-sharding-matmul/1/allreduce": 10.69, '
+            '"pytorch-sharding-matmul/0/allgather": 10.56, "pytorch-sharding-matmul/1/allgather": 10.16}'
+        )
+        self.assertEqual(self.runner._SuperBenchRunner__merge_all_metrics(result_summary, reduce_ops), expected)