Unverified commit e41b1f62, authored by guoshzhao, committed by GitHub

Benchmarks: Add Feature - Add reduce function support for output summary. (#147)

**Description**
Add reduce function support for output summary.

**Major Revision**
- Add reducer class to maintain all reduce functions.
- Save the reduce type of each metric into `BenchmarkResult`.
- Fix UT.
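For context, a minimal sketch of how the output-summary stage can consume the new reduce metadata; the `summarize` helper below is hypothetical and not part of this change:

```python
from superbench.benchmarks import ReduceType
from superbench.benchmarks.reducer import Reducer


def summarize(metric, values, reduce_type):
    """Hypothetical helper: reduce one metric's values for the output summary."""
    func = Reducer.get_reduce_func(reduce_type)
    # Unknown reduce types map to None, in which case the values pass through unreduced.
    return {metric: func(values) if func else values}


print(summarize('throughput_train_float32', [16000.0, 15800.0], ReduceType.MAX))
# {'throughput_train_float32': 16000.0}
```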
parent 86c390a9
@@ -8,6 +8,7 @@
 from superbench.benchmarks.return_code import ReturnCode
 from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, \
     DistributedImpl, DistributedBackend, BenchmarkType, BenchmarkContext
+from superbench.benchmarks.reducer import ReduceType
 from superbench.common.utils import LazyImport

 BenchmarkRegistry = LazyImport(
@@ -23,5 +24,5 @@
 __all__ = [
     'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'DistributedImpl',
-    'DistributedBackend', 'BenchmarkContext', 'BenchmarkRegistry'
+    'DistributedBackend', 'BenchmarkContext', 'BenchmarkRegistry', 'ReduceType'
 ]
@@ -49,12 +49,13 @@ def _benchmark(self):
         """
         pass

-    def _process_numeric_result(self, metric, result):
+    def _process_numeric_result(self, metric, result, reduce_type=None):
         """Function to save the numerical results.

         Args:
             metric (str): metric name which is the key.
             result (List[numbers.Number]): numerical result.
+            reduce_type (ReduceType): The type of reduce function.

         Return:
             True if result list is not empty.
@@ -68,7 +69,7 @@ def _process_numeric_result(self, metric, result):
             return False

         self._result.add_raw_data(metric, result)
-        self._result.add_result(metric, sum(result) / len(result))
+        self._result.add_result(metric, sum(result) / len(result), reduce_type)

         return True

     def print_env_info(self):
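A sketch of how a concrete benchmark threads a reduce type through this path; `FakeBenchmark` and its metric are hypothetical, and the other abstract methods of `Benchmark` are omitted:

```python
from superbench.benchmarks.base import Benchmark
from superbench.benchmarks.reducer import ReduceType


class FakeBenchmark(Benchmark):  # hypothetical subclass for illustration only
    def _benchmark(self):
        kernel_times = [2.1, 2.0, 2.2]  # in a real benchmark, parsed from the workload output
        # Saves the raw list, stores the mean as the summarized value, and tags
        # the metric with the 'min' reduce type for the output-summary stage.
        return self._process_numeric_result('kernel_time', kernel_times, reduce_type=ReduceType.MIN)
```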
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""A module for result reducer."""

from typing import Dict, Callable
from statistics import mean

from superbench.benchmarks.context import Enum


class ReduceType(Enum):
    """The Enum class representing the different reduce types."""
    AVG = 'avg'
    MAX = 'max'
    MIN = 'min'
    SUM = 'sum'


class Reducer:
    """Reducer class to maintain all reduce functions."""
    functions: Dict[ReduceType, Callable] = dict()

    @classmethod
    def add_reduce_func(cls, reduce_type):
        """Add reduce function.

        Args:
            reduce_type (ReduceType): The type of reduce function.

        Return:
            decorator (Callable): the decorator that registers the reduce function.
        """
        def decorator(func):
            cls.functions[reduce_type] = func
            return func

        return decorator

    @classmethod
    def get_reduce_func(cls, reduce_type):
        """Get reduce function by reduce_type.

        Args:
            reduce_type (ReduceType): The type of reduce function.

        Return:
            func (Callable): the reduce function, or None if the reduce type is invalid.
        """
        if reduce_type in cls.functions:
            return cls.functions[reduce_type]
        return None


Reducer.add_reduce_func(ReduceType.MAX)(max)
Reducer.add_reduce_func(ReduceType.MIN)(min)
Reducer.add_reduce_func(ReduceType.SUM)(sum)
Reducer.add_reduce_func(ReduceType.AVG)(mean)
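A quick check of the lookup behavior, runnable as-is against the module above:

```python
from statistics import mean

from superbench.benchmarks.reducer import Reducer, ReduceType

assert Reducer.get_reduce_func(ReduceType.MAX)([1, 3, 2]) == 3
assert Reducer.get_reduce_func(ReduceType.AVG) is mean  # registered at import time
assert Reducer.get_reduce_func('not-a-reduce-type') is None  # unknown types fall through to None
```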
@@ -31,6 +31,7 @@ def __init__(self, name, type, return_code, run_count=0):
         self.__end_time = None
         self.__raw_data = dict()
         self.__result = dict()
+        self.__reduce = dict()

     def __eq__(self, rhs):
         """Override equal function for deep comparison.
@@ -67,7 +68,7 @@ def add_raw_data(self, metric, value):
         return True

-    def add_result(self, metric, value):
+    def add_result(self, metric, value, reduce_type=None):
         """Add summarized data into result.

         Args:
@@ -75,6 +76,7 @@ def add_result(self, metric, value):
             value (float): summarized data.
               For e2e model benchmarks, the value is step-time or throughput.
               For micro-benchmarks, the value is FLOPS, bandwidth and etc.
+            reduce_type (ReduceType): type of reduce function.

         Return:
             True if succeed to add the result.
@@ -87,6 +89,7 @@ def add_result(self, metric, value):
         if metric not in self.__result:
             self.__result[metric] = list()
+            self.__reduce[metric] = reduce_type.value if isinstance(reduce_type, Enum) else None
         self.__result[metric].append(value)

         return True
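The net effect on a result object, sketched with a hypothetical benchmark and metric name; the serialized `reduce` field mirrors the test expectations below:

```python
from superbench.benchmarks import BenchmarkType, ReturnCode, ReduceType
from superbench.benchmarks.result import BenchmarkResult

result = BenchmarkResult('demo-benchmark', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value)
result.add_result('latency', 2.0, ReduceType.MIN)
result.add_result('latency', 3.0, ReduceType.MIN)
# The reduce type is recorded once per metric, on first add, as its string value,
# so the serialized output contains "result": {"latency": [2.0, 3.0]} and
# "reduce": {"latency": "min"}; omitting reduce_type would yield null instead.
```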
......
@@ -197,7 +197,6 @@ def test_preprocess():
     --hidden_size int Hidden size.
     --seq_len int Sequence length."""
     )
-    print(settings)
     assert (settings == expected_settings)

     # Negative case for _preprocess() - invalid precision.
@@ -219,7 +218,8 @@ def test_train():
         '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
         '"start_time": null, "end_time": null, "raw_data": {"steptime_train_float32": [[2, 2, 2, 2, 2, 2, 2, 2]], '
         '"throughput_train_float32": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
-        '"result": {"steptime_train_float32": [2.0], "throughput_train_float32": [16000.0]}}'
+        '"result": {"steptime_train_float32": [2.0], "throughput_train_float32": [16000.0]}, '
+        '"reduce": {"steptime_train_float32": null, "throughput_train_float32": null}}'
     )
     assert (benchmark._preprocess())
     assert (benchmark._ModelBenchmark__train(Precision.FLOAT32))
@@ -229,7 +229,7 @@ def test_train():
     benchmark = create_benchmark('--num_steps 0')
     expected_result = (
         '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
-        '"start_time": null, "end_time": null, "raw_data": {}, "result": {}}'
+        '"start_time": null, "end_time": null, "raw_data": {}, "result": {}, "reduce": {}}'
     )
     assert (benchmark._preprocess())
     assert (benchmark._ModelBenchmark__train(Precision.FLOAT32) is False)
@@ -243,7 +243,8 @@ def test_inference():
         '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
         '"start_time": null, "end_time": null, "raw_data": {"steptime_inference_float16": [[4, 4, 4, 4, 4, 4, 4, 4]], '
         '"throughput_inference_float16": [[8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0]]}, '
-        '"result": {"steptime_inference_float16": [4.0], "throughput_inference_float16": [8000.0]}}'
+        '"result": {"steptime_inference_float16": [4.0], "throughput_inference_float16": [8000.0]}, '
+        '"reduce": {"steptime_inference_float16": null, "throughput_inference_float16": null}}'
     )
     assert (benchmark._preprocess())
     assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16))
@@ -253,7 +254,7 @@ def test_inference():
     benchmark = create_benchmark('--num_steps 0')
     expected_result = (
         '{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
-        '"start_time": null, "end_time": null, "raw_data": {}, "result": {}}'
+        '"start_time": null, "end_time": null, "raw_data": {}, "result": {}, "reduce": {}}'
     )
     assert (benchmark._preprocess())
     assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16) is False)
@@ -292,7 +293,9 @@ def test_benchmark():
         '"steptime_train_float16": [[2, 2, 2, 2, 2, 2, 2, 2]], '
         '"throughput_train_float16": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
         '"result": {"steptime_train_float32": [2.0], "throughput_train_float32": [16000.0], '
-        '"steptime_train_float16": [2.0], "throughput_train_float16": [16000.0]}}'
+        '"steptime_train_float16": [2.0], "throughput_train_float16": [16000.0]}, '
+        '"reduce": {"steptime_train_float32": null, "throughput_train_float32": null, '
+        '"steptime_train_float16": null, "throughput_train_float16": null}}'
     )
     assert (benchmark.serialized_result == expected_serialized_result)
......
@@ -148,7 +148,8 @@ def test_launch_benchmark():
         '{"name": "accumulation", "type": "micro", "run_count": 1, '
         '"return_code": 0, "start_time": null, "end_time": null, '
         '"raw_data": {"accumulation_result": ["1,3,6,10"]}, '
-        '"result": {"accumulation_result": [10]}}'
+        '"result": {"accumulation_result": [10]}, '
+        '"reduce": {"accumulation_result": null}}'
     )
     assert (result == expected)
@@ -171,7 +172,8 @@ def test_launch_benchmark():
         '{"name": "accumulation", "type": "micro", "run_count": 1, '
         '"return_code": 0, "start_time": null, "end_time": null, '
         '"raw_data": {"accumulation_result": ["1,3,6"]}, '
-        '"result": {"accumulation_result": [6]}}'
+        '"result": {"accumulation_result": [6]}, '
+        '"reduce": {"accumulation_result": null}}'
     )
     assert (result == expected)
......
@@ -3,7 +3,7 @@
 """Tests for BenchmarkResult module."""

-from superbench.benchmarks import BenchmarkType, ReturnCode
+from superbench.benchmarks import BenchmarkType, ReturnCode, ReduceType
 from superbench.benchmarks.result import BenchmarkResult
@@ -67,9 +67,9 @@ def test_serialize_deserialize():
     """Test serialization/deserialization and compare the results."""
     # Result with one metric.
     result = BenchmarkResult('pytorch-bert-base1', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value, run_count=2)
-    result.add_result('metric1', 300)
-    result.add_result('metric1', 200)
-    result.add_result('metric2', 100)
+    result.add_result('metric1', 300, ReduceType.MAX)
+    result.add_result('metric1', 200, ReduceType.MAX)
+    result.add_result('metric2', 100, ReduceType.AVG)
     result.add_raw_data('metric1', [1, 2, 3])
     result.add_raw_data('metric1', [4, 5, 6])
     result.add_raw_data('metric1', [7, 8, 9])
@@ -82,6 +82,7 @@ def test_serialize_deserialize():
         '{"name": "pytorch-bert-base1", "type": "micro", "run_count": 2, "return_code": 0, '
         '"start_time": "2021-02-03 16:59:49", "end_time": "2021-02-03 17:00:08", '
         '"raw_data": {"metric1": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}, '
-        '"result": {"metric1": [300, 200], "metric2": [100]}}'
+        '"result": {"metric1": [300, 200], "metric2": [100]}, '
+        '"reduce": {"metric1": "max", "metric2": "avg"}}'
     )
     assert (result.to_string() == expected)