Unverified commit 44f0270e authored by guoshzhao, committed by GitHub

Benchmarks: Add Feature - Add return_code metric into result (#256)

**Description**
Add the return_code metric into the benchmark result and revise the unit tests accordingly.
parent 655f238d
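With this change, every BenchmarkResult carries a `return_code` entry in its `result` dict from construction onward, and the tests count metrics as `N + benchmark.default_metric_count` instead of hard-coding `N`. Below is a minimal, self-contained sketch of that pattern; the `FakeResult` class and the usage at the bottom are illustrative stand-ins, not the superbench implementation:

```python
# Minimal sketch of the pattern this commit introduces; FakeResult is a
# stand-in, not superbench.benchmarks.result.BenchmarkResult.
from enum import Enum


class ReturnCode(Enum):
    SUCCESS = 0
    INVALID_ARGUMENT = 1


class FakeResult:
    def __init__(self, return_code):
        # 'return_code' is stored as a regular metric from the start.
        self._result = {'return_code': [return_code.value]}

    def set_return_code(self, return_code):
        # Keep the stored metric in sync with the latest return code.
        self._result['return_code'][0] = return_code.value

    def add_result(self, metric, value):
        self._result.setdefault(metric, []).append(value)

    @property
    def result(self):
        return self._result

    @property
    def default_metric_count(self):
        # 'return_code' is currently the only default metric.
        return 1 if 'return_code' in self._result else 0


result = FakeResult(ReturnCode.SUCCESS)
result.add_result('metric1', 300)
# Tests now compare against N user metrics plus the default metrics.
assert len(result.result) == 1 + result.default_metric_count
result.set_return_code(ReturnCode.INVALID_ARGUMENT)
assert result.result['return_code'] == [ReturnCode.INVALID_ARGUMENT.value]
```

The diff below shows the actual changes in superbench/benchmarks and the revised unit tests.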
@@ -283,3 +283,8 @@ def result(self):
def serialized_result(self):
"""Decoration function to access benchmark result."""
return self._result.to_string()
@property
def default_metric_count(self):
"""Decoration function to get the count of default metrics."""
return self._result.default_metric_count
@@ -31,6 +31,7 @@ def __init__(self, name, type, return_code, run_count=0):
self.__end_time = None
self.__raw_data = dict()
self.__result = dict()
self.__result['return_code'] = [return_code.value]
self.__reduce_op = dict()
def __eq__(self, rhs):
@@ -119,6 +120,7 @@ def set_return_code(self, return_code):
return_code (ReturnCode): return code defined in superbench.benchmarks.ReturnCode.
"""
self.__return_code = return_code
self.__result['return_code'][0] = return_code.value
def to_string(self):
"""Serialize the BenchmarkResult object to string.
@@ -158,6 +160,15 @@ def return_code(self):
"""Decoration function to access __return_code."""
return self.__return_code
@property
def default_metric_count(self):
"""Decoration function to get the count of default metrics."""
count = 0
if 'return_code' in self.__result:
count += 1
return count
@property
def start_time(self):
"""Decoration function to access __start_time."""
@@ -45,8 +45,8 @@ def test_pytorch_computation_communication_overlap_normal():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (len(benchmark.raw_data) == benchmark.run_count * len(benchmark._args.kernel))
assert (len(benchmark.result) == benchmark.run_count * len(benchmark._args.kernel))
assert (len(benchmark.raw_data) == len(benchmark._args.kernel))
assert (len(benchmark.result) == len(benchmark._args.kernel) + benchmark.default_metric_count)
@decorator.cuda_test
@@ -79,6 +79,6 @@ def test_pytorch_computation_communication_overlap_fake_distributed():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (len(benchmark.raw_data) == benchmark.run_count * len(benchmark._args.kernel))
assert (len(benchmark.result) == benchmark.run_count * len(benchmark._args.kernel))
assert (len(benchmark.raw_data) == len(benchmark._args.kernel))
assert (len(benchmark.result) == len(benchmark._args.kernel) + benchmark.default_metric_count)
utils.clean_simulated_ddp_distributed_env()
@@ -42,7 +42,8 @@ def test_cublas_functions():
for metric in list(benchmark.result.keys()):
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
if metric != 'return_code':
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
# Test for custom configuration
custom_config_str = '{"name":"cublasCgemm","m":512,"n":512,"k":32,"transa":1,"transb":0}'
@@ -73,8 +74,9 @@ def test_cublas_functions():
assert (len(benchmark.raw_data['raw_output_0']) == 1)
assert (isinstance(benchmark.raw_data['raw_output_0'][0], str))
assert (1 == len(benchmark.result))
assert (1 + benchmark.default_metric_count == len(benchmark.result))
for metric in list(benchmark.result.keys()):
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
if metric != 'return_code':
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
@@ -42,7 +42,8 @@ def test_cudnn_functions():
for metric in list(benchmark.result.keys()):
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
if metric != 'return_code':
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
# Test for custom configuration
custom_config_str = '{"algo":0,"arrayLength":2,"convType":0,"dilationA":[1,1],"filterStrideA":[1,1],' \
@@ -77,8 +78,9 @@ def test_cudnn_functions():
assert (len(benchmark.raw_data['raw_output_0']) == 1)
assert (isinstance(benchmark.raw_data['raw_output_0'][0], str))
assert (1 == len(benchmark.result))
assert (1 + benchmark.default_metric_count == len(benchmark.result))
for metric in list(benchmark.result.keys()):
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
if metric != 'return_code':
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
@@ -509,7 +509,7 @@ def test_disk_performance_result_parsing(self):
assert (benchmark.return_code == ReturnCode.SUCCESS)
# bs + <read, write> x <iops, 95th, 99th, 99.9th>
assert (9 == len(benchmark.result.keys()))
assert (9 + benchmark.default_metric_count == len(benchmark.result.keys()))
assert (1 == len(benchmark.result[jobname_prefix + '_bs']))
assert (4096 == benchmark.result[jobname_prefix + '_bs'][0])
@@ -93,7 +93,7 @@ def test_gpcnet_network_test(self):
--------------------------------------------------------------------------
"""
assert (benchmark._process_raw_result(0, raw_output_no_execution))
assert (len(benchmark.result) == 0)
assert (len(benchmark.result) == benchmark.default_metric_count)
# Check function process_raw_data.
# Positive case - valid raw output.
@@ -250,7 +250,7 @@ def test_gpcnet_network_load(self): # noqa: C901
--------------------------------------------------------------------------
"""
assert (benchmark._process_raw_result(0, raw_output_no_execution))
assert (len(benchmark.result) == 0)
assert (len(benchmark.result) == benchmark.default_metric_count)
# Positive case - valid raw output.
assert (benchmark._process_raw_result(0, raw_output))
test_name = 'IsolatedNetworkTests'
@@ -112,12 +112,15 @@ def _test_gpu_copy_bw_performance_result_parsing(self, platform):
assert (1 == len(benchmark.raw_data))
print(test_raw_output.splitlines())
test_raw_output_dict = {x.split()[0]: float(x.split()[1]) for x in test_raw_output.strip().splitlines()}
assert (len(test_raw_output_dict) == len(benchmark.result))
assert (len(test_raw_output_dict) + benchmark.default_metric_count == len(benchmark.result))
for output_key in benchmark.result:
assert (len(benchmark.result[output_key]) == 1)
assert (isinstance(benchmark.result[output_key][0], numbers.Number))
assert (output_key in test_raw_output_dict)
assert (test_raw_output_dict[output_key] == benchmark.result[output_key][0])
if output_key == 'return_code':
assert (benchmark.result[output_key] == [0])
else:
assert (len(benchmark.result[output_key]) == 1)
assert (isinstance(benchmark.result[output_key][0], numbers.Number))
assert (output_key in test_raw_output_dict)
assert (test_raw_output_dict[output_key] == benchmark.result[output_key][0])
# Negative case - invalid raw output.
assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
@@ -133,7 +133,7 @@ def test_tensorrt_inference_result_parsing(self, test_raw_log):
self.assertTrue(benchmark._process_raw_result(0, test_raw_log))
self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
self.assertEqual(6, len(benchmark.result))
self.assertEqual(6 + benchmark.default_metric_count, len(benchmark.result))
for tag in ['mean', '99']:
self.assertEqual(0.5, benchmark.result[f'gpu_lat_ms_{tag}'][0])
self.assertEqual(0.6, benchmark.result[f'host_lat_ms_{tag}'][0])
@@ -225,7 +225,7 @@ def test_train():
'"start_time": null, "end_time": null, "raw_data": {'
'"steptime_train_float32": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
'"throughput_train_float32": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
'"result": {"steptime_train_float32": [2.0], "throughput_train_float32": [16000.0]}, '
'"result": {"return_code": [0], "steptime_train_float32": [2.0], "throughput_train_float32": [16000.0]}, '
'"reduce_op": {"steptime_train_float32": "max", "throughput_train_float32": "min"}}'
)
assert (benchmark._preprocess())
@@ -236,7 +236,7 @@ def test_train():
benchmark = create_benchmark('--num_steps 0')
expected_result = (
'{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
'"start_time": null, "end_time": null, "raw_data": {}, "result": {}, "reduce_op": {}}'
'"start_time": null, "end_time": null, "raw_data": {}, "result": {"return_code": [3]}, "reduce_op": {}}'
)
assert (benchmark._preprocess())
assert (benchmark._ModelBenchmark__train(Precision.FLOAT32) is False)
@@ -251,8 +251,8 @@ def test_inference():
'"start_time": null, "end_time": null, "raw_data": {'
'"steptime_inference_float16": [[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]], '
'"throughput_inference_float16": [[8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0]]}, '
'"result": {"steptime_inference_float16": [4.0], "throughput_inference_float16": [8000.0]}, '
'"reduce_op": {"steptime_inference_float16": null, "throughput_inference_float16": null}}'
'"result": {"return_code": [0], "steptime_inference_float16": [4.0], "throughput_inference_float16": '
'[8000.0]}, "reduce_op": {"steptime_inference_float16": null, "throughput_inference_float16": null}}'
)
assert (benchmark._preprocess())
assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16))
@@ -262,7 +262,7 @@ def test_inference():
benchmark = create_benchmark('--num_steps 0')
expected_result = (
'{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 3, '
'"start_time": null, "end_time": null, "raw_data": {}, "result": {}, "reduce_op": {}}'
'"start_time": null, "end_time": null, "raw_data": {}, "result": {"return_code": [3]}, "reduce_op": {}}'
)
assert (benchmark._preprocess())
assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16) is False)
@@ -287,6 +287,7 @@ def test_benchmark():
}
assert (benchmark.raw_data == expected_raw_data)
expected_result = {
'return_code': [0],
'steptime_train_float32': [2.0],
'throughput_train_float32': [16000.0],
'steptime_train_float16': [2.0],
@@ -300,7 +301,7 @@ def test_benchmark():
'"throughput_train_float32": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]], '
'"steptime_train_float16": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
'"throughput_train_float16": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
'"result": {"steptime_train_float32": [2.0], "throughput_train_float32": [16000.0], '
'"result": {"return_code": [0], "steptime_train_float32": [2.0], "throughput_train_float32": [16000.0], '
'"steptime_train_float16": [2.0], "throughput_train_float16": [16000.0]}, '
'"reduce_op": {"steptime_train_float32": "max", "throughput_train_float32": "min", '
'"steptime_train_float16": "max", "throughput_train_float16": "min"}}'
@@ -337,7 +338,7 @@ def test_check_result_format():
assert (benchmark._Benchmark__check_raw_data())
# Negative case for __check_result_format() - change List[int] to List[str].
benchmark._result._BenchmarkResult__result = {'metric1': ['2.0']}
benchmark._result._BenchmarkResult__result = {'return_code': [0], 'metric1': ['2.0']}
assert (benchmark._Benchmark__check_summarized_result() is False)
# Negative case for __check_raw_data() - change List[List[int]] to List[List[str]].
@@ -139,7 +139,7 @@ def test_launch_benchmark():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (benchmark.raw_data == {'accumulation_result': ['1,3,6,10']})
assert (benchmark.result == {'accumulation_result': [10]})
assert (benchmark.result == {'return_code': [0], 'accumulation_result': [10]})
# Replace the timestamp as null.
result = re.sub(r'\"\d+-\d+-\d+ \d+:\d+:\d+\"', 'null', benchmark.serialized_result)
@@ -147,7 +147,7 @@ def test_launch_benchmark():
'{"name": "accumulation", "type": "micro", "run_count": 1, '
'"return_code": 0, "start_time": null, "end_time": null, '
'"raw_data": {"accumulation_result": ["1,3,6,10"]}, '
'"result": {"accumulation_result": [10]}, '
'"result": {"return_code": [0], "accumulation_result": [10]}, '
'"reduce_op": {"accumulation_result": null}}'
)
assert (result == expected)
@@ -163,7 +163,7 @@ def test_launch_benchmark():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
assert (benchmark.raw_data == {'accumulation_result': ['1,3,6']})
assert (benchmark.result == {'accumulation_result': [6]})
assert (benchmark.result == {'return_code': [0], 'accumulation_result': [6]})
# Replace the timestamp as null.
result = re.sub(r'\"\d+-\d+-\d+ \d+:\d+:\d+\"', 'null', benchmark.serialized_result)
@@ -171,7 +171,7 @@ def test_launch_benchmark():
'{"name": "accumulation", "type": "micro", "run_count": 1, '
'"return_code": 0, "start_time": null, "end_time": null, '
'"raw_data": {"accumulation_result": ["1,3,6"]}, '
'"result": {"accumulation_result": [6]}, '
'"result": {"return_code": [0], "accumulation_result": [6]}, '
'"reduce_op": {"accumulation_result": null}}'
)
assert (result == expected)
@@ -9,27 +9,27 @@
def test_add_raw_data():
"""Test interface BenchmarkResult.add_raw_data()."""
result = BenchmarkResult('micro', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value)
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
result.add_raw_data('metric1', 'raw log 1')
result.add_raw_data('metric1', 'raw log 2')
assert (result.raw_data['metric1'][0] == 'raw log 1')
assert (result.raw_data['metric1'][1] == 'raw log 2')
assert (result.type == BenchmarkType.MICRO.value)
assert (result.return_code == ReturnCode.SUCCESS.value)
assert (result.type == BenchmarkType.MICRO)
assert (result.return_code == ReturnCode.SUCCESS)
result = BenchmarkResult('model', BenchmarkType.MODEL.value, ReturnCode.SUCCESS.value)
result = BenchmarkResult('model', BenchmarkType.MODEL, ReturnCode.SUCCESS)
result.add_raw_data('metric1', [1, 2, 3])
result.add_raw_data('metric1', [4, 5, 6])
assert (result.raw_data['metric1'][0] == [1, 2, 3])
assert (result.raw_data['metric1'][1] == [4, 5, 6])
assert (result.type == BenchmarkType.MODEL.value)
assert (result.return_code == ReturnCode.SUCCESS.value)
assert (result.type == BenchmarkType.MODEL)
assert (result.return_code == ReturnCode.SUCCESS)
def test_add_result():
"""Test interface BenchmarkResult.add_result()."""
result = BenchmarkResult('micro', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value)
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
result.add_result('metric1', 300)
result.add_result('metric1', 200)
assert (result.result['metric1'][0] == 300)
@@ -38,7 +38,7 @@ def test_add_result():
def test_set_timestamp():
"""Test interface BenchmarkResult.set_timestamp()."""
result = BenchmarkResult('micro', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value)
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
start_time = '2021-02-03 16:59:49'
end_time = '2021-02-03 17:00:08'
result.set_timestamp(start_time, end_time)
@@ -48,25 +48,28 @@ def test_set_timestamp():
def test_set_benchmark_type():
"""Test interface BenchmarkResult.set_benchmark_type()."""
result = BenchmarkResult('micro', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value)
result.set_benchmark_type(BenchmarkType.MICRO.value)
assert (result.type == BenchmarkType.MICRO.value)
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
result.set_benchmark_type(BenchmarkType.MICRO)
assert (result.type == BenchmarkType.MICRO)
def test_set_return_code():
"""Test interface BenchmarkResult.set_return_code()."""
result = BenchmarkResult('micro', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value)
assert (result.return_code == ReturnCode.SUCCESS.value)
result.set_return_code(ReturnCode.INVALID_ARGUMENT.value)
assert (result.return_code == ReturnCode.INVALID_ARGUMENT.value)
result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT.value)
assert (result.return_code == ReturnCode.INVALID_BENCHMARK_RESULT.value)
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
assert (result.return_code == ReturnCode.SUCCESS)
assert (result.result['return_code'] == [ReturnCode.SUCCESS.value])
result.set_return_code(ReturnCode.INVALID_ARGUMENT)
assert (result.return_code == ReturnCode.INVALID_ARGUMENT)
assert (result.result['return_code'] == [ReturnCode.INVALID_ARGUMENT.value])
result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
assert (result.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)
assert (result.result['return_code'] == [ReturnCode.INVALID_BENCHMARK_RESULT.value])
def test_serialize_deserialize():
"""Test serialization/deserialization and compare the results."""
# Result with one metric.
result = BenchmarkResult('pytorch-bert-base1', BenchmarkType.MICRO.value, ReturnCode.SUCCESS.value, run_count=2)
result = BenchmarkResult('pytorch-bert-base1', BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=2)
result.add_result('metric1', 300, ReduceType.MAX)
result.add_result('metric1', 200, ReduceType.MAX)
result.add_result('metric2', 100, ReduceType.AVG)
@@ -76,13 +79,13 @@ def test_serialize_deserialize():
start_time = '2021-02-03 16:59:49'
end_time = '2021-02-03 17:00:08'
result.set_timestamp(start_time, end_time)
result.set_benchmark_type(BenchmarkType.MICRO.value)
result.set_benchmark_type(BenchmarkType.MICRO)
expected = (
'{"name": "pytorch-bert-base1", "type": "micro", "run_count": 2, "return_code": 0, '
'"start_time": "2021-02-03 16:59:49", "end_time": "2021-02-03 17:00:08", '
'"raw_data": {"metric1": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}, '
'"result": {"metric1": [300, 200], "metric2": [100]}, '
'"result": {"return_code": [0], "metric1": [300, 200], "metric2": [100]}, '
'"reduce_op": {"metric1": "max", "metric2": "avg"}}'
)
assert (result.to_string() == expected)
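As a quick sanity check on the serialized shape, the `return_code` entry now leads the `result` object and mirrors the top-level `return_code` field. A small hypothetical round-trip check (not part of the repo's tests) over the expected string above:

```python
# Hypothetical round-trip check (not from the repo's tests) over the expected
# serialized string shown in test_serialize_deserialize above.
import json

serialized = (
    '{"name": "pytorch-bert-base1", "type": "micro", "run_count": 2, "return_code": 0, '
    '"start_time": "2021-02-03 16:59:49", "end_time": "2021-02-03 17:00:08", '
    '"raw_data": {"metric1": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}, '
    '"result": {"return_code": [0], "metric1": [300, 200], "metric2": [100]}, '
    '"reduce_op": {"metric1": "max", "metric2": "avg"}}'
)
parsed = json.loads(serialized)
# The result dict now always contains the default 'return_code' metric,
# and its value matches the top-level return_code field.
assert parsed['result']['return_code'] == [parsed['return_code']]
assert len(parsed['result']) == 2 + 1  # two user metrics plus one default metric
```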