Unverified Commit 6d895da8 authored by guoshzhao, committed by GitHub

Benchmarks: Add Feature - Provide option to save raw data into file. (#333)

**Description**
Use the config option `log_raw_data` to control whether the raw data is logged into a file. The default value is `no`. It can be set to `yes` for particular benchmarks, such as the NCCL/RCCL tests, to save their raw data into a file.
parent d368d90e
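For context, here is a minimal sketch of what the new third argument to `add_raw_data` does, based on the changes in this diff (the benchmark name, metric names, and values are illustrative):

```python
import os

from superbench.benchmarks import BenchmarkType, ReturnCode
from superbench.benchmarks.result import BenchmarkResult

result = BenchmarkResult('demo', BenchmarkType.MICRO, ReturnCode.SUCCESS)

# log_raw_data=False (the default): raw data stays in the result object.
result.add_raw_data('raw_output_0', 'small benchmark output', False)
assert result.raw_data['raw_output_0'] == ['small benchmark output']

# log_raw_data=True: raw data is appended to rawdata.log in the current
# working directory instead of being kept in memory.
result.add_raw_data('raw_output_1', 'very large benchmark output', True)
assert os.path.isfile(os.path.join(os.getcwd(), 'rawdata.log'))
```

This is mainly useful for benchmarks whose raw output is too large to keep in the result object, such as the NCCL/RCCL tests mentioned above.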
......@@ -127,7 +127,9 @@ def _process_raw_result(self, cmd_idx, raw_output):
Return:
True if the raw output string is valid and result can be extracted.
"""
self._result.add_raw_data(f'raw_output_{self._args.pytorch_models[cmd_idx]}', raw_output)
self._result.add_raw_data(
f'raw_output_{self._args.pytorch_models[cmd_idx]}', raw_output, self._args.log_raw_data
)
success = False
try:
......
......@@ -400,8 +400,8 @@ def __process_model_result(self, model_action, precision, step_times):
# The unit of step time is millisecond, use it to calculate the throughput with the unit samples/sec.
millisecond_per_second = 1000
throughput = [millisecond_per_second / step_time * self._args.batch_size for step_time in step_times]
self._result.add_raw_data(metric_s, step_times)
self._result.add_raw_data(metric_t, throughput)
self._result.add_raw_data(metric_s, step_times, self._args.log_raw_data)
self._result.add_raw_data(metric_t, throughput, self._args.log_raw_data)
if model_action == ModelAction.TRAIN:
if not self._sync_result(step_times):
......
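As a quick sanity check of the throughput formula in the hunk above (the step times and batch size here are made up):

```python
# Step times are in milliseconds, so throughput in samples/sec is
# 1000 / step_time * batch_size.
batch_size = 32
step_times = [5.0, 4.0]  # illustrative values, in ms
throughput = [1000 / t * batch_size for t in step_times]
print(throughput)  # [6400.0, 8000.0]
```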
......@@ -3,6 +3,7 @@
"""A module for unified result of benchmarks."""
import os
import json
from enum import Enum
......@@ -46,7 +47,7 @@ def __eq__(self, rhs):
"""
return self.__dict__ == rhs.__dict__
def add_raw_data(self, metric, value):
def add_raw_data(self, metric, value, log_raw_data):
"""Add raw benchmark data into result.
Args:
......@@ -54,6 +55,7 @@ def add_raw_data(self, metric, value):
value (str or list): raw benchmark data.
For e2e model benchmarks, its type is list.
For micro-benchmarks or docker-benchmarks, its type is string.
log_raw_data (bool): whether to log raw data into file instead of saving it into result object.
Return:
True if succeed to add the raw data.
......@@ -64,6 +66,11 @@ def add_raw_data(self, metric, value):
)
return False
if log_raw_data:
with open(os.path.join(os.getcwd(), 'rawdata.log'), 'a') as f:
f.write('metric:{}\n'.format(metric))
f.write('rawdata:{}\n\n'.format(value))
else:
if metric not in self.__raw_data:
self.__raw_data[metric] = list()
self.__raw_data[metric].append(value)
......
......@@ -200,6 +200,8 @@ def exec(self):
benchmark_config = self._sb_benchmarks[benchmark_name]
benchmark_results = list()
self.__create_benchmark_dir(benchmark_name)
cwd = os.getcwd()
os.chdir(self.__get_benchmark_dir(benchmark_name))
monitor = None
if self.__get_rank_id() == 0 and self._sb_monitor_config and self._sb_monitor_config.enable:
......@@ -243,3 +245,4 @@ def exec(self):
if monitor:
monitor.stop()
self.__write_benchmark_results(benchmark_name, benchmark_results)
os.chdir(cwd)
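The executor change above switches into the per-benchmark output directory before running and restores the original directory afterwards, so a benchmark that logs raw data ends up with `rawdata.log` next to its other results. A sketch of the same save/restore idea wrapped in a context manager (the output path below is an assumption, not the executor's real layout):

```python
import os
from contextlib import contextmanager


@contextmanager
def pushd(path):
    """Temporarily change the working directory and restore it on exit."""
    cwd = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(cwd)


# Hypothetical per-benchmark output directory; in the executor the real path
# comes from self.__get_benchmark_dir(benchmark_name).
out_dir = 'outputs/benchmarks/demo'
os.makedirs(out_dir, exist_ok=True)
with pushd(out_dir):
    # Anything appending to 'rawdata.log' now lands in the benchmark directory.
    with open('rawdata.log', 'a') as f:
        f.write('metric:demo\nrawdata:1,2,3\n\n')
```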
......@@ -33,7 +33,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
Return:
True if the raw output string is valid and result can be extracted.
"""
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
pattern = r'\d+\.\d+'
result = re.findall(pattern, raw_output)
if len(result) != 2:
......
......@@ -3,6 +3,8 @@
"""Tests for RocmOnnxRuntimeModelBenchmark modules."""
from types import SimpleNamespace
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, Platform, ReturnCode
from superbench.benchmarks.result import BenchmarkResult
......@@ -20,6 +22,7 @@ def test_rocm_onnxruntime_performance():
assert (benchmark._entrypoint == '/stage/onnxruntime-training-examples/huggingface/azureml/run_benchmark.sh')
assert (benchmark._cmd is None)
benchmark._result = BenchmarkResult(benchmark._name, benchmark._benchmark_type, ReturnCode.SUCCESS)
benchmark._args = SimpleNamespace(log_raw_data=False)
raw_output = """
__superbench__ begin bert-large-uncased ngpu=1
......
......@@ -54,7 +54,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
Return:
True if the raw output string is valid and result can be extracted.
"""
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
try:
params = raw_output.strip('\n').split('--')
......
......@@ -53,7 +53,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
Return:
True if the raw output string is valid and result can be extracted.
"""
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
try:
params = raw_output.strip('\n').split(' memory=')
......
......@@ -69,7 +69,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
Return:
True if the raw output string is valid and result can be extracted.
"""
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
pattern = r'\d+\.\d+'
result = re.findall(pattern, raw_output)
if len(result) != 2:
......
......@@ -121,7 +121,7 @@ def test_tensorrt_inference_result_parsing(self, test_raw_log):
"""Test tensorrt-inference benchmark result parsing."""
(benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
benchmark = benchmark_cls(self.benchmark_name, parameters='')
benchmark._args = SimpleNamespace(pytorch_models=['model_0', 'model_1'])
benchmark._args = SimpleNamespace(pytorch_models=['model_0', 'model_1'], log_raw_data=False)
benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
# Positive case - valid raw output
......
......@@ -158,6 +158,8 @@ def test_arguments_related_interfaces():
--duration int The elapsed time of benchmark in seconds.
--force_fp32 Enable option to use full float32 precision.
--hidden_size int Hidden size.
--log_raw_data Log raw data into file instead of saving it into
result object.
--model_action ModelAction [ModelAction ...]
Benchmark model process. E.g. train inference.
--no_gpu Disable GPU training.
......@@ -192,6 +194,8 @@ def test_preprocess():
--duration int The elapsed time of benchmark in seconds.
--force_fp32 Enable option to use full float32 precision.
--hidden_size int Hidden size.
--log_raw_data Log raw data into file instead of saving it into
result object.
--model_action ModelAction [ModelAction ...]
Benchmark model process. E.g. train inference.
--no_gpu Disable GPU training.
......
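The two expected help-text blocks above come from a new boolean argument. A hedged sketch of how such a flag is typically declared with argparse (this is not necessarily the exact code in the commit):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--log_raw_data',
    action='store_true',
    default=False,
    help='Log raw data into file instead of saving it into result object.',
)

args = parser.parse_args(['--log_raw_data'])
assert args.log_raw_data is True
args = parser.parse_args([])
assert args.log_raw_data is False
```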
......@@ -49,7 +49,7 @@ def _benchmark(self):
raw_data.append(str(result))
metric = 'accumulation_result'
self._result.add_raw_data(metric, ','.join(raw_data))
self._result.add_raw_data(metric, ','.join(raw_data), self._args.log_raw_data)
self._result.add_result(metric, result)
return True
......@@ -114,6 +114,8 @@ def test_get_benchmark_configurable_settings():
expected = """optional arguments:
--duration int The elapsed time of benchmark in seconds.
--log_raw_data Log raw data into file instead of saving it into result
object.
--lower_bound int The lower bound for accumulation.
--run_count int The run count of benchmark.
--upper_bound int The upper bound for accumulation."""
......
......@@ -3,6 +3,8 @@
"""Tests for BenchmarkResult module."""
import os
from superbench.benchmarks import BenchmarkType, ReturnCode, ReduceType
from superbench.benchmarks.result import BenchmarkResult
......@@ -10,22 +12,31 @@
def test_add_raw_data():
"""Test interface BenchmarkResult.add_raw_data()."""
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
result.add_raw_data('metric1', 'raw log 1')
result.add_raw_data('metric1', 'raw log 2')
result.add_raw_data('metric1', 'raw log 1', False)
result.add_raw_data('metric1', 'raw log 2', False)
assert (result.raw_data['metric1'][0] == 'raw log 1')
assert (result.raw_data['metric1'][1] == 'raw log 2')
assert (result.type == BenchmarkType.MICRO)
assert (result.return_code == ReturnCode.SUCCESS)
result = BenchmarkResult('model', BenchmarkType.MODEL, ReturnCode.SUCCESS)
result.add_raw_data('metric1', [1, 2, 3])
result.add_raw_data('metric1', [4, 5, 6])
result.add_raw_data('metric1', [1, 2, 3], False)
result.add_raw_data('metric1', [4, 5, 6], False)
assert (result.raw_data['metric1'][0] == [1, 2, 3])
assert (result.raw_data['metric1'][1] == [4, 5, 6])
assert (result.type == BenchmarkType.MODEL)
assert (result.return_code == ReturnCode.SUCCESS)
# Test log_raw_data = True.
result = BenchmarkResult('micro', BenchmarkType.MICRO, ReturnCode.SUCCESS)
result.add_raw_data('metric1', 'raw log 1', True)
result.add_raw_data('metric1', 'raw log 2', True)
assert (result.type == BenchmarkType.MICRO)
assert (result.return_code == ReturnCode.SUCCESS)
raw_data_file = os.path.join(os.getcwd(), 'rawdata.log')
assert (os.path.isfile(raw_data_file))
os.remove(raw_data_file)
def test_add_result():
"""Test interface BenchmarkResult.add_result()."""
......@@ -73,9 +84,9 @@ def test_serialize_deserialize():
result.add_result('metric1', 300, ReduceType.MAX)
result.add_result('metric1', 200, ReduceType.MAX)
result.add_result('metric2', 100, ReduceType.AVG)
result.add_raw_data('metric1', [1, 2, 3])
result.add_raw_data('metric1', [4, 5, 6])
result.add_raw_data('metric1', [7, 8, 9])
result.add_raw_data('metric1', [1, 2, 3], False)
result.add_raw_data('metric1', [4, 5, 6], False)
result.add_raw_data('metric1', [7, 8, 9], False)
start_time = '2021-02-03 16:59:49'
end_time = '2021-02-03 17:00:08'
result.set_timestamp(start_time, end_time)
......