Unverified Commit 5a88db16 authored by Yuting Jiang, committed by GitHub

Benchmarks: Support error tolerance in micro-benchmark for CuDNN function (#490)

**Description**
Support error tolerance in micro-benchmark for CuDNN function


**Major Revision**
- revise micro_base so that when one command fails, the remaining commands in the micro-benchmark still run (see the control-flow sketch below)
- enable error tolerance (set `tolerant_fail` to true) in the cuDNN function benchmarks
parent b808135c
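For reference, the revised control flow in `_benchmark` can be summarized by the following runnable sketch. It is a simplified stand-in, not the actual micro_base code: `subprocess.run` replaces SuperBench's process runner, and result parsing is folded into the return-code check.

```python
import subprocess

def benchmark(commands, tolerant_fail=False):
    # Accumulate failures in `ret` instead of returning on the first one.
    ret = True
    for cmd in commands:
        proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        if proc.returncode != 0:
            ret = False
        # The default behavior is still fail-fast; only tolerant_fail keeps going.
        if not tolerant_fail and ret is False:
            return False
    return ret

print(benchmark(['true', 'false', 'true']))                      # False: stops at 'false'
print(benchmark(['true', 'false', 'true'], tolerant_fail=True))  # False: but all three run
```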
@@ -367,6 +367,7 @@ def _preprocess(self):
         if not super()._preprocess():
             return False

+        self._args.tolerant_fail = True
         command = os.path.join(self._args.bin_dir, self._bin_name)
         command += (' --num_test ' + str(self._args.num_steps))
         command += (' --warm_up ' + str(self._args.num_warmup))
@@ -440,13 +441,14 @@ def _process_raw_result(self, cmd_idx, raw_output):
                         self._curr_run_index, cmd_idx, self._name, raw_output, str(e)
                     )
                 )
-                return False
+                error = True
         if error:
             logger.error(
                 'Error in running cudnn test - round: {}, index of cmd: {}, benchmark: {}, raw data: {}'.format(
                     self._curr_run_index, cmd_idx, self._name, raw_output
                 )
             )
+            self._result.add_result(metric.lower() + '_time', -1)
             return False
         return True
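The pattern above, in isolation: instead of aborting on the first parsing error, the error is flagged, the failed metric is recorded with a -1 sentinel so it still appears in the results, and the command is reported as failed. A minimal hypothetical sketch (not the real cuDNN parser):

```python
# Hypothetical stand-in for the parsing step; only the error-handling shape matches.
def process_raw_result(metric, raw_output, results):
    error = False
    try:
        results[metric.lower() + '_time'] = float(raw_output)
    except (TypeError, ValueError):
        error = True
    if error:
        # -1 is a sentinel marking a failed run for this metric.
        results[metric.lower() + '_time'] = -1
        return False
    return True

results = {}
print(process_raw_result('Fwd', '0.123', results), results)  # True {'fwd_time': 0.123}
print(process_raw_result('Bwd', 'oops', results), results)   # False ... 'bwd_time': -1
```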
...
@@ -110,6 +110,12 @@ def add_parser_arguments(self):
             required=False,
             help='Specify the directory of the benchmark binary.',
         )
+        self._parser.add_argument(
+            '--tolerant_fail',
+            action='store_true',
+            default=False,
+            help='Tolerant failure for sub microbenchmark.',
+        )

     def _set_binary_path(self):
         """Search the binary from self._args.bin_dir or from system environment path and set the binary directory.
@@ -166,6 +172,7 @@ def _benchmark(self):
         Return:
             True if run benchmark successfully.
         """
+        ret = True
         for cmd_idx in range(len(self._commands)):
             logger.info(
                 'Execute command - round: {}, benchmark: {}, command: {}.'.format(
@@ -181,13 +188,15 @@ def _benchmark(self):
                         self._curr_run_index, self._name, output.stdout
                     )
                 )
-                return False
+                ret = False
             else:
                 if not self._process_raw_result(cmd_idx, output.stdout):
                     self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
-                    return False
+                    ret = False
+            if not self._args.tolerant_fail and ret is False:
+                return False

-        return True
+        return ret

     @abstractmethod
     def _process_raw_result(self, cmd_idx, raw_output):
...
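Taken together: `_benchmark` now accumulates failures in `ret` rather than returning immediately, and the `if not self._args.tolerant_fail and ret is False` check preserves the old fail-fast behavior when the flag is unset. The cuDNN function benchmark forces `tolerant_fail = True` in `_preprocess`, so all of its generated commands run to completion, and a failing function is reported with a -1 sentinel metric instead of aborting the whole benchmark.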