Unverified Commit d877ca23 authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Benchmarks: Add Feature - Add timeout feature for each benchmark. (#288)

**Description**
Add timeout feature for each benchmark.

**Major Revision**
- Add a `timeout` config for each benchmark. In the current config files, the timeout is only set for kernel-launch as an example. Other benchmarks can be set in the future.
- Set the timeout config for `ansible_runner.run()`. When the timeout is triggered, the runner will get return code 254:
   [ansible.py:80][WARNING] Run failed, return code 254.
- Using `timeout` command to terminate the client process.
parent f283b536
...@@ -203,6 +203,7 @@ Definition for each benchmark, here is an overview of `Benchmark` configuration ...@@ -203,6 +203,7 @@ Definition for each benchmark, here is an overview of `Benchmark` configuration
```yaml ```yaml
${benchmark_name}: ${benchmark_name}:
enable: bool enable: bool
timeout: int
modes: [ Mode ] modes: [ Mode ]
frameworks: [ enum ] frameworks: [ enum ]
parameters: parameters:
...@@ -216,6 +217,7 @@ ${benchmark_name}: ...@@ -216,6 +217,7 @@ ${benchmark_name}:
```yaml ```yaml
model-benchmarks:${annotation}: model-benchmarks:${annotation}:
enable: bool enable: bool
timeout: int
modes: [ Mode ] modes: [ Mode ]
frameworks: [ enum ] frameworks: [ enum ]
models: [ enum ] models: [ enum ]
...@@ -240,6 +242,7 @@ model-benchmarks:${annotation}: ...@@ -240,6 +242,7 @@ model-benchmarks:${annotation}:
```yaml ```yaml
kernel-launch: kernel-launch:
enable: true enable: true
timeout: 120
modes: modes:
- name: local - name: local
proc_num: 8 proc_num: 8
...@@ -256,6 +259,7 @@ kernel-launch: ...@@ -256,6 +259,7 @@ kernel-launch:
```yaml ```yaml
model-benchmarks:resnet: model-benchmarks:resnet:
enable: true enable: true
timeout: 1800
modes: modes:
- name: torch.distributed - name: torch.distributed
proc_num: 8 proc_num: 8
...@@ -287,6 +291,12 @@ Enable current benchmark or not, can be overwritten by [`superbench.enable`](#su ...@@ -287,6 +291,12 @@ Enable current benchmark or not, can be overwritten by [`superbench.enable`](#su
* default value: `true` * default value: `true`
### `timeout`
Set the timeout value in seconds; the benchmarking will stop early if the timeout is triggered.
* default value: none
### `modes` ### `modes`
A list of modes in which the benchmark runs. A list of modes in which the benchmark runs.
......
...@@ -102,12 +102,13 @@ def __get_enabled_benchmarks(self): ...@@ -102,12 +102,13 @@ def __get_enabled_benchmarks(self):
return list(self._sb_config.superbench.enable) return list(self._sb_config.superbench.enable)
return [k for k, v in self._sb_benchmarks.items() if v.enable] return [k for k, v in self._sb_benchmarks.items() if v.enable]
def __get_mode_command(self, benchmark_name, mode): def __get_mode_command(self, benchmark_name, mode, timeout=None):
"""Get runner command for given mode. """Get runner command for given mode.
Args: Args:
benchmark_name (str): Benchmark name. benchmark_name (str): Benchmark name.
mode (DictConfig): Runner mode. mode (DictConfig): Runner mode.
timeout (int): The timeout value in seconds.
Return: Return:
str: Runner command. str: Runner command.
...@@ -116,6 +117,9 @@ def __get_mode_command(self, benchmark_name, mode): ...@@ -116,6 +117,9 @@ def __get_mode_command(self, benchmark_name, mode):
name=benchmark_name, name=benchmark_name,
output_dir=self._sb_output_dir, output_dir=self._sb_output_dir,
) )
if timeout is not None:
exec_command = 'timeout {timeout} {command}'.format(timeout=timeout, command=exec_command)
mode_command = exec_command mode_command = exec_command
if mode.name == 'local': if mode.name == 'local':
mode_command = '{prefix} {command}'.format( mode_command = '{prefix} {command}'.format(
...@@ -353,14 +357,19 @@ def _run_proc(self, benchmark_name, mode, vars): ...@@ -353,14 +357,19 @@ def _run_proc(self, benchmark_name, mode, vars):
""" """
mode.update(vars) mode.update(vars)
logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank) logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank)
timeout = self._sb_benchmarks[benchmark_name].timeout
ansible_runner_config = self._ansible_client.get_shell_config( ansible_runner_config = self._ansible_client.get_shell_config(
( (
'docker exec sb-workspace bash -c ' 'docker exec sb-workspace bash -c '
"'set -o allexport && source sb.env && set +o allexport && {command}'" "'set -o allexport && source sb.env && set +o allexport && {command}'"
).format(command=self.__get_mode_command(benchmark_name, mode)) ).format(command=self.__get_mode_command(benchmark_name, mode, timeout))
) )
if mode.name == 'mpi': if mode.name == 'mpi':
ansible_runner_config = self._ansible_client.update_mpi_config(ansible_runner_config) ansible_runner_config = self._ansible_client.update_mpi_config(ansible_runner_config)
ansible_runner_config['timeout'] = timeout
rc = self._ansible_client.run(ansible_runner_config, sudo=True) rc = self._ansible_client.run(ansible_runner_config, sudo=True)
return rc return rc
......
...@@ -170,6 +170,17 @@ def test_get_mode_command(self): ...@@ -170,6 +170,17 @@ def test_get_mode_command(self):
), test_case['expected_command'] ), test_case['expected_command']
) )
test_case['timeout'] = 10
timeout_str = 'timeout {} '.format(test_case['timeout'])
index = test_case['expected_command'].find('sb exec')
expected_command = test_case['expected_command'][:index] + timeout_str + test_case['expected_command'][
index:]
self.assertEqual(
self.runner._SuperBenchRunner__get_mode_command(
test_case['benchmark_name'], OmegaConf.create(test_case['mode']), test_case['timeout']
), expected_command
)
def test_run_empty_benchmarks(self): def test_run_empty_benchmarks(self):
"""Test run empty benchmarks, nothing should happen.""" """Test run empty benchmarks, nothing should happen."""
self.runner._sb_enabled_benchmarks = [] self.runner._sb_enabled_benchmarks = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment