Unverified Commit d877ca23 authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Benchmarks: Add Feature - Add timeout feature for each benchmark. (#288)

**Description**
Add timeout feature for each benchmark.

**Major Revision**
- Add a `timeout` config for each benchmark. In the current config files, the timeout is only set for kernel-launch as an example. Other benchmarks can be set in the future.
- Set the timeout config for `ansible_runner.run()`. When the timeout is triggered, the runner will get return code 254:
   [ansible.py:80][WARNING] Run failed, return code 254.
- Using `timeout` command to terminate the client process.
parent f283b536
...@@ -203,6 +203,7 @@ Definition for each benchmark, here is an overview of `Benchmark` configuration ...@@ -203,6 +203,7 @@ Definition for each benchmark, here is an overview of `Benchmark` configuration
```yaml ```yaml
${benchmark_name}: ${benchmark_name}:
enable: bool enable: bool
timeout: int
modes: [ Mode ] modes: [ Mode ]
frameworks: [ enum ] frameworks: [ enum ]
parameters: parameters:
...@@ -216,6 +217,7 @@ ${benchmark_name}: ...@@ -216,6 +217,7 @@ ${benchmark_name}:
```yaml ```yaml
model-benchmarks:${annotation}: model-benchmarks:${annotation}:
enable: bool enable: bool
timeout: int
modes: [ Mode ] modes: [ Mode ]
frameworks: [ enum ] frameworks: [ enum ]
models: [ enum ] models: [ enum ]
...@@ -240,6 +242,7 @@ model-benchmarks:${annotation}: ...@@ -240,6 +242,7 @@ model-benchmarks:${annotation}:
```yaml ```yaml
kernel-launch: kernel-launch:
enable: true enable: true
timeout: 120
modes: modes:
- name: local - name: local
proc_num: 8 proc_num: 8
...@@ -256,6 +259,7 @@ kernel-launch: ...@@ -256,6 +259,7 @@ kernel-launch:
```yaml ```yaml
model-benchmarks:resnet: model-benchmarks:resnet:
enable: true enable: true
timeout: 1800
modes: modes:
- name: torch.distributed - name: torch.distributed
proc_num: 8 proc_num: 8
...@@ -287,6 +291,12 @@ Enable current benchmark or not, can be overwritten by [`superbench.enable`](#su ...@@ -287,6 +291,12 @@ Enable current benchmark or not, can be overwritten by [`superbench.enable`](#su
* default value: `true` * default value: `true`
### `timeout`
Set the timeout value in seconds; the benchmarking will stop early if the timeout is triggered.
* default value: none
### `modes` ### `modes`
A list of modes in which the benchmark runs. A list of modes in which the benchmark runs.
......
...@@ -102,12 +102,13 @@ def __get_enabled_benchmarks(self): ...@@ -102,12 +102,13 @@ def __get_enabled_benchmarks(self):
return list(self._sb_config.superbench.enable) return list(self._sb_config.superbench.enable)
return [k for k, v in self._sb_benchmarks.items() if v.enable] return [k for k, v in self._sb_benchmarks.items() if v.enable]
def __get_mode_command(self, benchmark_name, mode): def __get_mode_command(self, benchmark_name, mode, timeout=None):
"""Get runner command for given mode. """Get runner command for given mode.
Args: Args:
benchmark_name (str): Benchmark name. benchmark_name (str): Benchmark name.
mode (DictConfig): Runner mode. mode (DictConfig): Runner mode.
timeout (int): The timeout value in seconds.
Return: Return:
str: Runner command. str: Runner command.
...@@ -116,6 +117,9 @@ def __get_mode_command(self, benchmark_name, mode): ...@@ -116,6 +117,9 @@ def __get_mode_command(self, benchmark_name, mode):
name=benchmark_name, name=benchmark_name,
output_dir=self._sb_output_dir, output_dir=self._sb_output_dir,
) )
if timeout is not None:
exec_command = 'timeout {timeout} {command}'.format(timeout=timeout, command=exec_command)
mode_command = exec_command mode_command = exec_command
if mode.name == 'local': if mode.name == 'local':
mode_command = '{prefix} {command}'.format( mode_command = '{prefix} {command}'.format(
...@@ -353,14 +357,19 @@ def _run_proc(self, benchmark_name, mode, vars): ...@@ -353,14 +357,19 @@ def _run_proc(self, benchmark_name, mode, vars):
""" """
mode.update(vars) mode.update(vars)
logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank) logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank)
timeout = self._sb_benchmarks[benchmark_name].timeout
ansible_runner_config = self._ansible_client.get_shell_config( ansible_runner_config = self._ansible_client.get_shell_config(
( (
'docker exec sb-workspace bash -c ' 'docker exec sb-workspace bash -c '
"'set -o allexport && source sb.env && set +o allexport && {command}'" "'set -o allexport && source sb.env && set +o allexport && {command}'"
).format(command=self.__get_mode_command(benchmark_name, mode)) ).format(command=self.__get_mode_command(benchmark_name, mode, timeout))
) )
if mode.name == 'mpi': if mode.name == 'mpi':
ansible_runner_config = self._ansible_client.update_mpi_config(ansible_runner_config) ansible_runner_config = self._ansible_client.update_mpi_config(ansible_runner_config)
ansible_runner_config['timeout'] = timeout
rc = self._ansible_client.run(ansible_runner_config, sudo=True) rc = self._ansible_client.run(ansible_runner_config, sudo=True)
return rc return rc
......
...@@ -170,6 +170,17 @@ def test_get_mode_command(self): ...@@ -170,6 +170,17 @@ def test_get_mode_command(self):
), test_case['expected_command'] ), test_case['expected_command']
) )
test_case['timeout'] = 10
timeout_str = 'timeout {} '.format(test_case['timeout'])
index = test_case['expected_command'].find('sb exec')
expected_command = test_case['expected_command'][:index] + timeout_str + test_case['expected_command'][
index:]
self.assertEqual(
self.runner._SuperBenchRunner__get_mode_command(
test_case['benchmark_name'], OmegaConf.create(test_case['mode']), test_case['timeout']
), expected_command
)
def test_run_empty_benchmarks(self): def test_run_empty_benchmarks(self):
"""Test run empty benchmarks, nothing should happen.""" """Test run empty benchmarks, nothing should happen."""
self.runner._sb_enabled_benchmarks = [] self.runner._sb_enabled_benchmarks = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment