Support node_num=1 in mpi mode (#372)

Support `node_num: 1` in mpi mode, so that MPI benchmarks can be run on either a single node or all nodes from the same config just by changing `node_num`. Update the docs and add a test case accordingly.

Support node_num=1 in mpi mode (#372)
Support `node_num: 1` in mpi mode, so that MPI benchmarks can be run on either a single node or all nodes from the same config just by changing `node_num`. Update the docs and add a test case accordingly.
e00a8180 · Yifan Xiong · GitHub · 9f03d568 · e00a8180 · e00a8180
Unverified Commit e00a8180 authored Jul 08, 2022 by Yifan Xiong Committed by GitHub Jul 08, 2022
Showing with 30 additions and 6 deletions

docs/superbench-config.mdx +2 -2

superbench/runner/runner.py +4 -4

tests/runner/test_runner.py +24 -0

No files found.
--- a/docs/superbench-config.mdx
+++ b/docs/superbench-config.mdx
@@ -396,7 +396,7 @@ Some attributes may only be suitable for specific mode.
 |            | `local` | `torch.distributed` | `mpi` |
 | ---------: | :-----: | :-----------------: | :---: |
 | `proc_num` |    ✓    |          ✓          |   ✓   |
-| `node_num` |    ✘    |          ✓          |   ✘   |
+| `node_num` |    ✘    |          ✓          |   ✓   |
 | `prefix`   |    ✓    |          ✘          |   ✘   |
 | `env`      |    ✓    |          ✓          |   ✓   |
 | `mca`      |    ✘    |          ✘          |   ✓   |
@@ -414,7 +414,7 @@ Each process will run an individual benchmark, how processes communicate depends

 ### `node_num`

-Node number to run in the mode. Defaults to all nodes in the run.
+Node number to run in the mode. Defaults to all nodes provided by host file in the run.
 Will be ignored in `local` mode.

 For example, assuming you are running model benchmark on 4 nodes,

--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -143,12 +143,12 @@ class SuperBenchRunner():
                'mpirun '    # use default OpenMPI in image
                '-tag-output '    # tag mpi output with [jobid,rank]<stdout/stderr> prefix
                '-allow-run-as-root '    # allow mpirun to run when executed by root user
-                '-hostfile hostfile '    # use prepared hostfile
-                '-map-by ppr:{proc_num}:node '    # launch {proc_num} processes on each node
+                '{host_list} '    # use prepared hostfile and launch {proc_num} processes on each node
                '-bind-to numa '    # bind processes to numa
                '{mca_list} {env_list} {command}'
            ).format(
-                proc_num=mode.proc_num,
+                host_list=f'-host localhost:{mode.proc_num}'
+                if mode.node_num == 1 else f'-hostfile hostfile -map-by ppr:{mode.proc_num}:node',
                mca_list=' '.join(f'-mca {k} {v}' for k, v in mode.mca.items()),
                env_list=' '.join(
                    f'-x {k}={str(v).format(proc_rank=mode.proc_rank, proc_num=mode.proc_num)}'
@@ -402,7 +402,7 @@ class SuperBenchRunner():
        ansible_runner_config = self._ansible_client.get_shell_config(
            fcmd.format(env_list=env_list, command=self.__get_mode_command(benchmark_name, mode, timeout))
        )
-        if mode.name == 'mpi':
+        if mode.name == 'mpi' and mode.node_num != 1:
            ansible_runner_config = self._ansible_client.update_mpi_config(ansible_runner_config)

        ansible_runner_config['timeout'] = timeout

--- a/tests/runner/test_runner.py
+++ b/tests/runner/test_runner.py
@@ -168,6 +168,30 @@ class RunnerTestCase(unittest.TestCase):
                    f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo'
                ),
            },
+            {
+                'benchmark_name':
+                'foo',
+                'mode': {
+                    'name': 'mpi',
+                    'node_num': 1,
+                    'proc_num': 8,
+                    'proc_rank': 2,
+                    'mca': {
+                        'coll_hcoll_enable': 0,
+                    },
+                    'env': {
+                        'SB_MICRO_PATH': '/sb',
+                        'FOO': 'BAR',
+                        'RANK': '{proc_rank}',
+                        'NUM': '{proc_num}',
+                    },
+                },
+                'expected_command': (
+                    'mpirun -tag-output -allow-run-as-root -host localhost:8 -bind-to numa '
+                    '-mca coll_hcoll_enable 0 -x SB_MICRO_PATH=/sb -x FOO=BAR -x RANK=2 -x NUM=8 '
+                    f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo'
+                ),
+            },
        ]
        for test_case in test_cases:
            with self.subTest(msg='Testing with case', test_case=test_case):