Unverified Commit e00a8180 authored by Yifan Xiong's avatar Yifan Xiong Committed by GitHub
Browse files

Support node_num=1 in mpi mode (#372)

Support `node_num: 1` in mpi mode, so that mpi benchmarks can be run on either
a single node or all nodes from the same config simply by changing `node_num`.
Update the docs and add a test case accordingly.
parent 9f03d568
......@@ -396,7 +396,7 @@ Some attributes may only be suitable for specific mode.
| | `local` | `torch.distributed` | `mpi` |
| ---------: | :-----: | :-----------------: | :---: |
| `proc_num` | ✓ | ✓ | ✓ |
| `node_num` | ✘ | ✓ | ✘ |
| `node_num` | ✘ | ✓ | ✓ |
| `prefix` | ✓ | ✘ | ✘ |
| `env` | ✓ | ✓ | ✓ |
| `mca` | ✘ | ✘ | ✓ |
......@@ -414,7 +414,7 @@ Each process will run an individual benchmark, how processes communicate depends
### `node_num`
Node number to run in the mode. Defaults to all nodes in the run.
Number of nodes to run in the mode. Defaults to all nodes provided by the host file in the run.
Will be ignored in `local` mode.
For example, assuming you are running model benchmark on 4 nodes,
......
......@@ -143,12 +143,12 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
'mpirun ' # use default OpenMPI in image
'-tag-output ' # tag mpi output with [jobid,rank]<stdout/stderr> prefix
'-allow-run-as-root ' # allow mpirun to run when executed by root user
'-hostfile hostfile ' # use prepared hostfile
'-map-by ppr:{proc_num}:node ' # launch {proc_num} processes on each node
'{host_list} ' # use prepared hostfile and launch {proc_num} processes on each node
'-bind-to numa ' # bind processes to numa
'{mca_list} {env_list} {command}'
).format(
proc_num=mode.proc_num,
host_list=f'-host localhost:{mode.proc_num}'
if mode.node_num == 1 else f'-hostfile hostfile -map-by ppr:{mode.proc_num}:node',
mca_list=' '.join(f'-mca {k} {v}' for k, v in mode.mca.items()),
env_list=' '.join(
f'-x {k}={str(v).format(proc_rank=mode.proc_rank, proc_num=mode.proc_num)}'
......@@ -402,7 +402,7 @@ def _run_proc(self, benchmark_name, mode, vars):
ansible_runner_config = self._ansible_client.get_shell_config(
fcmd.format(env_list=env_list, command=self.__get_mode_command(benchmark_name, mode, timeout))
)
if mode.name == 'mpi':
if mode.name == 'mpi' and mode.node_num != 1:
ansible_runner_config = self._ansible_client.update_mpi_config(ansible_runner_config)
ansible_runner_config['timeout'] = timeout
......
......@@ -168,6 +168,30 @@ def test_get_mode_command(self):
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo'
),
},
{
'benchmark_name':
'foo',
'mode': {
'name': 'mpi',
'node_num': 1,
'proc_num': 8,
'proc_rank': 2,
'mca': {
'coll_hcoll_enable': 0,
},
'env': {
'SB_MICRO_PATH': '/sb',
'FOO': 'BAR',
'RANK': '{proc_rank}',
'NUM': '{proc_num}',
},
},
'expected_command': (
'mpirun -tag-output -allow-run-as-root -host localhost:8 -bind-to numa '
'-mca coll_hcoll_enable 0 -x SB_MICRO_PATH=/sb -x FOO=BAR -x RANK=2 -x NUM=8 '
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo'
),
},
]
for test_case in test_cases:
with self.subTest(msg='Testing with case', test_case=test_case):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment