Unverified Commit 8e748d56 authored by Yang Wang's avatar Yang Wang Committed by GitHub
Browse files

Runner - Generate host groups file in mpi mode (#458)

**Major Revision**

- Add a pattern option to generate the mpi_pattern.txt file if
the path is specified.
- In mpi pattern, serial_index and parallel_index will be added to each
benchmark as environment variables.

**Minor Revision**
- Fix typo
parent 5197cdf5
...@@ -72,6 +72,7 @@ Here're the details about work directory structure for SuperBench Runner. ...@@ -72,6 +72,7 @@ Here're the details about work directory structure for SuperBench Runner.
│ └── sb-exec.log # collected SuperBench Executor log │ └── sb-exec.log # collected SuperBench Executor log
├── sb-run.log # SuperBench Runner log ├── sb-run.log # SuperBench Runner log
├── sb.config.yaml # SuperBench configuration snapshot ├── sb.config.yaml # SuperBench configuration snapshot
├── mpi_pattern.txt # generated host groups file under specified patterns in mpi mode (optional)
├── ssh_config # generated SSH config file ├── ssh_config # generated SSH config file
├── id_ed25519 # generated SSH private key for each run ├── id_ed25519 # generated SSH private key for each run
└── id_ed25519.pub # generated SSH public key for each run └── id_ed25519.pub # generated SSH public key for each run
......
...@@ -463,10 +463,12 @@ Pattern variables to run benchmarks with nodes in specified traffic pattern comb ...@@ -463,10 +463,12 @@ Pattern variables to run benchmarks with nodes in specified traffic pattern comb
Only available for `mpi` mode. Only available for `mpi` mode.
Available variables in formatted string include: Available variables in formatted string include:
+ `type`: the traffic pattern type, required. + `type(str)`: the traffic pattern type, required.
* accepted values: `all-nodes`, `pair-wise`, `k-batch`, `topo-aware` * accepted values: `all-nodes`, `pair-wise`, `k-batch`, `topo-aware`
+ `batch`: the scale of batch, required in `k-batch` pattern. + `mpi_pattern(bool)`: generate pattern config file in `./output/mpi_pattern.txt` for diagnosis, required.
+ `ibstat`: the path of ibstat output, will be auto-generated in `./output/ibstat_file.txt` if not specified, optional in `topo-aware` pattern + `batch(int)`: the scale of batch, required in `k-batch` pattern.
+ `ibnetdiscover`: the path of ibnetdiscover output `ibnetdiscover_file.txt`, required in `topo-aware` pattern. + `ibstat(str)`: the path of ibstat output, will be auto-generated in `./output/ibstat_file.txt` if not specified, optional in `topo-aware` pattern.
+ `min_dist`: minimum distance of VM pair, required in `topo-aware` pattern. + `ibnetdiscover(str)`: the path of ibnetdiscover output `ibnetdiscover_file.txt`, required in `topo-aware` pattern.
+ `max_dist`: maximum distance of VM pair, required in `topo-aware` pattern. + `min_dist(int)`: minimum distance of VM pair, required in `topo-aware` pattern.
+ `max_dist(int)`: maximum distance of VM pair, required in `topo-aware` pattern.
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
from superbench.common.utils.process import run_command from superbench.common.utils.process import run_command
from superbench.common.utils.topo_aware import gen_topo_aware_config from superbench.common.utils.topo_aware import gen_topo_aware_config
from superbench.common.utils.gen_traffic_pattern_config import ( from superbench.common.utils.gen_traffic_pattern_config import (
gen_pair_wise_config, gen_traffic_pattern_host_group, gen_ibstat gen_pair_wise_config, gen_traffic_pattern_host_groups, gen_ibstat
) )
device_manager = LazyImport('superbench.common.utils.device_manager') device_manager = LazyImport('superbench.common.utils.device_manager')
...@@ -31,6 +31,6 @@ ...@@ -31,6 +31,6 @@
'run_command', 'run_command',
'gen_topo_aware_config', 'gen_topo_aware_config',
'gen_pair_wise_config', 'gen_pair_wise_config',
'gen_traffic_pattern_host_group', 'gen_traffic_pattern_host_groups',
'gen_ibstat', 'gen_ibstat',
] ]
...@@ -154,15 +154,17 @@ def _ibstat_parser(artifact_dir): ...@@ -154,15 +154,17 @@ def _ibstat_parser(artifact_dir):
return ibstat_path return ibstat_path
def gen_traffic_pattern_host_group(host_list, pattern): def gen_traffic_pattern_host_groups(host_list, pattern, mpi_pattern_path, benchmark_name):
"""Generate host group from specified traffic pattern. """Generate host group from specified traffic pattern and write in specified path.
Args: Args:
host_list (list): the list of hostnames read from hostfile. host_list (list): the list of hostnames read from hostfile.
pattern (DictConfig): the mpi pattern dict. pattern (DictConfig): the mpi pattern dict.
mpi_pattern_path (str): the path of mpi pattern config file.
benchmark_name (str): the name of benchmark.
Returns: Returns:
host_group (list): the host group generated from traffic pattern. host_groups (list): the host groups generated from traffic pattern.
""" """
config = [] config = []
n = len(host_list) n = len(host_list)
...@@ -178,5 +180,17 @@ def gen_traffic_pattern_host_group(host_list, pattern): ...@@ -178,5 +180,17 @@ def gen_traffic_pattern_host_group(host_list, pattern):
) )
else: else:
logger.error('Unsupported traffic pattern: {}'.format(pattern.type)) logger.error('Unsupported traffic pattern: {}'.format(pattern.type))
host_group = __convert_config_to_host_group(config, host_list) host_groups = __convert_config_to_host_group(config, host_list)
return host_group # write traffic pattern host groups to specified path
if pattern.mpi_pattern:
with open(mpi_pattern_path, 'a') as f:
f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n')
for host_group in host_groups:
row = []
for host_list in host_group:
group = ','.join(host_list)
row.append(group)
group = ';'.join(row)
f.write(group + '\n')
f.write('\n')
return host_groups
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
from joblib import Parallel, delayed from joblib import Parallel, delayed
from omegaconf import ListConfig, OmegaConf from omegaconf import ListConfig, OmegaConf
from superbench.common.utils import SuperBenchLogger, logger, gen_ibstat, gen_traffic_pattern_host_group from superbench.common.utils import SuperBenchLogger, logger, gen_ibstat, gen_traffic_pattern_host_groups
from superbench.runner.ansible import AnsibleClient from superbench.runner.ansible import AnsibleClient
from superbench.benchmarks import ReduceType, Reducer from superbench.benchmarks import ReduceType, Reducer
from superbench.monitor import MonitorRecord from superbench.monitor import MonitorRecord
...@@ -405,6 +405,8 @@ def _run_proc(self, benchmark_name, mode, vars): ...@@ -405,6 +405,8 @@ def _run_proc(self, benchmark_name, mode, vars):
int: Process return code. int: Process return code.
""" """
mode.update(vars) mode.update(vars)
if mode.name == 'mpi' and mode.pattern:
mode.env.update({'SB_MODE_SERIAL_INDEX': mode.serial_index, 'SB_MODE_PARALLEL_INDEX': mode.parallel_index})
logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank) logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank)
timeout = self._sb_benchmarks[benchmark_name].timeout timeout = self._sb_benchmarks[benchmark_name].timeout
...@@ -460,14 +462,21 @@ def run(self): ...@@ -460,14 +462,21 @@ def run(self):
continue continue
with open(self._output_path / 'hostfile', 'r') as f: with open(self._output_path / 'hostfile', 'r') as f:
host_list = f.read().splitlines() host_list = f.read().splitlines()
pattern_hostx = gen_traffic_pattern_host_group(host_list, mode.pattern) host_groups = gen_traffic_pattern_host_groups(
for host_groups in pattern_hostx: host_list, mode.pattern, self._output_path / 'mpi_pattern.txt', benchmark_name
para_rc_list = Parallel(n_jobs=len(host_groups))( )
delayed(self._run_proc) for serial_index, host_group in enumerate(host_groups):
(benchmark_name, mode, vars={ para_rc_list = Parallel(n_jobs=len(host_group))(
'proc_rank': 0, delayed(self._run_proc)(
'host_list': host_group, benchmark_name,
}) for host_group in host_groups mode,
vars={
'proc_rank': 0,
'host_list': host_list,
'serial_index': str(serial_index),
'parallel_index': str(parallel_index),
}
) for parallel_index, host_list in enumerate(host_group)
) )
ansible_rc = ansible_rc + sum(para_rc_list) ansible_rc = ansible_rc + sum(para_rc_list)
else: else:
......
...@@ -4,27 +4,40 @@ ...@@ -4,27 +4,40 @@
"""Tests for traffic pattern config generation module.""" """Tests for traffic pattern config generation module."""
import argparse import argparse
import unittest import unittest
import tempfile
from tests.helper import decorator from tests.helper import decorator
from superbench.common.utils import gen_traffic_pattern_host_group from superbench.common.utils import gen_traffic_pattern_host_groups
class GenConfigTest(unittest.TestCase): class GenConfigTest(unittest.TestCase):
"""Test the utils for generating config.""" """Test the utils for generating config."""
@decorator.load_data('tests/data/mpi_pattern.txt') # noqa: C901
@decorator.load_data('tests/data/ib_traffic_topo_aware_hostfile') # noqa: C901 @decorator.load_data('tests/data/ib_traffic_topo_aware_hostfile') # noqa: C901
def test_gen_traffic_pattern_host_group(self, tp_hostfile): def test_gen_traffic_pattern_host_group(self, expected_mpi_pattern, tp_hostfile):
"""Test the function of generating traffic pattern config from specified mode.""" """Test the function of generating traffic pattern config from specified mode."""
# Test for all-nodes pattern # Test for all-nodes pattern
test_config_file = tempfile.NamedTemporaryFile()
test_config_path = test_config_file.name
test_benchmark_name = 'test_benchmark'
hostx = ['node0', 'node1', 'node2', 'node3', 'node4', 'node5', 'node6', 'node7'] hostx = ['node0', 'node1', 'node2', 'node3', 'node4', 'node5', 'node6', 'node7']
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
'--type', '--type',
type=str, type=str,
default='all-nodes', default='all-nodes',
) )
parser.add_argument(
'--mpi_pattern',
type=bool,
default=True,
)
pattern, _ = parser.parse_known_args() pattern, _ = parser.parse_known_args()
expected_host_group = [[['node0', 'node1', 'node2', 'node3', 'node4', 'node5', 'node6', 'node7']]] expected_host_group = [[['node0', 'node1', 'node2', 'node3', 'node4', 'node5', 'node6', 'node7']]]
self.assertEqual(gen_traffic_pattern_host_group(hostx, pattern), expected_host_group) self.assertEqual(
gen_traffic_pattern_host_groups(hostx, pattern, test_config_path, test_benchmark_name), expected_host_group
)
# Test for pair-wise pattern # Test for pair-wise pattern
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
...@@ -33,6 +46,11 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile): ...@@ -33,6 +46,11 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
type=str, type=str,
default='pair-wise', default='pair-wise',
) )
parser.add_argument(
'--mpi_pattern',
type=bool,
default=True,
)
pattern, _ = parser.parse_known_args() pattern, _ = parser.parse_known_args()
expected_host_group = [ expected_host_group = [
[['node0', 'node7'], ['node1', 'node6'], ['node2', 'node5'], ['node3', 'node4']], [['node0', 'node7'], ['node1', 'node6'], ['node2', 'node5'], ['node3', 'node4']],
...@@ -43,7 +61,9 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile): ...@@ -43,7 +61,9 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
[['node0', 'node5'], ['node6', 'node4'], ['node7', 'node3'], ['node1', 'node2']], [['node0', 'node5'], ['node6', 'node4'], ['node7', 'node3'], ['node1', 'node2']],
[['node0', 'node6'], ['node7', 'node5'], ['node1', 'node4'], ['node2', 'node3']] [['node0', 'node6'], ['node7', 'node5'], ['node1', 'node4'], ['node2', 'node3']]
] ]
self.assertEqual(gen_traffic_pattern_host_group(hostx, pattern), expected_host_group) self.assertEqual(
gen_traffic_pattern_host_groups(hostx, pattern, test_config_path, test_benchmark_name), expected_host_group
)
# Test for k-batch pattern # Test for k-batch pattern
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
...@@ -57,9 +77,16 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile): ...@@ -57,9 +77,16 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
type=int, type=int,
default=3, default=3,
) )
parser.add_argument(
'--mpi_pattern',
type=bool,
default=True,
)
pattern, _ = parser.parse_known_args() pattern, _ = parser.parse_known_args()
expected_host_group = [[['node0', 'node1', 'node2'], ['node3', 'node4', 'node5']]] expected_host_group = [[['node0', 'node1', 'node2'], ['node3', 'node4', 'node5']]]
self.assertEqual(gen_traffic_pattern_host_group(hostx, pattern), expected_host_group) self.assertEqual(
gen_traffic_pattern_host_groups(hostx, pattern, test_config_path, test_benchmark_name), expected_host_group
)
# Test for topo-aware pattern # Test for topo-aware pattern
tp_ibstat_path = 'tests/data/ib_traffic_topo_aware_ibstat.txt' tp_ibstat_path = 'tests/data/ib_traffic_topo_aware_ibstat.txt'
...@@ -90,6 +117,11 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile): ...@@ -90,6 +117,11 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
type=int, type=int,
default=6, default=6,
) )
parser.add_argument(
'--mpi_pattern',
type=bool,
default=True,
)
hostx = tp_hostfile.split() hostx = tp_hostfile.split()
pattern, _ = parser.parse_known_args() pattern, _ = parser.parse_known_args()
expected_host_group = [ expected_host_group = [
...@@ -107,7 +139,15 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile): ...@@ -107,7 +139,15 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
['vma414bbc00005K', 'vma414bbc00005Q'], ['vma414bbc00005L', 'vma414bbc00005R'] ['vma414bbc00005K', 'vma414bbc00005Q'], ['vma414bbc00005L', 'vma414bbc00005R']
] ]
] ]
self.assertEqual(gen_traffic_pattern_host_group(hostx, pattern), expected_host_group) self.assertEqual(
gen_traffic_pattern_host_groups(hostx, pattern, test_config_path, test_benchmark_name), expected_host_group
)
# Test for mpi_pattern file
with open(test_config_path, 'r') as f:
content = f.read()
self.assertEqual(content, expected_mpi_pattern)
test_config_file.close()
# Test for invalid pattern # Test for invalid pattern
hostx = ['node0', 'node1', 'node2', 'node3', 'node4', 'node5', 'node6', 'node7'] hostx = ['node0', 'node1', 'node2', 'node3', 'node4', 'node5', 'node6', 'node7']
...@@ -117,5 +157,10 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile): ...@@ -117,5 +157,10 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
type=str, type=str,
default='invalid pattern', default='invalid pattern',
) )
parser.add_argument(
'--mpi_pattern',
type=bool,
default=True,
)
pattern, _ = parser.parse_known_args() pattern, _ = parser.parse_known_args()
gen_traffic_pattern_host_group(hostx, pattern) gen_traffic_pattern_host_groups(hostx, pattern, test_config_path, test_benchmark_name)
benchmark_name: test_benchmark pattern_type: all-nodes
node0,node1,node2,node3,node4,node5,node6,node7
benchmark_name: test_benchmark pattern_type: pair-wise
node0,node7;node1,node6;node2,node5;node3,node4
node0,node1;node2,node7;node3,node6;node4,node5
node0,node2;node3,node1;node4,node7;node5,node6
node0,node3;node4,node2;node5,node1;node6,node7
node0,node4;node5,node3;node6,node2;node7,node1
node0,node5;node6,node4;node7,node3;node1,node2
node0,node6;node7,node5;node1,node4;node2,node3
benchmark_name: test_benchmark pattern_type: k-batch
node0,node1,node2;node3,node4,node5
benchmark_name: test_benchmark pattern_type: topo-aware
vma414bbc00005I,vma414bbc00005J;vma414bbc00005K,vma414bbc00005L;vma414bbc00005M,vma414bbc00005N;vma414bbc00005O,vma414bbc00005P;vma414bbc00005Q,vma414bbc00005R
vma414bbc00005I,vma414bbc00005K;vma414bbc00005J,vma414bbc00005L;vma414bbc00005O,vma414bbc00005Q;vma414bbc00005P,vma414bbc00005R
vma414bbc00005I,vma414bbc00005O;vma414bbc00005J,vma414bbc00005P;vma414bbc00005K,vma414bbc00005Q;vma414bbc00005L,vma414bbc00005R
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment