Unverified Commit 8e748d56 authored by Yang Wang's avatar Yang Wang Committed by GitHub
Browse files

Runner - Generate host groups file in mpi mode (#458)

**Major Revision**

- Add an option for pattern to generate the mpi_pattern.txt file if
the path is specified.
- In mpi pattern, serial_index and parallel_index will be added to each
benchmark as environment variables.

**Minor Revision**
- Fix typo
parent 5197cdf5
......@@ -72,6 +72,7 @@ Here're the details about work directory structure for SuperBench Runner.
│ └── sb-exec.log # collected SuperBench Executor log
├── sb-run.log # SuperBench Runner log
├── sb.config.yaml # SuperBench configuration snapshot
├── mpi_pattern.txt # generated host groups file under specified patterns in mpi mode (optional)
├── ssh_config # generated SSH config file
├── id_ed25519 # generated SSH private key for each run
└── id_ed25519.pub # generated SSH public key for each run
......
......@@ -463,10 +463,12 @@ Pattern variables to run benchmarks with nodes in specified traffic pattern comb
Only available for `mpi` mode.
Available variables in formatted string include:
+ `type`: the traffic pattern type, required.
+ `type(str)`: the traffic pattern type, required.
* accepted values: `all-nodes`, `pair-wise`, `k-batch`, `topo-aware`
+ `batch`: the scale of batch, required in `k-batch` pattern.
+ `ibstat`: the path of ibstat output, will be auto-generated in `./output/ibstat_file.txt` if not specified, optional in `topo-aware` pattern
+ `ibnetdiscover`: the path of ibnetdiscover output `ibnetdiscover_file.txt`, required in `topo-aware` pattern.
+ `min_dist`: minimum distance of VM pair, required in `topo-aware` pattern.
+ `max_dist`: maximum distance of VM pair, required in `topo-aware` pattern.
+ `mpi_pattern(bool)`: generate pattern config file in `./output/mpi_pattern.txt` for diagnosis, required.
+ `batch(int)`: the scale of batch, required in `k-batch` pattern.
+ `ibstat(str)`: the path of ibstat output, will be auto-generated in `./output/ibstat_file.txt` if not specified, optional in `topo-aware` pattern
+ `ibnetdiscover(str)`: the path of ibnetdiscover output `ibnetdiscover_file.txt`, required in `topo-aware` pattern.
+ `min_dist(int)`: minimum distance of VM pair, required in `topo-aware` pattern.
+ `max_dist(int)`: maximum distance of VM pair, required in `topo-aware` pattern.
......@@ -11,7 +11,7 @@
from superbench.common.utils.process import run_command
from superbench.common.utils.topo_aware import gen_topo_aware_config
from superbench.common.utils.gen_traffic_pattern_config import (
gen_pair_wise_config, gen_traffic_pattern_host_group, gen_ibstat
gen_pair_wise_config, gen_traffic_pattern_host_groups, gen_ibstat
)
device_manager = LazyImport('superbench.common.utils.device_manager')
......@@ -31,6 +31,6 @@
'run_command',
'gen_topo_aware_config',
'gen_pair_wise_config',
'gen_traffic_pattern_host_group',
'gen_traffic_pattern_host_groups',
'gen_ibstat',
]
......@@ -154,15 +154,17 @@ def _ibstat_parser(artifact_dir):
return ibstat_path
def gen_traffic_pattern_host_group(host_list, pattern):
"""Generate host group from specified traffic pattern.
def gen_traffic_pattern_host_groups(host_list, pattern, mpi_pattern_path, benchmark_name):
"""Generate host group from specified traffic pattern and write in specified path.
Args:
host_list (list): the list of hostnames read from hostfile.
pattern (DictConfig): the mpi pattern dict.
mpi_pattern_path (str): the path of mpi pattern config file.
benchmark_name (str): the name of benchmark.
Returns:
host_group (list): the host group generated from traffic pattern.
host_groups (list): the host groups generated from traffic pattern.
"""
config = []
n = len(host_list)
......@@ -178,5 +180,17 @@ def gen_traffic_pattern_host_group(host_list, pattern):
)
else:
logger.error('Unsupported traffic pattern: {}'.format(pattern.type))
host_group = __convert_config_to_host_group(config, host_list)
return host_group
host_groups = __convert_config_to_host_group(config, host_list)
# write traffic pattern host groups to specified path
if pattern.mpi_pattern:
with open(mpi_pattern_path, 'a') as f:
f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n')
for host_group in host_groups:
row = []
for host_list in host_group:
group = ','.join(host_list)
row.append(group)
group = ';'.join(row)
f.write(group + '\n')
f.write('\n')
return host_groups
......@@ -15,7 +15,7 @@
from joblib import Parallel, delayed
from omegaconf import ListConfig, OmegaConf
from superbench.common.utils import SuperBenchLogger, logger, gen_ibstat, gen_traffic_pattern_host_group
from superbench.common.utils import SuperBenchLogger, logger, gen_ibstat, gen_traffic_pattern_host_groups
from superbench.runner.ansible import AnsibleClient
from superbench.benchmarks import ReduceType, Reducer
from superbench.monitor import MonitorRecord
......@@ -405,6 +405,8 @@ def _run_proc(self, benchmark_name, mode, vars):
int: Process return code.
"""
mode.update(vars)
if mode.name == 'mpi' and mode.pattern:
mode.env.update({'SB_MODE_SERIAL_INDEX': mode.serial_index, 'SB_MODE_PARALLEL_INDEX': mode.parallel_index})
logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank)
timeout = self._sb_benchmarks[benchmark_name].timeout
......@@ -460,14 +462,21 @@ def run(self):
continue
with open(self._output_path / 'hostfile', 'r') as f:
host_list = f.read().splitlines()
pattern_hostx = gen_traffic_pattern_host_group(host_list, mode.pattern)
for host_groups in pattern_hostx:
para_rc_list = Parallel(n_jobs=len(host_groups))(
delayed(self._run_proc)
(benchmark_name, mode, vars={
host_groups = gen_traffic_pattern_host_groups(
host_list, mode.pattern, self._output_path / 'mpi_pattern.txt', benchmark_name
)
for serial_index, host_group in enumerate(host_groups):
para_rc_list = Parallel(n_jobs=len(host_group))(
delayed(self._run_proc)(
benchmark_name,
mode,
vars={
'proc_rank': 0,
'host_list': host_group,
}) for host_group in host_groups
'host_list': host_list,
'serial_index': str(serial_index),
'parallel_index': str(parallel_index),
}
) for parallel_index, host_list in enumerate(host_group)
)
ansible_rc = ansible_rc + sum(para_rc_list)
else:
......
......@@ -4,27 +4,40 @@
"""Tests for traffic pattern config generation module."""
import argparse
import unittest
import tempfile
from tests.helper import decorator
from superbench.common.utils import gen_traffic_pattern_host_group
from superbench.common.utils import gen_traffic_pattern_host_groups
class GenConfigTest(unittest.TestCase):
"""Test the utils for generating config."""
@decorator.load_data('tests/data/mpi_pattern.txt') # noqa: C901
@decorator.load_data('tests/data/ib_traffic_topo_aware_hostfile') # noqa: C901
def test_gen_traffic_pattern_host_group(self, tp_hostfile):
def test_gen_traffic_pattern_host_group(self, expected_mpi_pattern, tp_hostfile):
"""Test the function of generating traffic pattern config from specified mode."""
# Test for all-nodes pattern
test_config_file = tempfile.NamedTemporaryFile()
test_config_path = test_config_file.name
test_benchmark_name = 'test_benchmark'
hostx = ['node0', 'node1', 'node2', 'node3', 'node4', 'node5', 'node6', 'node7']
parser = argparse.ArgumentParser()
parser.add_argument(
'--type',
type=str,
default='all-nodes',
)
parser.add_argument(
'--mpi_pattern',
type=bool,
default=True,
)
pattern, _ = parser.parse_known_args()
expected_host_group = [[['node0', 'node1', 'node2', 'node3', 'node4', 'node5', 'node6', 'node7']]]
self.assertEqual(gen_traffic_pattern_host_group(hostx, pattern), expected_host_group)
self.assertEqual(
gen_traffic_pattern_host_groups(hostx, pattern, test_config_path, test_benchmark_name), expected_host_group
)
# Test for pair-wise pattern
parser = argparse.ArgumentParser()
......@@ -33,6 +46,11 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
type=str,
default='pair-wise',
)
parser.add_argument(
'--mpi_pattern',
type=bool,
default=True,
)
pattern, _ = parser.parse_known_args()
expected_host_group = [
[['node0', 'node7'], ['node1', 'node6'], ['node2', 'node5'], ['node3', 'node4']],
......@@ -43,7 +61,9 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
[['node0', 'node5'], ['node6', 'node4'], ['node7', 'node3'], ['node1', 'node2']],
[['node0', 'node6'], ['node7', 'node5'], ['node1', 'node4'], ['node2', 'node3']]
]
self.assertEqual(gen_traffic_pattern_host_group(hostx, pattern), expected_host_group)
self.assertEqual(
gen_traffic_pattern_host_groups(hostx, pattern, test_config_path, test_benchmark_name), expected_host_group
)
# Test for k-batch pattern
parser = argparse.ArgumentParser()
......@@ -57,9 +77,16 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
type=int,
default=3,
)
parser.add_argument(
'--mpi_pattern',
type=bool,
default=True,
)
pattern, _ = parser.parse_known_args()
expected_host_group = [[['node0', 'node1', 'node2'], ['node3', 'node4', 'node5']]]
self.assertEqual(gen_traffic_pattern_host_group(hostx, pattern), expected_host_group)
self.assertEqual(
gen_traffic_pattern_host_groups(hostx, pattern, test_config_path, test_benchmark_name), expected_host_group
)
# Test for topo-aware pattern
tp_ibstat_path = 'tests/data/ib_traffic_topo_aware_ibstat.txt'
......@@ -90,6 +117,11 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
type=int,
default=6,
)
parser.add_argument(
'--mpi_pattern',
type=bool,
default=True,
)
hostx = tp_hostfile.split()
pattern, _ = parser.parse_known_args()
expected_host_group = [
......@@ -107,7 +139,15 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
['vma414bbc00005K', 'vma414bbc00005Q'], ['vma414bbc00005L', 'vma414bbc00005R']
]
]
self.assertEqual(gen_traffic_pattern_host_group(hostx, pattern), expected_host_group)
self.assertEqual(
gen_traffic_pattern_host_groups(hostx, pattern, test_config_path, test_benchmark_name), expected_host_group
)
# Test for mpi_pattern file
with open(test_config_path, 'r') as f:
content = f.read()
self.assertEqual(content, expected_mpi_pattern)
test_config_file.close()
# Test for invalid pattern
hostx = ['node0', 'node1', 'node2', 'node3', 'node4', 'node5', 'node6', 'node7']
......@@ -117,5 +157,10 @@ def test_gen_traffic_pattern_host_group(self, tp_hostfile):
type=str,
default='invalid pattern',
)
parser.add_argument(
'--mpi_pattern',
type=bool,
default=True,
)
pattern, _ = parser.parse_known_args()
gen_traffic_pattern_host_group(hostx, pattern)
gen_traffic_pattern_host_groups(hostx, pattern, test_config_path, test_benchmark_name)
benchmark_name: test_benchmark pattern_type: all-nodes
node0,node1,node2,node3,node4,node5,node6,node7
benchmark_name: test_benchmark pattern_type: pair-wise
node0,node7;node1,node6;node2,node5;node3,node4
node0,node1;node2,node7;node3,node6;node4,node5
node0,node2;node3,node1;node4,node7;node5,node6
node0,node3;node4,node2;node5,node1;node6,node7
node0,node4;node5,node3;node6,node2;node7,node1
node0,node5;node6,node4;node7,node3;node1,node2
node0,node6;node7,node5;node1,node4;node2,node3
benchmark_name: test_benchmark pattern_type: k-batch
node0,node1,node2;node3,node4,node5
benchmark_name: test_benchmark pattern_type: topo-aware
vma414bbc00005I,vma414bbc00005J;vma414bbc00005K,vma414bbc00005L;vma414bbc00005M,vma414bbc00005N;vma414bbc00005O,vma414bbc00005P;vma414bbc00005Q,vma414bbc00005R
vma414bbc00005I,vma414bbc00005K;vma414bbc00005J,vma414bbc00005L;vma414bbc00005O,vma414bbc00005Q;vma414bbc00005P,vma414bbc00005R
vma414bbc00005I,vma414bbc00005O;vma414bbc00005J,vma414bbc00005P;vma414bbc00005K,vma414bbc00005Q;vma414bbc00005L,vma414bbc00005R
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment