Commit dd244343 authored by Antoine Kaufmann's avatar Antoine Kaufmann
Browse files

experiments: add --auto-dist to distribute non-distributed experiments

parent 4b43be30
...@@ -90,7 +90,9 @@ g_dist = parser.add_argument_group('Distributed Runtime') ...@@ -90,7 +90,9 @@ g_dist = parser.add_argument_group('Distributed Runtime')
g_par.add_argument('--dist', dest='runtime', action='store_const', g_par.add_argument('--dist', dest='runtime', action='store_const',
const='dist', default='sequential', const='dist', default='sequential',
help='Use sequential distributed runtime instead of local') help='Use sequential distributed runtime instead of local')
g_par.add_argument('--auto-dist', action='store_const', const=True,
default=False,
help='Automatically distribute non-distributed experiments')
args = parser.parse_args() args = parser.parse_args()
...@@ -168,6 +170,8 @@ if not args.pickled: ...@@ -168,6 +170,8 @@ if not args.pickled:
experiments += mod.experiments experiments += mod.experiments
for e in experiments: for e in experiments:
if args.auto_dist and not isinstance(e, exp.DistributedExperiment):
e = runtime.auto_dist(e, executors)
# apply filter if any specified # apply filter if any specified
if (args.filter) and (len(args.filter) > 0): if (args.filter) and (len(args.filter) > 0):
match = False match = False
......
...@@ -23,4 +23,4 @@ ...@@ -23,4 +23,4 @@
from simbricks.runtime.common import (Run, Runtime) from simbricks.runtime.common import (Run, Runtime)
from simbricks.runtime.local import (LocalSimpleRuntime, LocalParallelRuntime) from simbricks.runtime.local import (LocalSimpleRuntime, LocalParallelRuntime)
from simbricks.runtime.slurm import SlurmRuntime from simbricks.runtime.slurm import SlurmRuntime
from simbricks.runtime.distributed import DistributedSimpleRuntime from simbricks.runtime.distributed import (DistributedSimpleRuntime, auto_dist)
...@@ -26,6 +26,7 @@ import pathlib ...@@ -26,6 +26,7 @@ import pathlib
from simbricks.runtime.common import * from simbricks.runtime.common import *
import simbricks.experiments as exp import simbricks.experiments as exp
import simbricks.exectools as exectools import simbricks.exectools as exectools
import simbricks.proxy as proxy
class DistributedSimpleRuntime(Runtime): class DistributedSimpleRuntime(Runtime):
def __init__(self, execs, verbose=False): def __init__(self, execs, verbose=False):
...@@ -35,6 +36,9 @@ class DistributedSimpleRuntime(Runtime): ...@@ -35,6 +36,9 @@ class DistributedSimpleRuntime(Runtime):
self.execs = execs self.execs = execs
def add_run(self, run): def add_run(self, run):
if not isinstance(run.experiment, exp.DistributedExperiment):
raise RuntimeError('Only distributed experiments supported')
self.runnable.append(run) self.runnable.append(run)
async def do_run(self, run): async def do_run(self, run):
...@@ -52,4 +56,56 @@ class DistributedSimpleRuntime(Runtime): ...@@ -52,4 +56,56 @@ class DistributedSimpleRuntime(Runtime):
def start(self): def start(self):
for run in self.runnable: for run in self.runnable:
asyncio.run(self.do_run(run)) asyncio.run(self.do_run(run))
\ No newline at end of file
def auto_dist(e, execs):
""" Converts an Experiment into a DistributedExperiment. Assigns network to
executor zero, and then round-robin assignment of hosts to executors,
while also assigning all nics for a host to the same executor.
"""
if len(execs) < 2:
raise RuntimeError('auto_dist needs at least two hosts')
elif len(execs) > 2:
print('Warning: currently auto_dist only uses the first two hosts')
# Create the distributed experiment
de = exp.DistributedExperiment(e.name, 2)
de.timeout = e.timeout
de.checkpoint = e.checkpoint
de.no_simbricks = e.no_simbricks
de.metadata = e.metadata.copy()
# create listening proxy on host 0
lp = proxy.RDMANetProxyListener()
lp.name = 'listener'
de.add_proxy(lp)
de.assign_sim_host(lp, 0)
# assign networks to first host
for net in e.networks:
de.add_network(net)
de.assign_sim_host(net, 0)
# create connecting proxy on host 1
cp = proxy.RDMANetProxyConnecter(lp)
cp.name = 'connecter'
de.add_proxy(cp)
de.assign_sim_host(cp, 1)
# round-robin assignment for hosts
k = 0
for h in e.hosts:
de.add_host(h)
de.assign_sim_host(h, k)
for nic in h.nics:
de.assign_sim_host(nic, k)
if k != 0:
cp.add_nic(nic)
k = (k + 1) % 2
for nic in e.nics:
de.add_nic(nic)
return de
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment