"stubs/torch/vscode:/vscode.git/clone" did not exist on "f0a400464b4aeebc1421a128b5f48c8da358265a"
Commit dd244343 authored by Antoine Kaufmann's avatar Antoine Kaufmann
Browse files

experiments: add --auto-dist to distribute non-distributed experiments

parent 4b43be30
......@@ -90,7 +90,9 @@ g_dist = parser.add_argument_group('Distributed Runtime')
g_par.add_argument('--dist', dest='runtime', action='store_const',
const='dist', default='sequential',
help='Use sequential distributed runtime instead of local')
g_par.add_argument('--auto-dist', action='store_const', const=True,
default=False,
help='Automatically distribute non-distributed experiments')
args = parser.parse_args()
......@@ -168,6 +170,8 @@ if not args.pickled:
experiments += mod.experiments
for e in experiments:
if args.auto_dist and not isinstance(e, exp.DistributedExperiment):
e = runtime.auto_dist(e, executors)
# apply filter if any specified
if (args.filter) and (len(args.filter) > 0):
match = False
......
......@@ -23,4 +23,4 @@
from simbricks.runtime.common import (Run, Runtime)
from simbricks.runtime.local import (LocalSimpleRuntime, LocalParallelRuntime)
from simbricks.runtime.slurm import SlurmRuntime
from simbricks.runtime.distributed import DistributedSimpleRuntime
from simbricks.runtime.distributed import (DistributedSimpleRuntime, auto_dist)
......@@ -26,6 +26,7 @@ import pathlib
from simbricks.runtime.common import *
import simbricks.experiments as exp
import simbricks.exectools as exectools
import simbricks.proxy as proxy
class DistributedSimpleRuntime(Runtime):
def __init__(self, execs, verbose=False):
......@@ -35,6 +36,9 @@ class DistributedSimpleRuntime(Runtime):
self.execs = execs
def add_run(self, run):
if not isinstance(run.experiment, exp.DistributedExperiment):
raise RuntimeError('Only distributed experiments supported')
self.runnable.append(run)
async def do_run(self, run):
......@@ -52,4 +56,56 @@ class DistributedSimpleRuntime(Runtime):
def start(self):
for run in self.runnable:
asyncio.run(self.do_run(run))
\ No newline at end of file
asyncio.run(self.do_run(run))
def auto_dist(e, execs):
""" Converts an Experiment into a DistributedExperiment. Assigns network to
executor zero, and then round-robin assignment of hosts to executors,
while also assigning all nics for a host to the same executor.
"""
if len(execs) < 2:
raise RuntimeError('auto_dist needs at least two hosts')
elif len(execs) > 2:
print('Warning: currently auto_dist only uses the first two hosts')
# Create the distributed experiment
de = exp.DistributedExperiment(e.name, 2)
de.timeout = e.timeout
de.checkpoint = e.checkpoint
de.no_simbricks = e.no_simbricks
de.metadata = e.metadata.copy()
# create listening proxy on host 0
lp = proxy.RDMANetProxyListener()
lp.name = 'listener'
de.add_proxy(lp)
de.assign_sim_host(lp, 0)
# assign networks to first host
for net in e.networks:
de.add_network(net)
de.assign_sim_host(net, 0)
# create connecting proxy on host 1
cp = proxy.RDMANetProxyConnecter(lp)
cp.name = 'connecter'
de.add_proxy(cp)
de.assign_sim_host(cp, 1)
# round-robin assignment for hosts
k = 0
for h in e.hosts:
de.add_host(h)
de.assign_sim_host(h, k)
for nic in h.nics:
de.assign_sim_host(nic, k)
if k != 0:
cp.add_nic(nic)
k = (k + 1) % 2
for nic in e.nics:
de.add_nic(nic)
return de
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment