experiments: add --auto-dist to distribute non-distributed experiments

dd244343 · Antoine Kaufmann · 4b43be30 · dd244343 · dd244343 · dd244343
Commit dd244343 authored Aug 06, 2021 by Antoine Kaufmann
3 changed files
--- a/experiments/run.py
+++ b/experiments/run.py
@@ -90,7 +90,9 @@ g_dist = parser.add_argument_group('Distributed Runtime')
 g_par.add_argument('--dist', dest='runtime', action='store_const',
        const='dist', default='sequential',
        help='Use sequential distributed runtime instead of local')
+g_par.add_argument('--auto-dist', action='store_const', const=True,
+        default=False,
+        help='Automatically distribute non-distributed experiments')
 args = parser.parse_args()
@@ -168,6 +170,8 @@ if not args.pickled:
        experiments += mod.experiments
    for e in experiments:
+        if args.auto_dist and not isinstance(e, exp.DistributedExperiment):
+            e = runtime.auto_dist(e, executors)
        # apply filter if any specified
        if (args.filter) and (len(args.filter) > 0):
            match = False

--- a/experiments/simbricks/runtime/__init__.py
+++ b/experiments/simbricks/runtime/__init__.py
@@ -23,4 +23,4 @@
 from simbricks.runtime.common import (Run, Runtime)
 from simbricks.runtime.local import (LocalSimpleRuntime, LocalParallelRuntime)
 from simbricks.runtime.slurm import SlurmRuntime
-from simbricks.runtime.distributed import DistributedSimpleRuntime
+from simbricks.runtime.distributed import (DistributedSimpleRuntime, auto_dist)
--- a/experiments/simbricks/runtime/distributed.py
+++ b/experiments/simbricks/runtime/distributed.py
@@ -26,6 +26,7 @@ import pathlib
 from simbricks.runtime.common import *
 import simbricks.experiments as exp
 import simbricks.exectools as exectools
+import simbricks.proxy as proxy
 class DistributedSimpleRuntime(Runtime):
    def __init__(self, execs, verbose=False):
@@ -35,6 +36,9 @@ class DistributedSimpleRuntime(Runtime):
        self.execs = execs
    def add_run(self, run):
+        if not isinstance(run.experiment, exp.DistributedExperiment):
+            raise RuntimeError('Only distributed experiments supported')
        self.runnable.append(run)
    async def do_run(self, run):
@@ -52,4 +56,56 @@ class DistributedSimpleRuntime(Runtime):
    def start(self):
        for run in self.runnable:
            asyncio.run(self.do_run(run))
\ No newline at end of file
+def auto_dist(e, execs):
+    """ Converts an Experiment into a DistributedExperiment. Assigns network to
+        executor zero, and then round-robin assignment of hosts to executors,
+        while also assigning all nics for a host to the same executor.
+    """
+    if len(execs) < 2:
+        raise RuntimeError('auto_dist needs at least two hosts')
+    elif len(execs) > 2:
+        print('Warning: currently auto_dist only uses the first two hosts')
+    # Create the distributed experiment
+    de = exp.DistributedExperiment(e.name, 2)
+    de.timeout = e.timeout
+    de.checkpoint = e.checkpoint
+    de.no_simbricks = e.no_simbricks
+    de.metadata = e.metadata.copy()
+    # create listening proxy on host 0
+    lp = proxy.RDMANetProxyListener()
+    lp.name = 'listener'
+    de.add_proxy(lp)
+    de.assign_sim_host(lp, 0)
+    # assign networks to first host
+    for net in e.networks:
+        de.add_network(net)
+        de.assign_sim_host(net, 0)
+    # create connecting proxy on host 1
+    cp = proxy.RDMANetProxyConnecter(lp)
+    cp.name = 'connecter'
+    de.add_proxy(cp)
+    de.assign_sim_host(cp, 1)
+    # round-robin assignment for hosts
+    k = 0
+    for h in e.hosts:
+        de.add_host(h)
+        de.assign_sim_host(h, k)
+        for nic in h.nics:
+            de.assign_sim_host(nic, k)
+            if k != 0:
+                cp.add_nic(nic)
+        k = (k + 1) % 2
+    for nic in e.nics:
+        de.add_nic(nic)
+    return de
\ No newline at end of file