Commit eacbf429 authored by Jonas Kaufmann
Browse files

simbricks-run: update to work with changes to simulation executor, throw out...

simbricks-run: update to work with changes to simulation executor, throw out all things that are no longer supported
parent 588595ae
......@@ -20,25 +20,22 @@
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""This is the top-level module of the SimBricks orchestration framework that
users interact with."""
users interact with for running simulations locally."""
import argparse
import asyncio
import fnmatch
import importlib
import importlib.util
import json
import os
import pathlib
import signal
import sys
from simbricks.runtime import output as sim_out
from simbricks.orchestration.instantiation import base as inst_base
from simbricks.runtime.runs import base as runs_base
from simbricks.runtime import output as sim_out
from simbricks.runtime.runs import base as runs_base
from simbricks.runtime.runs import local as rt_local
from simbricks.runtime import command_executor
def parse_args() -> argparse.Namespace:
......@@ -65,13 +62,6 @@ def parse_args() -> argparse.Namespace:
nargs="+",
help="Only run experiments matching the given Unix shell style patterns",
)
parser.add_argument(
"--pickled",
action="store_const",
const=True,
default=False,
help="Interpret experiment modules as pickled runs instead of .py files",
)
parser.add_argument(
"--runs",
metavar="N",
......@@ -79,9 +69,7 @@ def parse_args() -> argparse.Namespace:
default=1,
help="Number of repetition of each experiment",
)
parser.add_argument(
"--firstrun", metavar="N", type=int, default=1, help="ID for first run"
)
parser.add_argument("--firstrun", metavar="N", type=int, default=1, help="ID for first run")
parser.add_argument(
"--force",
action="store_const",
......@@ -116,45 +104,17 @@ def parse_args() -> argparse.Namespace:
g_env.add_argument(
"--repo",
metavar="DIR",
type=str,
default=os.path.dirname(__file__) + "/..",
type=pathlib.Path,
default=pathlib.Path("/simbricks"),
help="SimBricks repository directory",
)
g_env.add_argument(
"--workdir",
metavar="DIR",
type=str,
default="./out/",
type=pathlib.Path,
default=pathlib.Path("./out/"),
help="Work directory base",
)
g_env.add_argument(
"--outdir",
metavar="DIR",
type=str,
default="./out/",
help="Output directory base",
)
g_env.add_argument(
"--cpdir",
metavar="DIR",
type=str,
default="./out/",
help="Checkpoint directory base",
)
g_env.add_argument(
"--hosts",
metavar="JSON_FILE",
type=str,
default=None,
help="List of hosts to use (json)",
)
g_env.add_argument(
"--shmdir",
metavar="DIR",
type=str,
default=None,
help="Shared memory directory base (workdir if not set)",
)
# arguments for the parallel runtime
g_par = parser.add_argument_group("Parallel Runtime")
......@@ -181,89 +141,17 @@ def parse_args() -> argparse.Namespace:
help="Memory limit for parallel runs (in MB)",
)
# arguments for the slurm runtime
g_slurm = parser.add_argument_group("Slurm Runtime")
g_slurm.add_argument(
"--slurm",
dest="runtime",
action="store_const",
const="slurm",
default="sequential",
help="Use slurm instead of sequential runtime",
)
g_slurm.add_argument(
"--slurmdir",
metavar="DIR",
type=str,
default="./slurm/",
help="Slurm communication directory",
)
# arguments for the distributed runtime
g_dist = parser.add_argument_group("Distributed Runtime")
g_dist.add_argument(
"--dist",
dest="runtime",
action="store_const",
const="dist",
default="sequential",
help="Use sequential distributed runtime instead of local",
)
g_dist.add_argument(
"--auto-dist",
action="store_const",
const=True,
default=False,
help="Automatically distribute non-distributed experiments",
)
g_dist.add_argument(
"--proxy-type",
metavar="TYPE",
type=str,
default="sockets",
help="Proxy type to use (sockets,rdma) for auto distribution",
)
return parser.parse_args()
def load_executors(path: str) -> list[command_executor.Executor]:
    """Load a hosts list from a JSON file and build one executor per host.

    Each host entry must carry a ``"type"`` key (``"local"`` or
    ``"remote"``) and an ``"ip"`` key. Remote entries additionally need
    ``"host"`` and ``"workdir"`` and may supply optional ``"ssh_args"`` /
    ``"scp_args"`` lists that are appended to the executor's defaults.

    Args:
        path: Path to the JSON file describing the hosts.

    Returns:
        List of configured executors, in file order.

    Raises:
        RuntimeError: If a host entry has an unknown ``"type"``.
        KeyError: If a required key is missing from a host entry.
    """
    with open(path, "r", encoding="utf-8") as f:
        hosts = json.load(f)

    exs = []
    for h in hosts:
        if h["type"] == "local":
            ex = command_executor.LocalExecutor()
        elif h["type"] == "remote":
            ex = command_executor.RemoteExecutor(h["host"], h["workdir"])
            # Optional per-host extra arguments for ssh/scp invocations;
            # missing keys simply contribute nothing.
            ex.ssh_extra_args += h.get("ssh_args", [])
            ex.scp_extra_args += h.get("scp_args", [])
        else:
            raise RuntimeError(f'invalid host type "{h["type"]}"')
        ex.ip = h["ip"]
        exs.append(ex)
    return exs
def warn_multi_exec(executors: list[command_executor.Executor]):
    """Warn on stderr when more than one host executor is configured.

    The local runtimes only ever use the first executor, so any additional
    entries are ignored apart from this warning.
    """
    if len(executors) <= 1:
        return
    print(
        "Warning: multiple hosts specified, only using first one for now",
        file=sys.stderr,
    )
def add_exp(
instantiation: inst_base.Instantiation,
prereq: runs_base.Run | None,
rt: runs_base.Runtime,
args: argparse.Namespace,
) -> runs_base.Run:
env = inst_base.InstantiationEnvironment() # TODO: set from args
env = inst_base.InstantiationEnvironment(args.workdir, args.repo)
instantiation.env = env
output = sim_out.SimulationOutput(instantiation.simulation)
......@@ -274,31 +162,17 @@ def add_exp(
def main():
args = parse_args()
if args.hosts is None:
executors = [command_executor.LocalExecutor()]
else:
executors = load_executors(args.hosts)
# initialize runtime
if args.runtime == "parallel":
warn_multi_exec(executors)
rt = rt_local.LocalParallelRuntime(
cores=args.cores, mem=args.mem, verbose=args.verbose, executor=executors[0]
)
# elif args.runtime == "slurm":
# rt = runs.SlurmRuntime(args.slurmdir, args, verbose=args.verbose)
# elif args.runtime == "dist":
# rt = runs.DistributedSimpleRuntime(executors, verbose=args.verbose)
rt = rt_local.LocalParallelRuntime(cores=args.cores, mem=args.mem, verbose=args.verbose)
else:
warn_multi_exec(executors)
rt = rt_local.LocalSimpleRuntime(verbose=args.verbose, executor=executors[0])
rt = rt_local.LocalSimpleRuntime(verbose=args.verbose)
if args.profile_int:
rt.enable_profiler(args.profile_int)
# load experiments
if not args.pickled:
# default: load python modules with experiments
# load python modules with experiments
instantiations: list[inst_base.Instantiation] = []
for path in args.experiments:
modname, _ = os.path.splitext(os.path.basename(path))
......@@ -345,19 +219,14 @@ def main():
inst.create_checkpoint = False
inst.restore_checkpoint = True
prereq = add_exp(instantiation=checkpointing_inst, rt=rt, prereq=None)
prereq = add_exp(instantiation=checkpointing_inst, rt=rt, prereq=None, args=args)
for index in range(args.firstrun, args.firstrun + args.runs):
inst_copy = inst.copy()
inst_copy.preserve_tmp_folder = False
if index == args.firstrun + args.runs - 1:
inst_copy._preserve_checkpoints = False
add_exp(instantiation=inst_copy, rt=rt, prereq=prereq)
# else:
# # otherwise load pickled run object
# for path in args.experiments:
# with open(path, "rb") as f:
# rt.add_run(pickle.load(f))
add_exp(instantiation=inst_copy, rt=rt, prereq=prereq, args=args)
# register interrupt handler
signal.signal(signal.SIGINT, lambda *_: rt.interrupt())
......
......@@ -23,11 +23,97 @@
from __future__ import annotations
import asyncio
import typing
from simbricks.utils import artifatcs as art
from simbricks.runtime import simulation_executor
from simbricks.runtime import command_executor
from simbricks.runtime import simulation_executor as sim_exec
from simbricks.runtime.runs import base as run_base
from simbricks.utils import artifatcs as art
if typing.TYPE_CHECKING:
from simbricks.orchestration.instantiation import base as inst_base
from simbricks.orchestration.instantiation import proxy as inst_proxy
from simbricks.orchestration.simulation import base as sim_base
class LocalSimulationExecutorCallbacks(sim_exec.SimulationExecutorCallbacks):
    """Executor callbacks that mirror simulation output on the local console.

    Every callback first forwards to the base class; when verbose mode is
    enabled it then echoes the command or output lines to stdout, prefixed
    with the component (``prepare``, simulator full name, or proxy name)
    they belong to.
    """

    def __init__(self, instantiation: inst_base.Instantiation, verbose: bool):
        super().__init__(instantiation)
        self._instantiation = instantiation
        # Controls whether any of the callbacks below print to stdout.
        self._verbose = verbose

    # ---------------------------------------
    # Callbacks related to whole simulation -
    # ---------------------------------------

    async def simulation_prepare_cmd_start(self, cmd: str) -> None:
        await super().simulation_prepare_cmd_start(cmd)
        if not self._verbose:
            return
        print(f"+ [prepare] {cmd}")

    async def simulation_prepare_cmd_stdout(self, cmd: str, lines: list[str]) -> None:
        await super().simulation_prepare_cmd_stdout(cmd, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[prepare] {line}")

    async def simulation_prepare_cmd_stderr(self, cmd: str, lines: list[str]) -> None:
        await super().simulation_prepare_cmd_stderr(cmd, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[prepare] {line}")

    # -----------------------------
    # Simulator-related callbacks -
    # -----------------------------

    async def simulator_started(self, sim: sim_base.Simulator, cmd: str) -> None:
        await super().simulator_started(sim, cmd)
        if not self._verbose:
            return
        print(f"+ [{sim.full_name()}] {cmd}")

    async def simulator_exited(self, sim: sim_base.Simulator, exit_code: int) -> None:
        await super().simulator_exited(sim, exit_code)
        if not self._verbose:
            return
        print(f"- [{sim.full_name()}] exited with code {exit_code}")

    async def simulator_stdout(self, sim: sim_base.Simulator, lines: list[str]) -> None:
        await super().simulator_stdout(sim, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[{sim.full_name()}] {line}")

    async def simulator_stderr(self, sim: sim_base.Simulator, lines: list[str]) -> None:
        await super().simulator_stderr(sim, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[{sim.full_name()}] {line}")

    # -------------------------
    # Proxy-related callbacks -
    # -------------------------

    async def proxy_started(self, proxy: inst_proxy.Proxy, cmd: str) -> None:
        await super().proxy_started(proxy, cmd)
        if not self._verbose:
            return
        print(f"+ [{proxy.name}] {cmd}")

    async def proxy_exited(self, proxy: inst_proxy.Proxy, exit_code: int) -> None:
        await super().proxy_exited(proxy, exit_code)
        if not self._verbose:
            return
        print(f"- [{proxy.name}] exited with code {exit_code}")

    async def proxy_stdout(self, proxy: inst_proxy.Proxy, lines: list[str]) -> None:
        await super().proxy_stdout(proxy, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[{proxy.name}] {line}")

    async def proxy_stderr(self, proxy: inst_proxy.Proxy, lines: list[str]) -> None:
        await super().proxy_stderr(proxy, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[{proxy.name}] {line}")
class LocalSimpleRuntime(run_base.Runtime):
......@@ -36,13 +122,11 @@ class LocalSimpleRuntime(run_base.Runtime):
def __init__(
self,
verbose=False,
executor: command_executor.Executor = command_executor.LocalExecutor(),
):
super().__init__()
self._runnable: list[run_base.Run] = []
self._complete: list[run_base.Run] = []
self._verbose: bool = verbose
self._executor: command_executor.Executor = executor
self._running: asyncio.Task | None = None
def add_run(self, run: run_base.Run) -> None:
......@@ -52,18 +136,17 @@ class LocalSimpleRuntime(run_base.Runtime):
"""Actually executes `run`."""
try:
runner = simulation_executor.SimulationSimpleRunner(self._executor, run.instantiation, self._verbose)
if self._profile_int:
runner.profile_int = self.profile_int
await runner.prepare()
for sim in run.instantiation.simulation.all_simulators():
runner.add_listener(sim, command_executor.LegacyOutputListener())
callbacks = LocalSimulationExecutorCallbacks(run.instantiation, self._verbose)
sim_executor = sim_exec.SimulationExecutor(
run.instantiation, callbacks, self._verbose, self._profile_int
)
await sim_executor.prepare()
except asyncio.CancelledError:
# it is safe to just exit here because we are not running any
# simulators yet
return
run._output = await runner.run() # handles CancelledError
run._output = await sim_executor.run() # handles CancelledError
self._complete.append(run)
# if the log is huge, this step takes some time
......@@ -75,10 +158,11 @@ class LocalSimpleRuntime(run_base.Runtime):
run._output.dump(outpath=output_path)
if run.instantiation.create_artifact:
art.create_artifact(
artifact_name=run.instantiation.artifact_name, paths_to_include=run.instantiation.artifact_paths
artifact_name=run.instantiation.artifact_name,
paths_to_include=run.instantiation.artifact_paths,
)
await runner.cleanup()
await sim_executor.cleanup()
async def start(self) -> None:
"""Execute the runs defined in `self.runnable`."""
......@@ -102,7 +186,6 @@ class LocalParallelRuntime(run_base.Runtime):
cores: int,
mem: int | None = None,
verbose: bool = False,
executor: command_executor.Executor = command_executor.LocalExecutor(),
):
super().__init__()
self._runs_noprereq: list[run_base.Run] = []
......@@ -113,7 +196,6 @@ class LocalParallelRuntime(run_base.Runtime):
self._cores: int = cores
self._mem: int | None = mem
self._verbose: bool = verbose
self._executor = executor
self._pending_jobs: set[asyncio.Task] = set()
self._starter_task: asyncio.Task
......@@ -133,19 +215,17 @@ class LocalParallelRuntime(run_base.Runtime):
async def do_run(self, run: run_base.Run) -> run_base.Run | None:
"""Actually executes `run`."""
try:
runner = simulation_executor.SimulationSimpleRunner(self._executor, run.instantiation, self._verbose)
sim_executor = sim_exec.SimulationExecutor(run.instantiation, self._verbose)
if self._profile_int is not None:
runner._profile_int = self._profile_int
await runner.prepare()
for sim in run.instantiation.simulation.all_simulators():
runner.add_listener(sim, command_executor.LegacyOutputListener())
sim_executor._profile_int = self._profile_int
await sim_executor.prepare()
except asyncio.CancelledError:
# it is safe to just exit here because we are not running any
# simulators yet
return None
print("starting run ", run.name())
run._output = await runner.run() # already handles CancelledError
run._output = await sim_executor.run() # already handles CancelledError
# if the log is huge, this step takes some time
if self._verbose:
......@@ -154,7 +234,7 @@ class LocalParallelRuntime(run_base.Runtime):
output_path = run.instantiation.get_simulation_output_path()
run._output.dump(outpath=output_path)
await runner.cleanup()
await sim_executor.cleanup()
print("finished run ", run.name())
return run
......@@ -163,7 +243,9 @@ class LocalParallelRuntime(run_base.Runtime):
"""Wait for any run to terminate and return."""
assert self._pending_jobs
done, self._pending_jobs = await asyncio.wait(self._pending_jobs, return_when=asyncio.FIRST_COMPLETED)
done, self._pending_jobs = await asyncio.wait(
self._pending_jobs, return_when=asyncio.FIRST_COMPLETED
)
for r_awaitable in done:
run = await r_awaitable
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment