Commit eacbf429 authored by Jonas Kaufmann
Browse files

simbricks-run: update to work with changes to simulation executor, throw out...

simbricks-run: update to work with changes to simulation executor, throw out all things that are no longer supported
parent 588595ae
......@@ -20,25 +20,22 @@
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""This is the top-level module of the SimBricks orchestration framework that
users interact with."""
users interact with for running simulations locally."""
import argparse
import asyncio
import fnmatch
import importlib
import importlib.util
import json
import os
import pathlib
import signal
import sys
from simbricks.runtime import output as sim_out
from simbricks.orchestration.instantiation import base as inst_base
from simbricks.runtime.runs import base as runs_base
from simbricks.runtime import output as sim_out
from simbricks.runtime.runs import base as runs_base
from simbricks.runtime.runs import local as rt_local
from simbricks.runtime import command_executor
def parse_args() -> argparse.Namespace:
......@@ -65,13 +62,6 @@ def parse_args() -> argparse.Namespace:
nargs="+",
help="Only run experiments matching the given Unix shell style patterns",
)
parser.add_argument(
"--pickled",
action="store_const",
const=True,
default=False,
help="Interpret experiment modules as pickled runs instead of .py files",
)
parser.add_argument(
"--runs",
metavar="N",
......@@ -79,9 +69,7 @@ def parse_args() -> argparse.Namespace:
default=1,
help="Number of repetition of each experiment",
)
parser.add_argument(
"--firstrun", metavar="N", type=int, default=1, help="ID for first run"
)
parser.add_argument("--firstrun", metavar="N", type=int, default=1, help="ID for first run")
parser.add_argument(
"--force",
action="store_const",
......@@ -116,45 +104,17 @@ def parse_args() -> argparse.Namespace:
g_env.add_argument(
"--repo",
metavar="DIR",
type=str,
default=os.path.dirname(__file__) + "/..",
type=pathlib.Path,
default=pathlib.Path("/simbricks"),
help="SimBricks repository directory",
)
g_env.add_argument(
"--workdir",
metavar="DIR",
type=str,
default="./out/",
type=pathlib.Path,
default=pathlib.Path("./out/"),
help="Work directory base",
)
g_env.add_argument(
"--outdir",
metavar="DIR",
type=str,
default="./out/",
help="Output directory base",
)
g_env.add_argument(
"--cpdir",
metavar="DIR",
type=str,
default="./out/",
help="Checkpoint directory base",
)
g_env.add_argument(
"--hosts",
metavar="JSON_FILE",
type=str,
default=None,
help="List of hosts to use (json)",
)
g_env.add_argument(
"--shmdir",
metavar="DIR",
type=str,
default=None,
help="Shared memory directory base (workdir if not set)",
)
# arguments for the parallel runtime
g_par = parser.add_argument_group("Parallel Runtime")
......@@ -181,89 +141,17 @@ def parse_args() -> argparse.Namespace:
help="Memory limit for parallel runs (in MB)",
)
# arguments for the slurm runtime
g_slurm = parser.add_argument_group("Slurm Runtime")
g_slurm.add_argument(
"--slurm",
dest="runtime",
action="store_const",
const="slurm",
default="sequential",
help="Use slurm instead of sequential runtime",
)
g_slurm.add_argument(
"--slurmdir",
metavar="DIR",
type=str,
default="./slurm/",
help="Slurm communication directory",
)
# arguments for the distributed runtime
g_dist = parser.add_argument_group("Distributed Runtime")
g_dist.add_argument(
"--dist",
dest="runtime",
action="store_const",
const="dist",
default="sequential",
help="Use sequential distributed runtime instead of local",
)
g_dist.add_argument(
"--auto-dist",
action="store_const",
const=True,
default=False,
help="Automatically distribute non-distributed experiments",
)
g_dist.add_argument(
"--proxy-type",
metavar="TYPE",
type=str,
default="sockets",
help="Proxy type to use (sockets,rdma) for auto distribution",
)
return parser.parse_args()
def load_executors(path: str) -> list[command_executor.Executor]:
    """Load a hosts list from a JSON file and build one executor per host.

    Each host entry must carry a ``"type"`` key (``"local"`` or
    ``"remote"``) and an ``"ip"`` key. Remote entries additionally need
    ``"host"`` and ``"workdir"`` and may supply optional ``"ssh_args"`` /
    ``"scp_args"`` lists that are appended to the executor's defaults.

    Args:
        path: Path to the JSON file describing the hosts.

    Returns:
        List of configured executors, in file order.

    Raises:
        RuntimeError: If a host entry has an unknown ``"type"``.
        KeyError: If a required key is missing from a host entry.
    """
    with open(path, "r", encoding="utf-8") as f:
        hosts = json.load(f)

    exs = []
    for h in hosts:
        if h["type"] == "local":
            ex = command_executor.LocalExecutor()
        elif h["type"] == "remote":
            ex = command_executor.RemoteExecutor(h["host"], h["workdir"])
            # Optional per-host extra arguments for ssh/scp invocations;
            # missing keys simply contribute nothing.
            ex.ssh_extra_args += h.get("ssh_args", [])
            ex.scp_extra_args += h.get("scp_args", [])
        else:
            raise RuntimeError(f'invalid host type "{h["type"]}"')
        ex.ip = h["ip"]
        exs.append(ex)
    return exs
def warn_multi_exec(executors: list[command_executor.Executor]):
    """Warn on stderr when more than one host executor is configured.

    The local runtimes only ever use the first executor, so any additional
    entries are ignored apart from this warning.
    """
    if len(executors) <= 1:
        return
    print(
        "Warning: multiple hosts specified, only using first one for now",
        file=sys.stderr,
    )
def add_exp(
instantiation: inst_base.Instantiation,
prereq: runs_base.Run | None,
rt: runs_base.Runtime,
args: argparse.Namespace,
) -> runs_base.Run:
env = inst_base.InstantiationEnvironment() # TODO: set from args
env = inst_base.InstantiationEnvironment(args.workdir, args.repo)
instantiation.env = env
output = sim_out.SimulationOutput(instantiation.simulation)
......@@ -274,31 +162,17 @@ def add_exp(
def main():
args = parse_args()
if args.hosts is None:
executors = [command_executor.LocalExecutor()]
else:
executors = load_executors(args.hosts)
# initialize runtime
if args.runtime == "parallel":
warn_multi_exec(executors)
rt = rt_local.LocalParallelRuntime(
cores=args.cores, mem=args.mem, verbose=args.verbose, executor=executors[0]
)
# elif args.runtime == "slurm":
# rt = runs.SlurmRuntime(args.slurmdir, args, verbose=args.verbose)
# elif args.runtime == "dist":
# rt = runs.DistributedSimpleRuntime(executors, verbose=args.verbose)
rt = rt_local.LocalParallelRuntime(cores=args.cores, mem=args.mem, verbose=args.verbose)
else:
warn_multi_exec(executors)
rt = rt_local.LocalSimpleRuntime(verbose=args.verbose, executor=executors[0])
rt = rt_local.LocalSimpleRuntime(verbose=args.verbose)
if args.profile_int:
rt.enable_profiler(args.profile_int)
# load experiments
if not args.pickled:
# default: load python modules with experiments
# load python modules with experiments
instantiations: list[inst_base.Instantiation] = []
for path in args.experiments:
modname, _ = os.path.splitext(os.path.basename(path))
......@@ -345,19 +219,14 @@ def main():
inst.create_checkpoint = False
inst.restore_checkpoint = True
prereq = add_exp(instantiation=checkpointing_inst, rt=rt, prereq=None)
prereq = add_exp(instantiation=checkpointing_inst, rt=rt, prereq=None, args=args)
for index in range(args.firstrun, args.firstrun + args.runs):
inst_copy = inst.copy()
inst_copy.preserve_tmp_folder = False
if index == args.firstrun + args.runs - 1:
inst_copy._preserve_checkpoints = False
add_exp(instantiation=inst_copy, rt=rt, prereq=prereq)
# else:
# # otherwise load pickled run object
# for path in args.experiments:
# with open(path, "rb") as f:
# rt.add_run(pickle.load(f))
add_exp(instantiation=inst_copy, rt=rt, prereq=prereq, args=args)
# register interrupt handler
signal.signal(signal.SIGINT, lambda *_: rt.interrupt())
......
......@@ -23,11 +23,97 @@
from __future__ import annotations
import asyncio
import typing
from simbricks.utils import artifatcs as art
from simbricks.runtime import simulation_executor
from simbricks.runtime import command_executor
from simbricks.runtime import simulation_executor as sim_exec
from simbricks.runtime.runs import base as run_base
from simbricks.utils import artifatcs as art
if typing.TYPE_CHECKING:
from simbricks.orchestration.instantiation import base as inst_base
from simbricks.orchestration.instantiation import proxy as inst_proxy
from simbricks.orchestration.simulation import base as sim_base
class LocalSimulationExecutorCallbacks(sim_exec.SimulationExecutorCallbacks):
    """Executor callbacks that mirror simulation output on the local console.

    Every callback first forwards to the base class; when verbose mode is
    enabled it then echoes the command or output lines to stdout, prefixed
    with the component (``prepare``, simulator full name, or proxy name)
    they belong to.
    """

    def __init__(self, instantiation: inst_base.Instantiation, verbose: bool):
        super().__init__(instantiation)
        self._instantiation = instantiation
        # Controls whether any of the callbacks below print to stdout.
        self._verbose = verbose

    # ---------------------------------------
    # Callbacks related to whole simulation -
    # ---------------------------------------

    async def simulation_prepare_cmd_start(self, cmd: str) -> None:
        await super().simulation_prepare_cmd_start(cmd)
        if not self._verbose:
            return
        print(f"+ [prepare] {cmd}")

    async def simulation_prepare_cmd_stdout(self, cmd: str, lines: list[str]) -> None:
        await super().simulation_prepare_cmd_stdout(cmd, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[prepare] {line}")

    async def simulation_prepare_cmd_stderr(self, cmd: str, lines: list[str]) -> None:
        await super().simulation_prepare_cmd_stderr(cmd, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[prepare] {line}")

    # -----------------------------
    # Simulator-related callbacks -
    # -----------------------------

    async def simulator_started(self, sim: sim_base.Simulator, cmd: str) -> None:
        await super().simulator_started(sim, cmd)
        if not self._verbose:
            return
        print(f"+ [{sim.full_name()}] {cmd}")

    async def simulator_exited(self, sim: sim_base.Simulator, exit_code: int) -> None:
        await super().simulator_exited(sim, exit_code)
        if not self._verbose:
            return
        print(f"- [{sim.full_name()}] exited with code {exit_code}")

    async def simulator_stdout(self, sim: sim_base.Simulator, lines: list[str]) -> None:
        await super().simulator_stdout(sim, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[{sim.full_name()}] {line}")

    async def simulator_stderr(self, sim: sim_base.Simulator, lines: list[str]) -> None:
        await super().simulator_stderr(sim, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[{sim.full_name()}] {line}")

    # -------------------------
    # Proxy-related callbacks -
    # -------------------------

    async def proxy_started(self, proxy: inst_proxy.Proxy, cmd: str) -> None:
        await super().proxy_started(proxy, cmd)
        if not self._verbose:
            return
        print(f"+ [{proxy.name}] {cmd}")

    async def proxy_exited(self, proxy: inst_proxy.Proxy, exit_code: int) -> None:
        await super().proxy_exited(proxy, exit_code)
        if not self._verbose:
            return
        print(f"- [{proxy.name}] exited with code {exit_code}")

    async def proxy_stdout(self, proxy: inst_proxy.Proxy, lines: list[str]) -> None:
        await super().proxy_stdout(proxy, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[{proxy.name}] {line}")

    async def proxy_stderr(self, proxy: inst_proxy.Proxy, lines: list[str]) -> None:
        await super().proxy_stderr(proxy, lines)
        if not self._verbose:
            return
        for line in lines:
            print(f"[{proxy.name}] {line}")
class LocalSimpleRuntime(run_base.Runtime):
......@@ -36,13 +122,11 @@ class LocalSimpleRuntime(run_base.Runtime):
def __init__(
self,
verbose=False,
executor: command_executor.Executor = command_executor.LocalExecutor(),
):
super().__init__()
self._runnable: list[run_base.Run] = []
self._complete: list[run_base.Run] = []
self._verbose: bool = verbose
self._executor: command_executor.Executor = executor
self._running: asyncio.Task | None = None
def add_run(self, run: run_base.Run) -> None:
......@@ -52,18 +136,17 @@ class LocalSimpleRuntime(run_base.Runtime):
"""Actually executes `run`."""
try:
runner = simulation_executor.SimulationSimpleRunner(self._executor, run.instantiation, self._verbose)
if self._profile_int:
runner.profile_int = self.profile_int
await runner.prepare()
for sim in run.instantiation.simulation.all_simulators():
runner.add_listener(sim, command_executor.LegacyOutputListener())
callbacks = LocalSimulationExecutorCallbacks(run.instantiation, self._verbose)
sim_executor = sim_exec.SimulationExecutor(
run.instantiation, callbacks, self._verbose, self._profile_int
)
await sim_executor.prepare()
except asyncio.CancelledError:
# it is safe to just exit here because we are not running any
# simulators yet
return
run._output = await runner.run() # handles CancelledError
run._output = await sim_executor.run() # handles CancelledError
self._complete.append(run)
# if the log is huge, this step takes some time
......@@ -75,10 +158,11 @@ class LocalSimpleRuntime(run_base.Runtime):
run._output.dump(outpath=output_path)
if run.instantiation.create_artifact:
art.create_artifact(
artifact_name=run.instantiation.artifact_name, paths_to_include=run.instantiation.artifact_paths
artifact_name=run.instantiation.artifact_name,
paths_to_include=run.instantiation.artifact_paths,
)
await runner.cleanup()
await sim_executor.cleanup()
async def start(self) -> None:
"""Execute the runs defined in `self.runnable`."""
......@@ -102,7 +186,6 @@ class LocalParallelRuntime(run_base.Runtime):
cores: int,
mem: int | None = None,
verbose: bool = False,
executor: command_executor.Executor = command_executor.LocalExecutor(),
):
super().__init__()
self._runs_noprereq: list[run_base.Run] = []
......@@ -113,7 +196,6 @@ class LocalParallelRuntime(run_base.Runtime):
self._cores: int = cores
self._mem: int | None = mem
self._verbose: bool = verbose
self._executor = executor
self._pending_jobs: set[asyncio.Task] = set()
self._starter_task: asyncio.Task
......@@ -133,19 +215,17 @@ class LocalParallelRuntime(run_base.Runtime):
async def do_run(self, run: run_base.Run) -> run_base.Run | None:
"""Actually executes `run`."""
try:
runner = simulation_executor.SimulationSimpleRunner(self._executor, run.instantiation, self._verbose)
sim_executor = sim_exec.SimulationExecutor(run.instantiation, self._verbose)
if self._profile_int is not None:
runner._profile_int = self._profile_int
await runner.prepare()
for sim in run.instantiation.simulation.all_simulators():
runner.add_listener(sim, command_executor.LegacyOutputListener())
sim_executor._profile_int = self._profile_int
await sim_executor.prepare()
except asyncio.CancelledError:
# it is safe to just exit here because we are not running any
# simulators yet
return None
print("starting run ", run.name())
run._output = await runner.run() # already handles CancelledError
run._output = await sim_executor.run() # already handles CancelledError
# if the log is huge, this step takes some time
if self._verbose:
......@@ -154,7 +234,7 @@ class LocalParallelRuntime(run_base.Runtime):
output_path = run.instantiation.get_simulation_output_path()
run._output.dump(outpath=output_path)
await runner.cleanup()
await sim_executor.cleanup()
print("finished run ", run.name())
return run
......@@ -163,7 +243,9 @@ class LocalParallelRuntime(run_base.Runtime):
"""Wait for any run to terminate and return."""
assert self._pending_jobs
done, self._pending_jobs = await asyncio.wait(self._pending_jobs, return_when=asyncio.FIRST_COMPLETED)
done, self._pending_jobs = await asyncio.wait(
self._pending_jobs, return_when=asyncio.FIRST_COMPLETED
)
for r_awaitable in done:
run = await r_awaitable
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment