experiments: support for checkpoints (maybe)

3f91e459 · Antoine Kaufmann · c6139684 · 3f91e459 · 3f91e459
Commit 3f91e459 authored Nov 04, 2020 by Antoine Kaufmann
Hide whitespace changes
Inline Side-by-side

Showing with 24 additions and 11 deletions

experiments/modes/runtime.py +1 -2

experiments/run.py +23 -9

No files found.
--- a/experiments/modes/runtime.py
+++ b/experiments/modes/runtime.py
@@ -127,7 +127,7 @@ class LocalParallelRuntime(Runtime):
        runs = self.runs_noprereq + self.runs_prereq
        for run in runs:
            # check if we first have to wait for memory or cores
-            while not self.enough_resources(run) or not self.prereq_ready(run):
+            while not self.enough_resources(run):
                print('waiting for resources')
                await self.wait_completion()

@@ -205,7 +205,6 @@ class SlurmRuntime(Runtime):

            script = self.prep_run(run)

-            cmd = 'sbatch ' + script
            stream = os.popen('sbatch %s %s' % (dep_cmd, script))
            output = stream.read()
            result = stream.close()

--- a/experiments/run.py
+++ b/experiments/run.py
@@ -62,6 +62,22 @@ elif args.runtime == 'slurm':
 else:
    rt = runtime.LocalSimpleRuntime(verbose=args.verbose)

+def add_exp(e, run, prereq, create_cp, restore_cp):  # queue one run of experiment e on rt; returns the Run, or None when skipped
+    outpath = '%s/%s-%d.json' % (args.outdir, e.name, run)  # output file for this experiment name / run index
+    if os.path.exists(outpath):  # output file already present: do not schedule this run again
+        print('skip %s run %d' % (e.name, run))
+        return None
+
+    workdir = '%s/%s/%d' % (args.workdir, e.name, run)  # working directory specific to this experiment and run index
+
+    env = exp.ExpEnv(args.repo, workdir)
+    env.create_cp = create_cp  # checkpoint flags are handed over as attributes on the environment object
+    env.restore_cp = restore_cp
+
+    run = runtime.Run(e, run, env, outpath, prereq)  # NOTE: rebinds `run` from the run index to the Run object
+    rt.add_run(run)
+    return run
+
 # load experiments
 if not args.pickled:
    # default: load python modules with experiments
@@ -75,16 +91,14 @@ if not args.pickled:
        experiments += mod.experiments

    for e in experiments:
-        for run in range(args.firstrun, args.firstrun + args.runs):
-            outpath = '%s/%s-%d.json' % (args.outdir, e.name, run)
-            if os.path.exists(outpath):
-                print('skip %s run %d' % (e.name, run))
-                continue
-
-            workdir = '%s/%s/%d' % (args.workdir, e.name, run)
+        # if this is an experiment with a checkpoint we might have to create it
+        if e.checkpoint:
+            prereq = add_exp(e, 0, None, True, False)
+        else:
+            prereq = None

-            env = exp.ExpEnv(args.repo, workdir)
-            rt.add_run(runtime.Run(e, run, env, outpath))
+        for run in range(args.firstrun, args.firstrun + args.runs):
+            add_exp(e, run, prereq, False, e.checkpoint)
 else:
    # otherwise load pickled run object
    for path in args.experiments: