Commit 3f91e459 authored by Antoine Kaufmann
Browse files

experiments: support for checkpoints (maybe)

parent c6139684
...@@ -127,7 +127,7 @@ class LocalParallelRuntime(Runtime): ...@@ -127,7 +127,7 @@ class LocalParallelRuntime(Runtime):
runs = self.runs_noprereq + self.runs_prereq runs = self.runs_noprereq + self.runs_prereq
for run in runs: for run in runs:
# check if we first have to wait for memory or cores # check if we first have to wait for memory or cores
while not self.enough_resources(run) or not self.prereq_ready(run): while not self.enough_resources(run):
print('waiting for resources') print('waiting for resources')
await self.wait_completion() await self.wait_completion()
...@@ -205,7 +205,6 @@ class SlurmRuntime(Runtime): ...@@ -205,7 +205,6 @@ class SlurmRuntime(Runtime):
script = self.prep_run(run) script = self.prep_run(run)
cmd = 'sbatch ' + script
stream = os.popen('sbatch %s %s' % (dep_cmd, script)) stream = os.popen('sbatch %s %s' % (dep_cmd, script))
output = stream.read() output = stream.read()
result = stream.close() result = stream.close()
......
...@@ -62,6 +62,22 @@ elif args.runtime == 'slurm': ...@@ -62,6 +62,22 @@ elif args.runtime == 'slurm':
else: else:
rt = runtime.LocalSimpleRuntime(verbose=args.verbose) rt = runtime.LocalSimpleRuntime(verbose=args.verbose)
def add_exp(e, run, prereq, create_cp, restore_cp):
    """Register one run of experiment `e` with the runtime `rt`.

    The result file is `<args.outdir>/<e.name>-<run>.json`; if that file
    already exists the run is skipped and None is returned.  Otherwise an
    `exp.ExpEnv` rooted at `<args.workdir>/<e.name>/<run>` is created, the
    `create_cp` / `restore_cp` flags are stored on it, and a `runtime.Run`
    (carrying `prereq`) is passed to `rt.add_run`.

    Returns the new `runtime.Run` object, or None when the run was skipped.
    """
    result_path = '%s/%s-%d.json' % (args.outdir, e.name, run)
    if os.path.exists(result_path):
        # result file already present: do not schedule this run again
        print('skip %s run %d' % (e.name, run))
        return None

    exp_env = exp.ExpEnv(args.repo, '%s/%s/%d' % (args.workdir, e.name, run))
    exp_env.create_cp = create_cp
    exp_env.restore_cp = restore_cp

    # keep the integer run index and the Run object under separate names
    new_run = runtime.Run(e, run, exp_env, result_path, prereq)
    rt.add_run(new_run)
    return new_run
# load experiments # load experiments
if not args.pickled: if not args.pickled:
# default: load python modules with experiments # default: load python modules with experiments
...@@ -75,16 +91,14 @@ if not args.pickled: ...@@ -75,16 +91,14 @@ if not args.pickled:
experiments += mod.experiments experiments += mod.experiments
for e in experiments: for e in experiments:
for run in range(args.firstrun, args.firstrun + args.runs): # if this is an experiment with a checkpoint we might have to create it
outpath = '%s/%s-%d.json' % (args.outdir, e.name, run) if e.checkpoint:
if os.path.exists(outpath): prereq = add_exp(e, 0, None, True, False)
print('skip %s run %d' % (e.name, run)) else:
continue prereq = None
workdir = '%s/%s/%d' % (args.workdir, e.name, run)
env = exp.ExpEnv(args.repo, workdir) for run in range(args.firstrun, args.firstrun + args.runs):
rt.add_run(runtime.Run(e, run, env, outpath)) add_exp(e, run, prereq, False, e.checkpoint)
else: else:
# otherwise load pickled run object # otherwise load pickled run object
for path in args.experiments: for path in args.experiments:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment