Commit f012bd04 authored by Antoine Kaufmann

experiments: add slurm runtime

parent 24ee28c3
 import asyncio
+import pickle
+import os
+import pathlib
 import modes.experiments as exp
@@ -11,7 +14,7 @@ class Run(object):
         self.output = None

     def name(self):
-        return self.experiment.name + '[' + str(self.index) + ']'
+        return self.experiment.name + '.' + str(self.index)


 class Runtime(object):
     def add_run(self, run):
@@ -120,3 +123,56 @@ class LocalParallelRuntime(Runtime):
     def start(self):
         asyncio.run(self.do_start())
+
+
+class SlurmRuntime(Runtime):
+    def __init__(self, slurmdir, args, verbose=False, cleanup=True):
+        self.runnable = []
+        self.slurmdir = slurmdir
+        self.args = args
+        self.verbose = verbose
+        self.cleanup = cleanup
+
+    def add_run(self, run):
+        self.runnable.append(run)
+
+    def prep_run(self, run):
+        exp = run.experiment
+        exp_path = '%s/%s-%d.exp' % (self.slurmdir, exp.name, run.index)
+        exp_log = '%s/%s-%d.log' % (self.slurmdir, exp.name, run.index)
+        exp_script = '%s/%s-%d.sh' % (self.slurmdir, exp.name, run.index)
+
+        # write out pickled experiment
+        with open(exp_path, 'wb') as f:
+            pickle.dump(exp, f)
+
+        # create slurm batch script
+        with open(exp_script, 'w') as f:
+            f.write('#!/bin/sh\n')
+            f.write('#SBATCH -o %s -e %s\n' % (exp_log, exp_log))
+            f.write('#SBATCH -c %d\n' % (exp.resreq_cores(),))
+            f.write('#SBATCH --mem=%dM\n' % (exp.resreq_mem(),))
+            f.write('#SBATCH --job-name="%s"\n' % (run.name(),))
+            if exp.timeout is not None:
+                h = int(exp.timeout / 3600)
+                m = int((exp.timeout % 3600) / 60)
+                s = int(exp.timeout % 60)
+                f.write('#SBATCH --time=%02d:%02d:%02d\n' % (h, m, s))
+            f.write('mkdir -p %s\n' % (self.args.workdir))
+            f.write(('python3 run.py --repo=%s --workdir=%s --outdir=%s '
+                     '--firstrun=%d --runs=1 %s\n') % (self.args.repo,
+                     self.args.workdir, self.args.outdir, run.index,
+                     exp_path))
+            f.write('status=$?\n')
+            if self.cleanup:
+                f.write('rm -rf %s\n' % (run.env.workdir))
+            f.write('exit $status\n')
+        return exp_script
+
+    def start(self):
+        pathlib.Path(self.slurmdir).mkdir(parents=True, exist_ok=True)
+
+        for run in self.runnable:
+            script = self.prep_run(run)
+            os.system('sbatch ' + script)
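For orientation, a minimal usage sketch of the new runtime follows (it mirrors what run.py below does when invoked with --slurm). The experiment file name, the argparse.Namespace stand-in for the parsed arguments, and the modes.runtime import path are illustrative assumptions, not part of this commit:

# Minimal sketch: drive SlurmRuntime directly with one pickled experiment.
# 'myexp.exp', the Namespace fields, and the import path are assumptions.
import pickle
import argparse
import modes.experiments as exp
import modes.runtime as runtime

args = argparse.Namespace(repo='.', workdir='./work', outdir='./out')

with open('myexp.exp', 'rb') as f:    # experiments are shipped as pickles
    e = pickle.load(f)

rt = runtime.SlurmRuntime('./slurm/', args, verbose=True)
env = exp.ExpEnv(args.repo, '%s/%s/%d' % (args.workdir, e.name, 1))
rt.add_run(runtime.Run(e, 1, env, '%s/%s-1.json' % (args.outdir, e.name)))
rt.start()   # pickles the experiment, writes the batch script, calls sbatch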
@@ -16,6 +16,8 @@ parser.add_argument('experiments', metavar='EXP', type=str, nargs='+',
                     help='An experiment file to run')
 parser.add_argument('--runs', metavar='N', type=int, default=1,
                     help='Number of repetition for each experiment')
+parser.add_argument('--firstrun', metavar='N', type=int, default=1,
+                    help='ID for first run')
 parser.add_argument('--verbose', action='store_const', const=True,
                     default=False,
                     help='Verbose output')
@@ -38,6 +40,13 @@ g_par.add_argument('--cores', metavar='N', type=int,
 g_par.add_argument('--mem', metavar='N', type=int, default=None,
                    help='Memory limit for parallel runs (in MB)')

+g_slurm = parser.add_argument_group('Slurm Runtime')
+g_slurm.add_argument('--slurm', dest='runtime', action='store_const',
+                     const='slurm', default='sequential',
+                     help='Use slurm instead of sequential runtime')
+g_slurm.add_argument('--slurmdir', metavar='DIR', type=str,
+                     default='./slurm/', help='Slurm communication directory')
+
 args = parser.parse_args()

 experiments = []
@@ -53,27 +62,32 @@ for path in args.experiments:
     with open(path, 'rb') as f:
         experiments.append(pickle.load(f))

-mkdir_if_not_exists(args.workdir)
+if args.runtime != 'slurm':
+    mkdir_if_not_exists(args.workdir)
 mkdir_if_not_exists(args.outdir)

 if args.runtime == 'parallel':
     rt = runtime.LocalParallelRuntime(cores=args.cores, mem=args.mem,
                                       verbose=args.verbose)
+elif args.runtime == 'slurm':
+    rt = runtime.SlurmRuntime(args.slurmdir, args, verbose=args.verbose)
 else:
     rt = runtime.LocalSimpleRuntime(verbose=args.verbose)

 for e in experiments:
     workdir_base = '%s/%s' % (args.workdir, e.name)
-    mkdir_if_not_exists(workdir_base)
+    if args.runtime != 'slurm':
+        mkdir_if_not_exists(workdir_base)

-    for run in range(0, args.runs):
+    for run in range(args.firstrun, args.firstrun + args.runs):
         outpath = '%s/%s-%d.json' % (args.outdir, e.name, run)
         if os.path.exists(outpath):
             print('skip %s run %d' % (e.name, run))
             continue

         workdir = '%s/%d' % (workdir_base, run)
-        mkdir_if_not_exists(workdir)
+        if args.runtime != 'slurm':
+            mkdir_if_not_exists(workdir)

         env = exp.ExpEnv(args.repo, workdir)
         rt.add_run(runtime.Run(e, run, env, outpath))
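A note on the new --firstrun flag: the batch script generated by SlurmRuntime.prep_run() re-invokes run.py with --firstrun=<run.index> --runs=1, so each Slurm job executes exactly one run while output files and working directories keep their original numbering. A quick illustration of the changed loop bounds (values are arbitrary):

# Previously run.py always iterated range(0, args.runs); now the window starts
# at args.firstrun. With --firstrun=3 --runs=2 the run indices are 3 and 4.
firstrun, runs = 3, 2
print(list(range(firstrun, firstrun + runs)))   # prints [3, 4]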