Commit f012bd04 authored by Antoine Kaufmann

experiments: add slurm runtime

parent 24ee28c3
 import asyncio
+import pickle
+import os
+import pathlib
 import modes.experiments as exp
@@ -11,7 +14,7 @@ class Run(object):
         self.output = None

     def name(self):
-        return self.experiment.name + '[' + str(self.index) + ']'
+        return self.experiment.name + '.' + str(self.index)


 class Runtime(object):
     def add_run(self, run):
@@ -120,3 +123,56 @@ class LocalParallelRuntime(Runtime):
     def start(self):
         asyncio.run(self.do_start())
+
+
+class SlurmRuntime(Runtime):
+    def __init__(self, slurmdir, args, verbose=False, cleanup=True):
+        self.runnable = []
+        self.slurmdir = slurmdir
+        self.args = args
+        self.verbose = verbose
+        self.cleanup = cleanup
+
+    def add_run(self, run):
+        self.runnable.append(run)
+
+    def prep_run(self, run):
+        exp = run.experiment
+        exp_path = '%s/%s-%d.exp' % (self.slurmdir, exp.name, run.index)
+        exp_log = '%s/%s-%d.log' % (self.slurmdir, exp.name, run.index)
+        exp_script = '%s/%s-%d.sh' % (self.slurmdir, exp.name, run.index)
+
+        # write out pickled experiment
+        with open(exp_path, 'wb') as f:
+            pickle.dump(exp, f)
+
+        # create slurm batch script
+        with open(exp_script, 'w') as f:
+            f.write('#!/bin/sh\n')
+            f.write('#SBATCH -o %s -e %s\n' % (exp_log, exp_log))
+            f.write('#SBATCH -c %d\n' % (exp.resreq_cores(),))
+            f.write('#SBATCH --mem=%dM\n' % (exp.resreq_mem(),))
+            f.write('#SBATCH --job-name="%s"\n' % (run.name(),))
+            if exp.timeout is not None:
+                h = int(exp.timeout / 3600)
+                m = int((exp.timeout % 3600) / 60)
+                s = int(exp.timeout % 60)
+                f.write('#SBATCH --time=%02d:%02d:%02d\n' % (h, m, s))
+            f.write('mkdir -p %s\n' % (self.args.workdir))
+            f.write(('python3 run.py --repo=%s --workdir=%s --outdir=%s '
+                     '--firstrun=%d --runs=1 %s\n') % (self.args.repo,
+                     self.args.workdir, self.args.outdir, run.index,
+                     exp_path))
+            f.write('status=$?\n')
+            if self.cleanup:
+                f.write('rm -rf %s\n' % (run.env.workdir))
+            f.write('exit $status\n')
+        return exp_script
+
+    def start(self):
+        pathlib.Path(self.slurmdir).mkdir(parents=True, exist_ok=True)
+
+        for run in self.runnable:
+            script = self.prep_run(run)
+            os.system('sbatch ' + script)
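For orientation, a minimal usage sketch of the new runtime follows (it mirrors what run.py below does when invoked with --slurm). The experiment file name, the argparse.Namespace stand-in for the parsed arguments, and the modes.runtime import path are illustrative assumptions, not part of this commit:

# Minimal sketch: drive SlurmRuntime directly with one pickled experiment.
# 'myexp.exp', the Namespace fields, and the import path are assumptions.
import pickle
import argparse
import modes.experiments as exp
import modes.runtime as runtime

args = argparse.Namespace(repo='.', workdir='./work', outdir='./out')

with open('myexp.exp', 'rb') as f:    # experiments are shipped as pickles
    e = pickle.load(f)

rt = runtime.SlurmRuntime('./slurm/', args, verbose=True)
env = exp.ExpEnv(args.repo, '%s/%s/%d' % (args.workdir, e.name, 1))
rt.add_run(runtime.Run(e, 1, env, '%s/%s-1.json' % (args.outdir, e.name)))
rt.start()   # pickles the experiment, writes the batch script, calls sbatch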
@@ -16,6 +16,8 @@ parser.add_argument('experiments', metavar='EXP', type=str, nargs='+',
                     help='An experiment file to run')
 parser.add_argument('--runs', metavar='N', type=int, default=1,
                     help='Number of repetition for each experiment')
+parser.add_argument('--firstrun', metavar='N', type=int, default=1,
+                    help='ID for first run')
 parser.add_argument('--verbose', action='store_const', const=True,
                     default=False,
                     help='Verbose output')
@@ -38,6 +40,13 @@ g_par.add_argument('--cores', metavar='N', type=int,
 g_par.add_argument('--mem', metavar='N', type=int, default=None,
                    help='Memory limit for parallel runs (in MB)')

+g_slurm = parser.add_argument_group('Slurm Runtime')
+g_slurm.add_argument('--slurm', dest='runtime', action='store_const',
+                     const='slurm', default='sequential',
+                     help='Use slurm instead of sequential runtime')
+g_slurm.add_argument('--slurmdir', metavar='DIR', type=str,
+                     default='./slurm/', help='Slurm communication directory')
+
 args = parser.parse_args()

 experiments = []
@@ -53,27 +62,32 @@ for path in args.experiments:
     with open(path, 'rb') as f:
         experiments.append(pickle.load(f))

-mkdir_if_not_exists(args.workdir)
+if args.runtime != 'slurm':
+    mkdir_if_not_exists(args.workdir)
 mkdir_if_not_exists(args.outdir)

 if args.runtime == 'parallel':
     rt = runtime.LocalParallelRuntime(cores=args.cores, mem=args.mem,
                                       verbose=args.verbose)
+elif args.runtime == 'slurm':
+    rt = runtime.SlurmRuntime(args.slurmdir, args, verbose=args.verbose)
 else:
     rt = runtime.LocalSimpleRuntime(verbose=args.verbose)

 for e in experiments:
     workdir_base = '%s/%s' % (args.workdir, e.name)
-    mkdir_if_not_exists(workdir_base)
+    if args.runtime != 'slurm':
+        mkdir_if_not_exists(workdir_base)

-    for run in range(0, args.runs):
+    for run in range(args.firstrun, args.firstrun + args.runs):
         outpath = '%s/%s-%d.json' % (args.outdir, e.name, run)
         if os.path.exists(outpath):
             print('skip %s run %d' % (e.name, run))
             continue

         workdir = '%s/%d' % (workdir_base, run)
-        mkdir_if_not_exists(workdir)
+        if args.runtime != 'slurm':
+            mkdir_if_not_exists(workdir)

         env = exp.ExpEnv(args.repo, workdir)
         rt.add_run(runtime.Run(e, run, env, outpath))
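A note on the new --firstrun flag: the batch script generated by SlurmRuntime.prep_run() re-invokes run.py with --firstrun=<run.index> --runs=1, so each Slurm job executes exactly one run while output files and working directories keep their original numbering. A quick illustration of the changed loop bounds (values are arbitrary):

# Previously run.py always iterated range(0, args.runs); now the window starts
# at args.firstrun. With --firstrun=3 --runs=2 the run indices are 3 and 4.
firstrun, runs = 3, 2
print(list(range(firstrun, firstrun + runs)))   # prints [3, 4]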