Commit 21610420 authored by Antoine Kaufmann's avatar Antoine Kaufmann
Browse files

experiments: refactor runtime code into separate files

parent 623eb3e5
# Copyright 2021 Max Planck Institute for Software Systems, and
# National University of Singapore
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from simbricks.runtime.common import (Run, Runtime)
from simbricks.runtime.local import (LocalSimpleRuntime, LocalParallelRuntime)
from simbricks.runtime.slurm import SlurmRuntime
# Copyright 2021 Max Planck Institute for Software Systems, and
# National University of Singapore
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import shutil
import pathlib
class Run(object):
def __init__(self, experiment, index, env, outpath, prereq=None):
self.experiment = experiment
self.index = index
self.env = env
self.outpath = outpath
self.output = None
self.prereq = prereq
def name(self):
return self.experiment.name + '.' + str(self.index)
def prep_dirs(self):
shutil.rmtree(self.env.workdir, ignore_errors=True)
if self.env.create_cp:
shutil.rmtree(self.env.cpdir, ignore_errors=True)
pathlib.Path(self.env.workdir).mkdir(parents=True, exist_ok=True)
pathlib.Path(self.env.cpdir).mkdir(parents=True, exist_ok=True)
class Runtime(object):
def add_run(self, run):
pass
def start(self):
pass
\ No newline at end of file
......@@ -21,42 +21,11 @@
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import asyncio
import pickle
import os
import pathlib
import shutil
import re
from simbricks.runtime.common import *
import simbricks.experiments as exp
class Run(object):
def __init__(self, experiment, index, env, outpath, prereq=None):
self.experiment = experiment
self.index = index
self.env = env
self.outpath = outpath
self.output = None
self.prereq = prereq
def name(self):
return self.experiment.name + '.' + str(self.index)
def prep_dirs(self):
shutil.rmtree(self.env.workdir, ignore_errors=True)
if self.env.create_cp:
shutil.rmtree(self.env.cpdir, ignore_errors=True)
pathlib.Path(self.env.workdir).mkdir(parents=True, exist_ok=True)
pathlib.Path(self.env.cpdir).mkdir(parents=True, exist_ok=True)
class Runtime(object):
def add_run(self, run):
pass
def start(self):
pass
class LocalSimpleRuntime(Runtime):
def __init__(self, verbose=False):
self.runnable = []
......@@ -178,84 +147,3 @@ class LocalParallelRuntime(Runtime):
def start(self):
asyncio.run(self.do_start())
class SlurmRuntime(Runtime):
def __init__(self, slurmdir, args, verbose=False, cleanup=True):
self.runnable = []
self.slurmdir = slurmdir
self.args = args
self.verbose = verbose
self.cleanup = cleanup
def add_run(self, run):
self.runnable.append(run)
def prep_run(self, run):
exp = run.experiment
e_idx = exp.name + f'-{run.index}' + '.exp'
exp_path = os.path.join(self.slurmdir, e_idx)
log_idx = exp.name + f'-{run.index}' + '.log'
exp_log = os.path.join(self.slurmdir, log_idx)
sc_idx = exp.name + f'-{run.index}' + '.sh'
exp_script = os.path.join(self.slurmdir, sc_idx)
print(exp_path)
print(exp_log)
print(exp_script)
# write out pickled run
with open(exp_path, 'wb') as f:
run.prereq = None # we don't want to pull in the prereq too
pickle.dump(run, f)
# create slurm batch script
with open(exp_script, 'w') as f:
f.write('#!/bin/sh\n')
f.write('#SBATCH -o %s -e %s\n' % (exp_log, exp_log))
#f.write('#SBATCH -c %d\n' % (exp.resreq_cores(),))
f.write('#SBATCH --mem=%dM\n' % (exp.resreq_mem(),))
f.write('#SBATCH --job-name="%s"\n' % (run.name(),))
f.write('#SBATCH --exclude=spyder[01-05],spyder16\n')
f.write('#SBATCH -c 32\n')
f.write('#SBATCH --nodes=1\n')
if exp.timeout is not None:
h = int(exp.timeout / 3600)
m = int((exp.timeout % 3600) / 60)
s = int(exp.timeout % 60)
f.write('#SBATCH --time=%02d:%02d:%02d\n' % (h, m, s))
extra = ''
if self.verbose:
extra = '--verbose'
f.write('python3 run.py %s --pickled %s\n' % (extra, exp_path))
f.write('status=$?\n')
if self.cleanup:
f.write('rm -rf %s\n' % (run.env.workdir))
f.write('exit $status\n')
return exp_script
def start(self):
pathlib.Path(self.slurmdir).mkdir(parents=True, exist_ok=True)
jid_re = re.compile(r'Submitted batch job ([0-9]+)')
for run in self.runnable:
if run.prereq is None:
dep_cmd = ''
else:
dep_cmd = '--dependency=afterok:' + str(run.prereq.job_id)
script = self.prep_run(run)
stream = os.popen('sbatch %s %s' % (dep_cmd, script))
output = stream.read()
result = stream.close()
if result is not None:
raise Exception('running sbatch failed')
m = jid_re.search(output)
run.job_id = int(m.group(1))
# Copyright 2021 Max Planck Institute for Software Systems, and
# National University of Singapore
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import pickle
import os
import pathlib
import re
from simbricks.runtime.common import *
class SlurmRuntime(Runtime):
def __init__(self, slurmdir, args, verbose=False, cleanup=True):
self.runnable = []
self.slurmdir = slurmdir
self.args = args
self.verbose = verbose
self.cleanup = cleanup
def add_run(self, run):
self.runnable.append(run)
def prep_run(self, run):
exp = run.experiment
e_idx = exp.name + f'-{run.index}' + '.exp'
exp_path = os.path.join(self.slurmdir, e_idx)
log_idx = exp.name + f'-{run.index}' + '.log'
exp_log = os.path.join(self.slurmdir, log_idx)
sc_idx = exp.name + f'-{run.index}' + '.sh'
exp_script = os.path.join(self.slurmdir, sc_idx)
print(exp_path)
print(exp_log)
print(exp_script)
# write out pickled run
with open(exp_path, 'wb') as f:
run.prereq = None # we don't want to pull in the prereq too
pickle.dump(run, f)
# create slurm batch script
with open(exp_script, 'w') as f:
f.write('#!/bin/sh\n')
f.write('#SBATCH -o %s -e %s\n' % (exp_log, exp_log))
#f.write('#SBATCH -c %d\n' % (exp.resreq_cores(),))
f.write('#SBATCH --mem=%dM\n' % (exp.resreq_mem(),))
f.write('#SBATCH --job-name="%s"\n' % (run.name(),))
f.write('#SBATCH --exclude=spyder[01-05],spyder16\n')
f.write('#SBATCH -c 32\n')
f.write('#SBATCH --nodes=1\n')
if exp.timeout is not None:
h = int(exp.timeout / 3600)
m = int((exp.timeout % 3600) / 60)
s = int(exp.timeout % 60)
f.write('#SBATCH --time=%02d:%02d:%02d\n' % (h, m, s))
extra = ''
if self.verbose:
extra = '--verbose'
f.write('python3 run.py %s --pickled %s\n' % (extra, exp_path))
f.write('status=$?\n')
if self.cleanup:
f.write('rm -rf %s\n' % (run.env.workdir))
f.write('exit $status\n')
return exp_script
def start(self):
pathlib.Path(self.slurmdir).mkdir(parents=True, exist_ok=True)
jid_re = re.compile(r'Submitted batch job ([0-9]+)')
for run in self.runnable:
if run.prereq is None:
dep_cmd = ''
else:
dep_cmd = '--dependency=afterok:' + str(run.prereq.job_id)
script = self.prep_run(run)
stream = os.popen('sbatch %s %s' % (dep_cmd, script))
output = stream.read()
result = stream.close()
if result is not None:
raise Exception('running sbatch failed')
m = jid_re.search(output)
run.job_id = int(m.group(1))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment