ycai / simbricks · Commits · a0494ee3

Commit a0494ee3 (unverified), authored Oct 04, 2024 by Jakob Görgen
Parent: e724a160

    support for gem5 checkpointing + execution of multiple runs

Showing 15 changed files with 398 additions and 447 deletions (+398 −447).
Changed files:

    experiments/pyexps/netperf_sysconf.py                                    +8   −4
    experiments/run_new.py                                                   +49  −81
    experiments/simbricks/orchestration/instantiation/__init__.py           +2   −0
    experiments/simbricks/orchestration/instantiation/base.py               +108 −130
    experiments/simbricks/orchestration/runtime/local.py                    +141 −141
    experiments/simbricks/orchestration/runtime_new/runs/base.py            +4   −6
    experiments/simbricks/orchestration/runtime_new/runs/local.py           +26  −15
    experiments/simbricks/orchestration/runtime_new/simulation_executor.py  +34  −36
    experiments/simbricks/orchestration/simulation/base.py                  +7   −15
    experiments/simbricks/orchestration/simulation/host.py                  +8   −5
    experiments/simbricks/orchestration/simulation/net/net_base.py          +0   −7
    experiments/simbricks/orchestration/simulation/output.py                +7   −3
    experiments/simbricks/orchestration/system/host/app.py                  +1   −1
    experiments/simbricks/orchestration/system/host/base.py                 +2   −2
    experiments/simbricks/orchestration/system/host/disk_images.py          +1   −1
experiments/pyexps/netperf_sysconf.py (+8 −4)

```diff
@@ -14,7 +14,7 @@ This scripts generates the experiments with all the combinations of different ex
 host_types = ["gem5"]
 nic_types = ["i40e"]
 net_types = ["switch"]
-experiments = []
+instantiations: list[inst.Instantiation] = []
 sys = system.System()
@@ -121,7 +121,8 @@ for host_type in host_types:
         else:
             raise NameError(net_type)

-        host_inst0 = sim.QemuSim(simulation)
+        host_inst0 = sim.Gem5Sim(simulation)
+        # host_inst0 = sim.QemuSim(simulation)
         host_inst0.add(host0)
         host_inst0.name = "Client-Host"
         # host_inst0.wait_terminate = True
@@ -129,7 +130,7 @@ for host_type in host_types:
         # host_inst1 = sim.Gem5Sim(simulation)
         host_inst1 = sim.QemuSim(simulation)
-        host_inst1.name = "Server-Simulator"
+        host_inst1.name = "Server-Host"
         host_inst1.add(host1)
         # host_inst1.cpu_type = 'X86KvmCPU'
@@ -158,4 +159,7 @@ for host_type in host_types:
         for s in sims:
             print(s)

-        experiments.append(simulation)
+        instance = inst.Instantiation(sim=simulation)
+        instance.preserve_tmp_folder = False
+        instance.create_checkpoint = True
+        instantiations.append(instance)
```
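The experiment script now exports a list of `Instantiation` objects instead of bare simulations, and opts each one into checkpointing. A minimal sketch of that pattern, assuming the `inst` module alias used above; `build_netperf_simulation()` is a hypothetical stand-in for the script's actual setup:

```python
# Illustrative sketch only; build_netperf_simulation() is not part of the repo.
instantiations: list[inst.Instantiation] = []

simulation = build_netperf_simulation()  # hypothetical helper for the host/NIC/switch setup

instance = inst.Instantiation(sim=simulation)
instance.preserve_tmp_folder = False   # temporary per-run files are deleted afterwards
instance.create_checkpoint = True      # first run boots slowly and writes a checkpoint
instantiations.append(instance)        # run_new.py expands this into checkpoint + restore runs
```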
experiments/run_new.py (+49 −81)

```diff
@@ -34,14 +34,14 @@ import signal
 import sys

-from simbricks.orchestration import exectools
-from simbricks.orchestration.experiment import experiment_environment
 from simbricks.orchestration.simulation import base as sim_base
-from simbricks.orchestration.simulation import output
+from simbricks.orchestration.simulation import output as sim_out
 from simbricks.orchestration.instantiation import base as inst_base
-from simbricks.orchestration.runtime_new import runs
 from simbricks.orchestration.runtime_new.runs import base as runs_base
+from simbricks.orchestration.runtime_new.runs import local as rt_local
 from simbricks.orchestration.runtime_new import command_executor
 from simbricks.orchestration.runtime_new import simulation_executor


 def parse_args() -> argparse.Namespace:
@@ -261,56 +261,14 @@ def warn_multi_exec(executors: list[command_executor.Executor]):
 def add_exp(
-    simulation: sim_base.Simulation,
-    rt: runs.base.Runtime,
-    run_number: int,
-    prereq: runs.base.Run | None,
-    create_cp: bool,
-    restore_cp: bool,
-    args: argparse.Namespace,
-):
-    outpath = f"{args.outdir}/{simulation.name}-{run_number}.json"
-    if os.path.exists(outpath) and not args.force:
-        print(f"skip {simulation.name} run {run_number}")
-        return None
-
-    workdir = f"{args.workdir}/{simulation.name}/{run_number}"
-    cpdir = f"{args.workdir}/{simulation.name}/0"
-    if args.shmdir is not None:
-        shmdir = f"{args.shmdir}/{simulation.name}/{run_number}"
-
-    shm_base = ""  # TODO
-    if args.shmdir is not None:
-        env.shm_base = os.path.abspath(shmdir)
-
-    # TODO: user can specify output base
-    output_base = ""
-    tmp_sim_files = ""  # TODO
-    inst_env = inst_base.InstantiationEnvironment(
-        repo_path=args.repo,
-        # workdir=workdir,
-        # cpdir=cpdir,
-        # create_cp=create_cp,
-        # restore_cp=restore_cp,
-        # shm_base=shm_base,
-        # output_base=output_base,
-        # tmp_simulation_files=tmp_sim_files,
-    )
-    inst_ = inst_base.Instantiation(sim=simulation, env=inst_env)
-    output_ = output.SimulationOutput(simulation)
-    run = runs.base.Run(
-        simulation=simulation, instantiation=inst_, prereq=prereq, output=output_
-    )
+    instantiation: inst_base.Instantiation,
+    prereq: runs_base.Run | None,
+    rt: runs_base.Runtime,
+) -> runs_base.Run:
+    output = sim_out.SimulationOutput(instantiation.simulation)
+    run = runs_base.Run(instantiation=instantiation, prereq=prereq, output=output)
     rt.add_run(run)
     return run
@@ -322,18 +280,18 @@ def main():
     executors = load_executors(args.hosts)

     # initialize runtime
-    if args.runtime == "parallel":
+    if args.runtime == "parallel":  # TODO: FIXME
         warn_multi_exec(executors)
-        rt = runs.LocalParallelRuntime(
+        rt = rt_local.LocalParallelRuntime(
             cores=args.cores, mem=args.mem, verbose=args.verbose, executor=executors[0]
         )
-    elif args.runtime == "slurm":
-        rt = runs.SlurmRuntime(args.slurmdir, args, verbose=args.verbose)
-    elif args.runtime == "dist":
-        rt = runs.DistributedSimpleRuntime(executors, verbose=args.verbose)
+    # elif args.runtime == "slurm":
+    #     rt = runs.SlurmRuntime(args.slurmdir, args, verbose=args.verbose)
+    # elif args.runtime == "dist":
+    #     rt = runs.DistributedSimpleRuntime(executors, verbose=args.verbose)
     else:
         warn_multi_exec(executors)
-        rt = runs.LocalSimpleRuntime(verbose=args.verbose, executor=executors[0])
+        rt = rt_local.LocalSimpleRuntime(verbose=args.verbose, executor=executors[0])

     if args.profile_int:
         rt.enable_profiler(args.profile_int)
@@ -341,7 +299,7 @@ def main():
     # load experiments
     if not args.pickled:
         # default: load python modules with experiments
-        simulations: list[sim_base.Simulation] = []
+        instantiations: list[inst_base.Instantiation] = []
         for path in args.experiments:
             modname, _ = os.path.splitext(os.path.basename(path))
@@ -355,23 +313,22 @@ def main():
             if spec.loader is None:
                 raise ExperimentModuleLoadError("spec.loader is None")
             spec.loader.exec_module(mod)
-            simulations += mod.experiments
+            instantiations += mod.instantiations

         if args.list:
-            for sim in simulations:
-                print(sim.name)
+            for inst in instantiations:
+                print(inst.simulation.name)
             sys.exit(0)

-        for sim in simulations:
-            # TODO: do we want a sitributed SImulation class? --> probably not, choose slightly different abstraction
-            if args.auto_dist and not isinstance(sim, sim_base.DistributedExperiment):
-                sim = runs.auto_dist(sim, executors, args.proxy_type)
+        for inst in instantiations:
+            # if args.auto_dist and not isinstance(sim, sim_base.DistributedExperiment):
+            #     sim = runs_base.auto_dist(sim, executors, args.proxy_type)

             # apply filter if any specified
             if (args.filter) and (len(args.filter) > 0):
                 match = False
                 for f in args.filter:
-                    match = fnmatch.fnmatch(sim.name, f)
+                    match = fnmatch.fnmatch(inst.simulation.name, f)
                     if match:
                         break
@@ -380,19 +337,30 @@ def main():
             # if this is an experiment with a checkpoint we might have to create
             # it
-            if sim.checkpoint:
-                prereq = add_exp(sim, rt, 0, None, True, False, args)
-            else:
-                prereq = None
-
-            for run in range(args.firstrun, args.firstrun + args.runs):
-                add_exp(sim, rt, run, prereq, False, sim.checkpoint, args)
-    else:
-        # otherwise load pickled run object
-        for path in args.experiments:
-            with open(path, "rb") as f:
-                rt.add_run(pickle.load(f))
+            # TODO: what to do / how to handel checkpointing
+            prereq = None
+            if inst.create_checkpoint and inst.simulation.any_supports_checkpointing():
+                checkpointing_inst = inst.copy()
+                checkpointing_inst.restore_checkpoint = False
+                checkpointing_inst.create_checkpoint = True
+                inst.create_checkpoint = False
+                inst.restore_checkpoint = True
+                prereq = add_exp(instantiation=checkpointing_inst, rt=rt, prereq=None)
+
+            for index in range(args.firstrun, args.firstrun + args.runs):
+                inst_copy = inst.copy()
+                inst_copy.preserve_tmp_folder = False
+                if index == args.firstrun + args.runs - 1:
+                    inst_copy._preserve_checkpoints = False
+                add_exp(instantiation=inst_copy, rt=rt, prereq=prereq)
+
+    # else:
+    #     # otherwise load pickled run object
+    #     for path in args.experiments:
+    #         with open(path, "rb") as f:
+    #             rt.add_run(pickle.load(f))

     # register interrupt handler
     signal.signal(signal.SIGINT, lambda *_: rt.interrupt())
```
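The rewritten `main()` turns each instantiation that asks for a checkpoint into two stages: one prerequisite run that boots the simulation and writes the checkpoint, and N measured runs that restore from it. A condensed sketch of that ordering, using toy stand-in classes rather than the real orchestration API:

```python
# Toy sketch of the checkpoint-then-restore scheduling idea (not the SimBricks API).
from dataclasses import dataclass, replace


@dataclass
class Inst:
    name: str
    create_checkpoint: bool = False
    restore_checkpoint: bool = False


@dataclass
class Run:
    inst: Inst
    prereq: "Run | None" = None


def schedule(inst: Inst, n_runs: int) -> list[Run]:
    runs: list[Run] = []
    prereq = None
    if inst.create_checkpoint:
        # stage 1: boot once (e.g. with a fast CPU model) and write the checkpoint
        cp_inst = replace(inst, create_checkpoint=True, restore_checkpoint=False)
        prereq = Run(cp_inst)
        runs.append(prereq)
        # measured runs only restore; they never re-create the checkpoint
        inst = replace(inst, create_checkpoint=False, restore_checkpoint=True)
    # stage 2: the actual measurement runs, all depending on the checkpoint run
    runs += [Run(replace(inst), prereq=prereq) for _ in range(n_runs)]
    return runs


print([(r.inst.create_checkpoint, r.inst.restore_checkpoint)
       for r in schedule(Inst("netperf", create_checkpoint=True), 3)])
```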
experiments/simbricks/orchestration/instantiation/__init__.py (+2 −0)

```diff
@@ -19,3 +19,5 @@
 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+from simbricks.orchestration.instantiation.base import *
```
experiments/simbricks/orchestration/instantiation/base.py (+108 −130)

```diff
@@ -22,7 +22,6 @@
 from __future__ import annotations
-import asyncio
 import enum
 import pathlib
 import shutil
@@ -32,7 +31,6 @@ from simbricks.orchestration.system import base as sys_base
 from simbricks.orchestration.system import pcie as sys_pcie
 from simbricks.orchestration.system import mem as sys_mem
 from simbricks.orchestration.system import eth as sys_eth
 from simbricks.orchestration.system.host import base as sys_host
 from simbricks.orchestration.system.host import disk_images
 from simbricks.orchestration.runtime_new import command_executor
@@ -57,57 +55,23 @@ class InstantiationEnvironment(util_base.IdObj):
     def __init__(
         self,
-        repo_path: str = pathlib.Path(__file__).parents[3].resolve(),
+        repo_path: str = pathlib.Path(__file__).parents[4].resolve(),
         workdir: str | None = None,
         output_base: str | None = None,
         cpdir: str | None = None,
-        create_cp: bool = False,
-        restore_cp: bool = False,
         shm_base: str | None = None,
         tmp_simulation_files: str | None = None,
-        qemu_img_path: str | None = None,
-        qemu_path: str | None = None,
     ):
         super().__init__()
         self._repodir: str = pathlib.Path(repo_path).resolve()
         self._workdir: str = (
             workdir if workdir else pathlib.Path(f"{self._repodir}/wrkdir").resolve()
         )
-        self._output_base: str = (
-            output_base if output_base else pathlib.Path(f"{self._workdir}/output").resolve()
-        )
-        self._cpdir: str = (
-            cpdir if cpdir else pathlib.Path(f"{self._output_base}/checkpoints").resolve()
-        )
-        self._shm_base: str = (
-            shm_base if shm_base else pathlib.Path(f"{self._workdir}/shm").resolve()
-        )
-        self._tmp_simulation_files: str = (
-            tmp_simulation_files
-            if tmp_simulation_files
-            else (pathlib.Path(f"{self._workdir}/tmp").resolve())
-        )
-        self._create_cp: bool = create_cp
-        self._restore_cp: bool = restore_cp
-        self._qemu_img_path: str = (
-            qemu_img_path
-            if qemu_img_path
-            else pathlib.Path(f"{self._repodir}/sims/external/qemu/build/qemu-img").resolve()
-        )
-        self._qemu_path: str = (
-            qemu_path
-            if qemu_path
-            else pathlib.Path(
-                f"{self._repodir}/sims/external/qemu/build/x86_64-softmmu/qemu-system-x86_64"
-            ).resolve()
-        )
+        self._output_base: str = pathlib.Path(f"{self._workdir}/output").resolve()
+        self._tmp_simulation_files: str = pathlib.Path(f"{self._workdir}/tmp").resolve()
+        self._imgdir: str = pathlib.Path(f"{self._tmp_simulation_files}/imgs").resolve()
+        self._cpdir: str = pathlib.Path(f"{self._tmp_simulation_files}/checkpoints").resolve()
+        self._shm_base: str = pathlib.Path(f"{self._tmp_simulation_files}/shm").resolve()


 class Instantiation(util_base.IdObj):
@@ -118,13 +82,14 @@ class Instantiation(util_base.IdObj):
         env: InstantiationEnvironment = InstantiationEnvironment(),
     ):
         super().__init__()
-        self._simulation: sim_base.Simulation = sim
-        self._env: InstantiationEnvironment = env
+        self.simulation: sim_base.Simulation = sim
+        self.env: InstantiationEnvironment = env
         self._executor: command_executor.Executor | None = None
+        self._create_checkpoint: bool = False
+        self._restore_checkpoint: bool = False
+        self._preserve_checkpoints: bool = True
+        self.preserve_tmp_folder: bool = False
         self._socket_per_interface: dict[sys_base.Interface, Socket] = {}
         self._simulation_topo: (
             dict[sys_base.Interface, set[sys_base.Interface]] | None
         ) = None
         self._sim_dependency: (
             dict[sim_base.Simulator, set[sim_base.Simulator]] | None
         ) = None
@@ -144,12 +109,6 @@ class Instantiation(util_base.IdObj):
     def executor(self, executor: command_executor.Executor):
         self._executor = executor

-    def qemu_img_path(self) -> str:
-        return self._env._qemu_img_path
-
-    def qemu_path(self) -> str:
-        return self._env._qemu_path
-
     def _get_opposing_interface(self, interface: sys_base.Interface) -> sys_base.Interface:
@@ -203,7 +162,7 @@ class Instantiation(util_base.IdObj):
             raise Exception("cannot create socket path for given interface type")
         assert queue_type is not None

-        print(f"_interface_to_sock_path: self._env._shm_base={self.shm_base_dir()}")
+        print(f"_interface_to_sock_path: self.env._shm_base={self.shm_base_dir()}")
         return self._join_paths(
             base=self.shm_base_dir(),
             relative_path=f"{queue_type}-{queue_ident}",
@@ -311,7 +270,7 @@ class Instantiation(util_base.IdObj):
                 self._get_socket(interface=sim_b, socket_type=SockType.LISTEN)

         # build dependency graph
-        for sim in self._simulation.all_simulators():
+        for sim in self.simulation.all_simulators():
             for comp in sim._components:
                 for sim_inf in comp.interfaces():
                     if self._opposing_interface_within_same_sim(interface=sim_inf):
@@ -324,21 +283,10 @@ class Instantiation(util_base.IdObj):
     def sim_dependencies(self) -> dict[sim_base.Simulator, set[sim_base.Simulator]]:
         if self._sim_dependency is not None:
             return self._sim_dependency
         self._build_simulation_topology()
         assert self._sim_dependency is not None
         return self._sim_dependency

-    async def cleanup_sockets(
-        self,
-        sockets: list[Socket] = [],
-    ) -> None:
-        scs = []
-        for sock in sockets:
-            scs.append(asyncio.create_task(self.executor.rmtree(path=sock._path)))
-        if len(scs) > 0:
-            await asyncio.gather(*scs)
-
     async def wait_for_sockets(
         self,
         sockets: list[Socket] = [],
@@ -346,68 +294,92 @@ class Instantiation(util_base.IdObj):
         wait_socks = list(map(lambda sock: sock._path, sockets))
         await self.executor.await_files(wait_socks, verbose=True)

     # TODO: add more methods constructing paths as required by methods in simulators or image handling classes
-    # TODO: fix paths to support mutliple exeriment runs etc.
-    def wrkdir(self) -> str:
-        return pathlib.Path(self._env._workdir).resolve()
-
-    def shm_base_dir(self) -> str:
-        return pathlib.Path(self._env._shm_base).resolve()
-
-    def create_cp(self) -> bool:
-        return self._env._create_cp
-
-    def restore_cp(self) -> bool:
-        return self._env._restore_cp
-
-    def cpdir(self) -> str:
-        return pathlib.Path(self._env._cpdir).resolve()
-
-    def tmp_dir(self) -> str:
-        return pathlib.Path(self._env._tmp_simulation_files).resolve()
-
-    async def prepare(self) -> None:
-        wrkdir = self.wrkdir()
-        print(f"wrkdir={wrkdir}")
-        shutil.rmtree(wrkdir, ignore_errors=True)
-        await self.executor.rmtree(wrkdir)
-        shm_base = self.shm_base_dir()
-        print(f"shm_base={shm_base}")
-        shutil.rmtree(shm_base, ignore_errors=True)
-        await self.executor.rmtree(shm_base)
-        cpdir = self.cpdir()
-        print(f"cpdir={cpdir}")
-        if self.create_cp():
-            shutil.rmtree(cpdir, ignore_errors=True)
-            await self.executor.rmtree(cpdir)
-        tmpdir = self.tmp_dir()
-        print(f"tmpdir={tmpdir}")
-        shutil.rmtree(tmpdir, ignore_errors=True)
-        await self.executor.rmtree(tmpdir)
-        pathlib.Path(wrkdir).mkdir(parents=True, exist_ok=True)
-        await self.executor.mkdir(wrkdir)
-        pathlib.Path(cpdir).mkdir(parents=True, exist_ok=True)
-        await self.executor.mkdir(cpdir)
-        pathlib.Path(shm_base).mkdir(parents=True, exist_ok=True)
-        await self.executor.mkdir(shm_base)
-        pathlib.Path(tmpdir).mkdir(parents=True, exist_ok=True)
-        await self.executor.mkdir(tmpdir)
-        await self._simulation.prepare(inst=self)
-
-    async def cleanup(self) -> None:
-        pass  # TODO: implement cleanup functionality (e.g. delete )
+    @property
+    def create_checkpoint(self) -> bool:
+        """
+        Whether to use checkpoint and restore for simulators.
+
+        The most common use-case for this is accelerating host simulator startup
+        by first running in a less accurate mode, then checkpointing the system
+        state after boot and running simulations from there.
+        """
+        assert (self._create_checkpoint ^ self._restore_checkpoint) or (
+            not self._create_checkpoint and not self._restore_checkpoint
+        )
+        return self._create_checkpoint
+
+    @create_checkpoint.setter
+    def create_checkpoint(self, create_checkpoint: bool) -> None:
+        assert (self._create_checkpoint ^ self._restore_checkpoint) or (
+            not self._create_checkpoint and not self._restore_checkpoint
+        )
+        self._create_checkpoint = create_checkpoint
+
+    @property
+    def restore_checkpoint(self) -> bool:
+        assert (self._create_checkpoint ^ self._restore_checkpoint) or (
+            not self._create_checkpoint and not self._restore_checkpoint
+        )
+        return self._restore_checkpoint
+
+    @restore_checkpoint.setter
+    def restore_checkpoint(self, restore_checkpoint: bool) -> None:
+        assert (self._create_checkpoint ^ self._restore_checkpoint) or (
+            not self._create_checkpoint and not self._restore_checkpoint
+        )
+        self._restore_checkpoint = restore_checkpoint
+
+    def copy(self) -> Instantiation:
+        copy = Instantiation(sim=self.simulation, env=self.env)
+        return copy
+
+    def out_base_dir(self) -> str:
+        return pathlib.Path(
+            f"{self.env._output_base}/{self.simulation.name}"  # /{self.run._run_nr}"
+        ).resolve()
+
+    def shm_base_dir(self) -> str:
+        return pathlib.Path(
+            f"{self.env._shm_base}/{self.simulation.name}"  # /{self.run._run_nr}"
+        ).resolve()
+
+    def imgs_dir(self) -> str:
+        return pathlib.Path(
+            f"{self.env._imgdir}/{self.simulation.name}"  # /{self.run._run_nr}"
+        ).resolve()
+
+    def cpdir(self) -> str:
+        return pathlib.Path(
+            f"{self.env._cpdir}/{self.simulation.name}"  # /{self.run._run_nr}"
+        ).resolve()
+
+    def wrkdir(self) -> str:
+        return pathlib.Path(
+            f"{self.env._workdir}/{self.simulation.name}"  # /{self.run._run_nr}"
+        ).resolve()
+
+    async def prepare(self) -> None:
+        to_prepare = [self.shm_base_dir(), self.imgs_dir()]
+        if not self.create_checkpoint and not self.restore_checkpoint:
+            to_prepare.append(self.cpdir())
+        for tp in to_prepare:
+            shutil.rmtree(tp, ignore_errors=True)
+            await self.executor.rmtree(tp)
+            pathlib.Path(tp).mkdir(parents=True, exist_ok=True)
+            await self.executor.mkdir(tp)
+        await self.simulation.prepare(inst=self)
+
+    async def cleanup(self) -> None:
+        if self.preserve_tmp_folder:
+            return
+        to_delete = [self.shm_base_dir(), self.imgs_dir()]
+        if not self._preserve_checkpoints:
+            to_delete.append(self.cpdir())
+        for td in to_delete:
+            shutil.rmtree(td, ignore_errors=True)
+            await self.executor.rmtree(td)

     def _join_paths(
         self, base: str = "", relative_path: str = "", enforce_existence=False
@@ -424,12 +396,12 @@ class Instantiation(util_base.IdObj):
     def join_repo_base(self, relative_path: str) -> str:
         return self._join_paths(
-            base=self._env._repodir, relative_path=relative_path, enforce_existence=True
+            base=self.env._repodir, relative_path=relative_path, enforce_existence=True
         )

     def join_output_base(self, relative_path: str) -> str:
         return self._join_paths(
-            base=self._env._output_base,
+            base=self.out_base_dir(),
             relative_path=relative_path,
             enforce_existence=True,
         )
@@ -438,7 +410,7 @@ class Instantiation(util_base.IdObj):
         if Instantiation.is_absolute_exists(hd_name_or_path):
             return hd_name_or_path
         path = self._join_paths(
-            base=self._env._repodir,
+            base=self.env._repodir,
             relative_path=f"images/output-{hd_name_or_path}/{hd_name_or_path}",
             enforce_existence=True,
         )
@@ -447,23 +419,29 @@ class Instantiation(util_base.IdObj):
     def cfgtar_path(self, sim: sim_base.Simulator) -> str:
         return f"{self.wrkdir()}/cfg.{sim.name}.tar"

-    def join_tmp_base(self, relative_path: str) -> str:
-        return self._join_paths(
-            base=self.tmp_dir(),
-            relative_path=relative_path,
-        )
+    # def join_tmp_base(self, relative_path: str) -> str:
+    #     return self._join_paths(
+    #         base=self.tmp_dir(),
+    #         relative_path=relative_path,
+    #     )
+
+    def join_imgs_path(self, relative_path: str) -> str:
+        return self._join_paths(
+            base=self.imgs_dir(),
+            relative_path=relative_path,
+        )

     def dynamic_img_path(self, img: disk_images.DiskImage, format: str) -> str:
         filename = f"{img._id}.{format}"
         return self._join_paths(
-            base=self.tmp_dir(),
+            base=self.imgs_dir(),
             relative_path=filename,
         )

     def hdcopy_path(self, img: disk_images.DiskImage, format: str) -> str:
         filename = f"{img._id}_hdcopy.{format}"
         return self._join_paths(
-            base=self.tmp_dir(),
+            base=self.imgs_dir(),
             relative_path=filename,
         )
@@ -475,7 +453,7 @@ class Instantiation(util_base.IdObj):
     def get_simmulator_output_dir(self, sim: sim_base.Simulator) -> str:
         dir_path = f"output.{sim.full_name()}-{sim._id}"
-        return self._join_paths(base=self._env._output_base, relative_path=dir_path)
+        return self._join_paths(base=self.out_base_dir(), relative_path=dir_path)

     def get_simulator_shm_pool_path(self, sim: sim_base.Simulator) -> str:
         return self._join_paths(
@@ -483,10 +461,10 @@ class Instantiation(util_base.IdObj):
             relative_path=f"{sim.full_name()}-shm-pool-{sim._id}",
         )

-    def get_simulation_output_path(self, run_number: int) -> str:
+    def get_simulation_output_path(self, run_nr: int) -> str:
         return self._join_paths(
-            base=self._env._output_base,
-            relative_path=f"out-{run_number}.json",
+            base=self.out_base_dir(),
+            relative_path=f"{run_nr}/out.json",
         )

     def find_sim_by_interface(
@@ -496,4 +474,4 @@ class Instantiation(util_base.IdObj):
     def find_sim_by_spec(self, spec: sys_base.Component) -> sim_base.Simulator:
         util_base.has_expected_type(spec, sys_base.Component)
-        return self._simulation.find_sim(spec)
+        return self.simulation.find_sim(spec)
```
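The new `create_checkpoint`/`restore_checkpoint` properties enforce that a single instantiation never both creates and restores a checkpoint. A small standalone sketch of the same invariant, outside the orchestration classes:

```python
# Minimal sketch of the create/restore exclusivity invariant (not the real Instantiation class).
class CheckpointFlags:
    def __init__(self) -> None:
        self._create = False
        self._restore = False

    def _check(self) -> None:
        # either exactly one flag is set, or neither is
        assert (self._create ^ self._restore) or (not self._create and not self._restore)

    @property
    def create_checkpoint(self) -> bool:
        self._check()
        return self._create

    @create_checkpoint.setter
    def create_checkpoint(self, value: bool) -> None:
        self._check()
        self._create = value

    @property
    def restore_checkpoint(self) -> bool:
        self._check()
        return self._restore

    @restore_checkpoint.setter
    def restore_checkpoint(self, value: bool) -> None:
        self._check()
        self._restore = value


flags = CheckpointFlags()
flags.create_checkpoint = True   # checkpoint-producing run
print(flags.create_checkpoint, flags.restore_checkpoint)
```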
experiments/simbricks/orchestration/runtime/local.py (+141 −141)

```diff
@@ -85,144 +85,144 @@ class LocalSimpleRuntime(Runtime):
         self._running.cancel()
```

In this hunk the legacy `LocalParallelRuntime` class is disabled: its entire body (≈140 lines) is removed and re-added verbatim behind `#` line comments. The affected class, shown once without the comment prefixes:

```python
class LocalParallelRuntime(Runtime):
    """Execute runs locally in parallel on multiple cores."""

    def __init__(
        self,
        cores: int,
        mem: tp.Optional[int] = None,
        verbose=False,
        executor: exectools.Executor = exectools.LocalExecutor()
    ):
        super().__init__()
        self.runs_noprereq: tp.List[Run] = []
        """Runs with no prerequesite runs."""
        self.runs_prereq: tp.List[Run] = []
        """Runs with prerequesite runs."""
        self.complete: tp.Set[Run] = set()
        self.cores = cores
        self.mem = mem
        self.verbose = verbose
        self.executor = executor
        self._pending_jobs: tp.Set[asyncio.Task] = set()
        self._starter_task: asyncio.Task

    def add_run(self, run: Run) -> None:
        if run.experiment.resreq_cores() > self.cores:
            raise RuntimeError('Not enough cores available for run')
        if self.mem is not None and run.experiment.resreq_mem() > self.mem:
            raise RuntimeError('Not enough memory available for run')
        if run.prereq is None:
            self.runs_noprereq.append(run)
        else:
            self.runs_prereq.append(run)

    async def do_run(self, run: Run) -> tp.Optional[Run]:
        """Actually executes `run`."""
        try:
            runner = ExperimentSimpleRunner(
                self.executor, run.experiment, run.env, self.verbose
            )
            if self.profile_int:
                runner.profile_int = self.profile_int
            await run.prep_dirs(executor=self.executor)
            await runner.prepare()
        except asyncio.CancelledError:
            # it is safe to just exit here because we are not running any
            # simulators yet
            return None

        print('starting run ', run.name())
        run.output = await runner.run()  # already handles CancelledError

        # if the log is huge, this step takes some time
        if self.verbose:
            print(f'Writing collected output of run {run.name()} to JSON file ...')
        run.output.dump(run.outpath)

        print('finished run ', run.name())
        return run

    async def wait_completion(self) -> None:
        """Wait for any run to terminate and return."""
        assert self._pending_jobs
        done, self._pending_jobs = await asyncio.wait(
            self._pending_jobs, return_when=asyncio.FIRST_COMPLETED
        )
        for run in done:
            run = await run
            self.complete.add(run)
            self.cores_used -= run.experiment.resreq_cores()
            self.mem_used -= run.experiment.resreq_mem()

    def enough_resources(self, run: Run) -> bool:
        """Check if enough cores and mem are available for the run."""
        exp = run.experiment  # pylint: disable=redefined-outer-name
        if self.cores is not None:
            enough_cores = (self.cores - self.cores_used) >= exp.resreq_cores()
        else:
            enough_cores = True
        if self.mem is not None:
            enough_mem = (self.mem - self.mem_used) >= exp.resreq_mem()
        else:
            enough_mem = True
        return enough_cores and enough_mem

    def prereq_ready(self, run: Run) -> bool:
        """Check if the prerequesite run for `run` has completed."""
        if run.prereq is None:
            return True
        return run.prereq in self.complete

    async def do_start(self) -> None:
        """Asynchronously execute the runs defined in `self.runs_noprereq +
        self.runs_prereq."""
        # self.completions = asyncio.Queue()
        self.cores_used = 0
        self.mem_used = 0

        runs = self.runs_noprereq + self.runs_prereq
        for run in runs:
            # if necessary, wait for enough memory or cores
            while not self.enough_resources(run):
                print('waiting for resources')
                await self.wait_completion()
            # if necessary, wait for prerequesite runs to complete
            while not self.prereq_ready(run):
                print('waiting for prereq')
                await self.wait_completion()

            self.cores_used += run.experiment.resreq_cores()
            self.mem_used += run.experiment.resreq_mem()

            job = asyncio.create_task(self.do_run(run))
            self._pending_jobs.add(job)

        # wait for all runs to finish
        await asyncio.gather(*self._pending_jobs)

    async def start(self) -> None:
        """Execute all defined runs."""
        self._starter_task = asyncio.create_task(self.do_start())
        try:
            await self._starter_task
        except asyncio.CancelledError:
            for job in self._pending_jobs:
                job.cancel()
            # wait for all runs to finish
            await asyncio.gather(*self._pending_jobs)

    def interrupt_handler(self) -> None:
        self._starter_task.cancel()
```
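The parallel runtime above admits a run only once enough cores and memory are free, and it reclaims resources whenever any pending job completes (`asyncio.wait` with `FIRST_COMPLETED`). A condensed, self-contained sketch of that admission loop, assuming toy core costs instead of the real `resreq_*` accounting:

```python
import asyncio


# Toy sketch of resource-gated scheduling with asyncio (not the SimBricks API).
async def run_job(name: str, seconds: float) -> str:
    await asyncio.sleep(seconds)
    return name


async def schedule(jobs: list[tuple[str, float, int]], total_cores: int) -> None:
    pending: set[asyncio.Task] = set()
    used = 0
    cost = {name: cores for name, _, cores in jobs}

    async def reap() -> None:
        # wait for any job to finish and give its cores back
        nonlocal pending, used
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            used -= cost[task.result()]

    for name, seconds, cores in jobs:
        while used + cores > total_cores:   # wait until enough cores are free
            await reap()
        used += cores
        pending.add(asyncio.create_task(run_job(name, seconds)))
    while pending:                          # drain remaining jobs
        await reap()


asyncio.run(schedule([("a", 0.1, 2), ("b", 0.1, 2), ("c", 0.1, 2)], total_cores=4))
```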
experiments/simbricks/orchestration/runtime_new/runs/base.py (+4 −6)

```diff
@@ -27,9 +27,7 @@ import itertools
 import abc

 from simbricks.orchestration.simulation import output
-from simbricks.orchestration.simulation import base as sim_base
 from simbricks.orchestration.instantiation import base as inst_base
-from simbricks.orchestration.runtime_new import command_executor


 class Run:
@@ -39,22 +37,22 @@ class Run:
     def __init__(
         self,
-        simulation: sim_base.Simulation,
         instantiation: inst_base.Instantiation,
         prereq: Run | None = None,
         output: output.SimulationOutput | None = None,
         job_id: int | None = None,
-        cp: bool = False,
     ):
-        self._simulation: sim_base.Simulation = simulation
+        self.instantiation: inst_base.Instantiation = instantiation
         self._run_nr = next(self.__run_nr)
-        self._instantiation: inst_base.Instantiation = instantiation
         self._output: output.SimulationOutput | None = output
         self._prereq: Run | None = prereq
         self._job_id: int | None = job_id
-        self.checkpoint: bool = cp
         """Slurm job id."""

     def name(self) -> str:
-        return self._simulation.name + "." + str(self._run_nr)
+        return self.instantiation.simulation.name + "." + str(self._run_nr)


 class Runtime(metaclass=abc.ABCMeta):
```
experiments/simbricks/orchestration/runtime_new/runs/local.py (+26 −15)

```diff
@@ -27,6 +27,7 @@ import asyncio
 from simbricks.orchestration.runtime_new import simulation_executor
 from simbricks.orchestration.runtime_new import command_executor
 from simbricks.orchestration.runtime_new.runs import base as run_base
+from simbricks.orchestration.instantiation import base as inst_base


 class LocalSimpleRuntime(run_base.Runtime):
@@ -50,8 +51,8 @@ class LocalSimpleRuntime(run_base.Runtime):
     async def do_run(self, run: run_base.Run) -> None:
         """Actually executes `run`."""
         try:
-            runner = simulation_executor.ExperimentSimpleRunner(
-                self._executor, run._simulation, run._instantiation, self._verbose
+            runner = simulation_executor.SimulationSimpleRunner(
+                self._executor, run.instantiation, self._verbose
             )
             if self._profile_int:
                 runner.profile_int = self.profile_int
@@ -68,11 +69,13 @@ class LocalSimpleRuntime(run_base.Runtime):
         if self._verbose:
             print(f"Writing collected output of run {run.name()} to JSON file ...")
-        output_path = run._instantiation.get_simulation_output_path(
-            run_number=run._run_nr
+        output_path = run.instantiation.get_simulation_output_path(
+            run_nr=run._run_nr
         )
         run._output.dump(outpath=output_path)
+        await runner.cleanup()

     async def start(self) -> None:
         """Execute the runs defined in `self.runnable`."""
         for run in self._runnable:
@@ -112,10 +115,13 @@ class LocalParallelRuntime(run_base.Runtime):
         self._starter_task: asyncio.Task

     def add_run(self, run: run_base.Run) -> None:
-        if run._simulation.resreq_cores() > self._cores:
+        if run.instantiation.simulation.resreq_cores() > self._cores:
             raise RuntimeError("Not enough cores available for run")

-        if self._mem is not None and run._simulation.resreq_mem() > self._mem:
+        if (
+            self._mem is not None
+            and run.instantiation.simulation.resreq_mem() > self._mem
+        ):
             raise RuntimeError("Not enough memory available for run")

         if run._prereq is None:
@@ -126,8 +132,8 @@ class LocalParallelRuntime(run_base.Runtime):
     async def do_run(self, run: run_base.Run) -> run_base.Run | None:
         """Actually executes `run`."""
         try:
-            runner = simulation_executor.ExperimentSimpleRunner(
-                self._executor, run._simulation, run._inst_env, self._verbose
+            runner = simulation_executor.SimulationSimpleRunner(
+                self._executor, run.instantiation, self._verbose
             )
             if self._profile_int is not None:
                 runner._profile_int = self._profile_int
@@ -141,13 +147,16 @@ class LocalParallelRuntime(run_base.Runtime):
         run._output = await runner.run()  # already handles CancelledError

         # if the log is huge, this step takes some time
-        if self.verbose:
+        if self._verbose:
             print(f"Writing collected output of run {run.name()} to JSON file ...")
-        output_path = run._instantiation.get_simulation_output_path(
+        output_path = run.instantiation.get_simulation_output_path(
             run_number=run._run_nr
         )
         run._output.dump(outpath=output_path)
+        await runner.cleanup()

         print("finished run ", run.name())
         return run
@@ -162,12 +171,14 @@ class LocalParallelRuntime(run_base.Runtime):
         for r_awaitable in done:
             run = await r_awaitable
             self._complete.add(run)
-            self._cores_used -= run._simulation.resreq_cores()
-            self._mem_used -= run._simulation.resreq_mem()
+            self._cores_used -= run.instantiation.simulation.resreq_cores()
+            self._mem_used -= run.instantiation.simulation.resreq_mem()

     def enough_resources(self, run: run_base.Run) -> bool:
         """Check if enough cores and mem are available for the run."""
-        simulation = run._simulation  # pylint: disable=redefined-outer-name
+        simulation = (
+            run.instantiation.simulation
+        )  # pylint: disable=redefined-outer-name
         if self._cores is not None:
             enough_cores = (self._cores - self._cores_used) >= simulation.resreq_cores()
@@ -207,8 +218,8 @@ class LocalParallelRuntime(run_base.Runtime):
             print("waiting for prereq")
             await self.wait_completion()

-        self._cores_used += run._simulation.resreq_cores()
-        self._mem_used += run._simulation.resreq_mem()
+        self._cores_used += run.instantiation.simulation.resreq_cores()
+        self._mem_used += run.instantiation.simulation.resreq_mem()

         job = asyncio.create_task(self.do_run(run))
         self._pending_jobs.add(job)
```
experiments/simbricks/orchestration/runtime_new/simulation_executor.py (+34 −36)

```diff
@@ -36,19 +36,17 @@ from simbricks.orchestration.instantiation import base as inst_base
 from simbricks.orchestration.runtime_new import command_executor


-class ExperimentBaseRunner(abc.ABC):
+class SimulationBaseRunner(abc.ABC):

     def __init__(
         self,
-        simulation: sim_base.Simulation,
         instantiation: inst_base.Instantiation,
         verbose: bool,
     ) -> None:
-        self._simulation: sim_base.Simulation = simulation
         self._instantiation: inst_base.Instantiation = instantiation
         self._verbose: bool = verbose
         self._profile_int: int | None = None
-        self._out = output.SimulationOutput(self._simulation)
+        self._out = output.SimulationOutput(self._instantiation.simulation)
         self._running: list[
             tuple[sim_base.Simulator, command_executor.SimpleComponent]
         ] = []
@@ -64,16 +62,16 @@ class ExperimentBaseRunner(abc.ABC):
         name = sim.full_name()

         if self._verbose:
-            print(f"{self._simulation.name}: starting {name}")
+            print(f"{self._instantiation.simulation.name}: starting {name}")

         run_cmd = sim.run_cmd(self._instantiation)
         if run_cmd is None:
             if self._verbose:
-                print(f"{self._simulation.name}: started dummy {name}")
+                print(f"{self._instantiation.simulation.name}: started dummy {name}")
             return

         # run simulator
-        executor = self._instantiation.executor
+        executor = self._instantiation.executor
+        # TODO: this should be a function or something
         sc = executor.create_component(
             name, shlex.split(run_cmd), verbose=self._verbose, canfail=True
         )
@@ -88,11 +86,11 @@ class ExperimentBaseRunner(abc.ABC):
         wait_socks = sim.sockets_wait(inst=self._instantiation)
         if len(wait_socks) > 0:
             if self._verbose:
-                print(f"{self._simulation.name}: waiting for sockets {name}")
+                print(f"{self._instantiation.simulation.name}: waiting for sockets {name}")

             await self._instantiation.wait_for_sockets(sockets=wait_socks)

             if self._verbose:
                 print(
-                    f"{self._simulation.name}: waited successfully for sockets {name}"
+                    f"{self._instantiation.simulation.name}: waited successfully for sockets {name}"
                 )

         # add time delay if required
@@ -104,7 +102,7 @@ class ExperimentBaseRunner(abc.ABC):
             self._wait_sims.append(sc)

         if self._verbose:
-            print(f"{self._simulation.name}: started {name}")
+            print(f"{self._instantiation.simulation.name}: started {name}")

     async def before_wait(self) -> None:
         pass
@@ -124,15 +122,15 @@ class ExperimentBaseRunner(abc.ABC):
     async def wait_for_sims(self) -> None:
         """Wait for simulators to terminate (the ones marked to wait on)."""
         if self._verbose:
-            print(f"{self._simulation.name}: waiting for hosts to terminate")
+            print(f"{self._instantiation.simulation.name}: waiting for hosts to terminate")
         for sc in self._wait_sims:
             await sc.wait()

-    async def terminate_collect_sims(self) -> output.SimulationOutput:
+    async def terminate_collect_sims(self) -> None:  # output.SimulationOutput:
         """Terminates all simulators and collects output."""
         self._out.set_end()
         if self._verbose:
-            print(f"{self._simulation.name}: cleaning up")
+            print(f"{self._instantiation.simulation.name}: cleaning up")

         await self.before_cleanup()
@@ -146,9 +144,6 @@ class ExperimentBaseRunner(abc.ABC):
         for _, sc in self._running:
             await sc.wait()

-        # remove all sockets
-        await self._instantiation.cleanup_sockets(sockets=self._sockets)
-
         # add all simulator components to the output
         for sim, sc in self._running:
             self._out.add_sim(sim, sc)
@@ -215,8 +210,11 @@ class ExperimentBaseRunner(abc.ABC):
                 print(e)
                 pass

+    async def cleanup(self) -> None:
+        await self._instantiation.cleanup()

-class ExperimentSimpleRunner(ExperimentBaseRunner):
+
+class SimulationSimpleRunner(SimulationBaseRunner):
     """Simple experiment runner with just one executor."""

     def __init__(self, executor: command_executor.Executor, *args, **kwargs) -> None:
@@ -227,27 +225,27 @@ class ExperimentSimpleRunner(ExperimentBaseRunner):
         return self._executor


-class ExperimentDistributedRunner(ExperimentBaseRunner):
-    """Simple experiment runner with just one executor."""
-
-    # TODO: FIXME
-    def __init__(self, execs, exp: DistributedExperiment, *args, **kwargs) -> None:
-        self.execs = execs
-        super().__init__(exp, *args, **kwargs)
-        self.exp = exp  # overrides the type in the base class
-        assert self.exp.num_hosts <= len(execs)
-
-    def sim_executor(self, sim) -> command_executor.Executor:
-        h_id = self.exp.host_mapping[sim]
-        return self.execs[h_id]
-
-    async def prepare(self) -> None:
-        # make sure all simulators are assigned to an executor
-        assert self.exp.all_sims_assigned()
-
-        # set IP addresses for proxies based on assigned executors
-        for p in itertools.chain(self.exp.proxies_listen, self.exp.proxies_connect):
-            executor = self.sim_executor(p)
-            p.ip = executor.ip
-
-        await super().prepare()
+# class ExperimentDistributedRunner(ExperimentBaseRunner):
+#     """Simple experiment runner with just one executor."""
+#     # TODO: FIXME
+#     def __init__(self, execs, exp: DistributedExperiment, *args, **kwargs) -> None:
+#         self.execs = execs
+#         super().__init__(exp, *args, **kwargs)
+#         self.exp = exp  # overrides the type in the base class
+#         assert self.exp.num_hosts <= len(execs)
+#
+#     def sim_executor(self, sim) -> command_executor.Executor:
+#         h_id = self.exp.host_mapping[sim]
+#         return self.execs[h_id]
+#
+#     async def prepare(self) -> None:
+#         # make sure all simulators are assigned to an executor
+#         assert self.exp.all_sims_assigned()
+#         # set IP addresses for proxies based on assigned executors
+#         for p in itertools.chain(self.exp.proxies_listen, self.exp.proxies_connect):
+#             executor = self.sim_executor(p)
+#             p.ip = executor.ip
+#         await super().prepare()
```
experiments/simbricks/orchestration/simulation/base.py (+7 −15)

```diff
@@ -287,14 +287,6 @@ class Simulation(utils_base.IdObj):
         """
         self.timeout: int | None = None
         """Timeout for experiment in seconds."""
-        self.checkpoint = False
-        """
-        Whether to use checkpoint and restore for simulators.
-
-        The most common use-case for this is accelerating host simulator startup
-        by first running in a less accurate mode, then checkpointing the system
-        state after boot and running simulations from there.
-        """
         self.metadata: dict[str, tp.Any] = {}
         self._sys_sim_map: dict[sys_conf.Component, Simulator] = {}
@@ -368,10 +360,10 @@ class Simulation(utils_base.IdObj):
             promises.append(sim.prepare(inst=inst))
         await asyncio.gather(*promises)

-    # TODO: FIXME
-    def enable_checkpointing_if_supported() -> None:
-        raise Exception("not implemented")
-
-    # TODO: FIXME
-    def is_checkpointing_enabled(self) -> bool:
-        raise Exception("not implemented")
+    def any_supports_checkpointing(self) -> bool:
+        if (
+            len(list(filter(lambda sim: sim.supports_checkpointing(), self._sim_list)))
+            > 0
+        ):
+            return True
+        return False
```
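`any_supports_checkpointing()` asks every simulator in the simulation whether it can checkpoint, so the orchestrator only schedules a checkpoint-creation run when at least one simulator can actually use it. The filter/len construction above can be expressed more directly with `any()`; a standalone sketch with toy simulator stand-ins:

```python
# Equivalent predicate using any(); the stub classes below are illustrative, not the real simulators.
class Gem5SimStub:
    def supports_checkpointing(self) -> bool:
        return True


class QemuSimStub:
    def supports_checkpointing(self) -> bool:
        return False


def any_supports_checkpointing(sim_list) -> bool:
    # True as soon as one simulator can create/restore checkpoints
    return any(sim.supports_checkpointing() for sim in sim_list)


print(any_supports_checkpointing([QemuSimStub(), Gem5SimStub()]))  # True
```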
experiments/simbricks/orchestration/simulation/host.py (+8 −5)

```diff
@@ -70,6 +70,9 @@ class Gem5Sim(HostSim):
         self._variant: str = "fast"
         self._sys_clock: str = "1GHz"

+    # TODO: move to system module
+    def supports_checkpointing(self) -> bool:
+        return True

     def resreq_cores(self) -> int:
         return 1
@@ -96,7 +99,7 @@ class Gem5Sim(HostSim):
     def run_cmd(self, inst: inst_base.Instantiation) -> str:
         cpu_type = self.cpu_type
-        if inst.create_cp():
+        if inst.create_checkpoint:
             cpu_type = self.cpu_type_cp

         full_sys_hosts = self.filter_components_by_type(ty=sys_host.FullSystemHost)
@@ -126,10 +129,10 @@ class Gem5Sim(HostSim):
         # if self.node_config.kcmd_append:
         #     cmd += f'--command-line-append="{self.node_config.kcmd_append}" '

-        if inst.create_cp():
+        if inst.create_checkpoint:
             cmd += "--max-checkpoints=1 "

-        if inst.restore_cp():
+        if inst.restore_checkpoint:
             cmd += "-r 1 "

         latency, sync_period, run_sync = (
@@ -153,7 +156,7 @@ class Gem5Sim(HostSim):
             f":latency={latency}ns"
             f":sync_interval={sync_period}ns"
         )
-        if run_sync:
+        if run_sync and not inst.create_checkpoint:
             cmd += ":sync"
         cmd += " "
@@ -173,7 +176,7 @@ class Gem5Sim(HostSim):
             f":latency={latency}ns"
             f":sync_interval={sync_period}ns"
         )
-        if run_sync:
+        if run_sync and not inst.create_checkpoint:
             cmd += ":sync"
         cmd += " "
```
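For gem5, the checkpoint mode changes both the CPU model and the command line: the checkpoint-creating run uses the faster boot CPU, caps itself at one checkpoint, and skips SimBricks synchronization, while the restore run resumes from checkpoint 1. A hedged sketch of how such a command string might be assembled; the `--max-checkpoints=1` and `-r 1` flags come from the diff above, everything else (binary name, config path, CPU names, socket path) is illustrative:

```python
# Illustrative command assembly for a gem5-style host simulator (not the real Gem5Sim.run_cmd).
def gem5_cmd(create_checkpoint: bool, restore_checkpoint: bool, run_sync: bool) -> str:
    assert not (create_checkpoint and restore_checkpoint)
    cpu_type = "X86KvmCPU" if create_checkpoint else "TimingSimpleCPU"  # assumed CPU names
    cmd = f"gem5.fast configs/simbricks/simbricks.py --cpu-type={cpu_type} "
    if create_checkpoint:
        cmd += "--max-checkpoints=1 "   # stop after writing the boot checkpoint
    if restore_checkpoint:
        cmd += "-r 1 "                  # restore from checkpoint number 1
    adapter = "--simbricks-eth=connect:/tmp/eth.sock:latency=500ns:sync_interval=500ns"
    if run_sync and not create_checkpoint:
        adapter += ":sync"              # synchronized mode only for measured runs
    return cmd + adapter


print(gem5_cmd(create_checkpoint=True, restore_checkpoint=False, run_sync=True))
```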
experiments/simbricks/orchestration/simulation/net/net_base.py (+0 −7)

```diff
@@ -138,7 +138,6 @@ class SwitchNet(NetSim):
         for sock in listen:
             cmd += " -h " + sock._path

-        print(f"SWITCH NET CMD!!! ===== {cmd}")
         return cmd
@@ -227,9 +226,6 @@ class NS3DumbbellNet(SimpleNS3Sim):
         cmd += f"--SimbricksPortRight={sock._path} "
         # TODO cmd += f"{self.opt}"

-        print(
-            f"!!!!!!!!!!!!!!!!!!!!!! NS3DumbbellNet run_cmd: {cmd} !!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
-        )
         return cmd
@@ -256,7 +252,4 @@ class NS3BridgeNet(SimpleNS3Sim):
         cmd += f"--SimbricksPort={sock._path} "
         # TODO cmd += f"{self.opt}"

-        print(
-            f"!!!!!!!!!!!!!!!!!!!!!! NS3BridgeNet run_cmd: {cmd} !!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
-        )
         return cmd
```
experiments/simbricks/orchestration/simulation/output.py (+7 −3)

```diff
@@ -20,12 +20,16 @@
 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

-from simbricks.orchestration.simulation import base as sim_base
-from simbricks.orchestration.runtime_new import command_executor
+from __future__ import annotations
+import json
 import time
 import pathlib
-import json
+import typing

+from simbricks.orchestration.runtime_new import command_executor
+
+if typing.TYPE_CHECKING:
+    from simbricks.orchestration.simulation import base as sim_base


 class SimulationOutput:
     """Manages an experiment's output."""
```
experiments/simbricks/orchestration/system/host/app.py (+1 −1)

```diff
@@ -43,7 +43,7 @@ class BaseLinuxApplication(abc.ABC):
         self.host = h
         self.start_delay: float | None = None
         self.end_delay: float | None = None
-        self.wait = True
+        self.wait = False

     @abc.abstractmethod
     def run_cmds(self, inst: inst_base.Instantiation) -> list[str]:
```
experiments/simbricks/orchestration/system/host/base.py (+2 −2)

```diff
@@ -138,8 +138,8 @@ class BaseLinuxHost(FullSystemHost):
         )

     def config_str(self, inst: instantiation.Instantiation) -> str:
-        if inst.create_cp():
-            sim = inst.find_sim_by_spec(spec=self)
+        sim = inst.find_sim_by_spec(spec=self)
+        if inst.create_checkpoint:
             cp_cmd = sim.checkpoint_commands()
         else:
             cp_cmd = []
```
experiments/simbricks/orchestration/system/host/disk_images.py (+1 −1)

```diff
@@ -50,7 +50,7 @@ class DiskImage(utils_base.IdObj):
     async def make_qcow_copy(self, inst: inst_base.Instantiation, format: str) -> str:
         disk_path = pathlib.Path(self.path(inst=inst, format=format))
-        copy_path = inst.join_tmp_base(relative_path=f"hdcopy.{self._id}")
+        copy_path = inst.join_imgs_path(relative_path=f"hdcopy.{self._id}")
         prep_cmds = [
             (
                 f"{inst.join_repo_base(relative_path=self._qemu_img_exec)} create -f qcow2 -o "
```
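`make_qcow_copy` now places per-run qcow2 copies under the new per-simulation images directory rather than the generic tmp directory, so each run works on a fresh, disposable overlay of the base disk image. A hedged sketch of the underlying qemu-img invocation; the paths are placeholders and the option spelling follows standard `qemu-img` usage, not necessarily the exact string the orchestration code builds:

```python
import subprocess


def make_qcow_overlay(base_image: str, overlay_path: str) -> None:
    # Create a copy-on-write overlay so the run never modifies the base image.
    cmd = [
        "qemu-img", "create", "-f", "qcow2",
        "-o", f"backing_file={base_image},backing_fmt=raw",
        overlay_path,
    ]
    subprocess.run(cmd, check=True)


# e.g. make_qcow_overlay("images/output-base/base", "wrkdir/tmp/imgs/netperf/hdcopy.0.qcow2")
```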