Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ycai
simbricks
Commits
dd244343
Commit
dd244343
authored
Aug 06, 2021
by
Antoine Kaufmann
Browse files
experiments: add --auto-dist to distribute non-distributed experiments
parent
4b43be30
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
63 additions
and
3 deletions
+63
-3
experiments/run.py
experiments/run.py
+5
-1
experiments/simbricks/runtime/__init__.py
experiments/simbricks/runtime/__init__.py
+1
-1
experiments/simbricks/runtime/distributed.py
experiments/simbricks/runtime/distributed.py
+57
-1
No files found.
experiments/run.py
View file @
dd244343
...
@@ -90,7 +90,9 @@ g_dist = parser.add_argument_group('Distributed Runtime')
...
@@ -90,7 +90,9 @@ g_dist = parser.add_argument_group('Distributed Runtime')
g_par
.
add_argument
(
'--dist'
,
dest
=
'runtime'
,
action
=
'store_const'
,
g_par
.
add_argument
(
'--dist'
,
dest
=
'runtime'
,
action
=
'store_const'
,
const
=
'dist'
,
default
=
'sequential'
,
const
=
'dist'
,
default
=
'sequential'
,
help
=
'Use sequential distributed runtime instead of local'
)
help
=
'Use sequential distributed runtime instead of local'
)
g_par
.
add_argument
(
'--auto-dist'
,
action
=
'store_const'
,
const
=
True
,
default
=
False
,
help
=
'Automatically distribute non-distributed experiments'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
...
@@ -168,6 +170,8 @@ if not args.pickled:
...
@@ -168,6 +170,8 @@ if not args.pickled:
experiments
+=
mod
.
experiments
experiments
+=
mod
.
experiments
for
e
in
experiments
:
for
e
in
experiments
:
if
args
.
auto_dist
and
not
isinstance
(
e
,
exp
.
DistributedExperiment
):
e
=
runtime
.
auto_dist
(
e
,
executors
)
# apply filter if any specified
# apply filter if any specified
if
(
args
.
filter
)
and
(
len
(
args
.
filter
)
>
0
):
if
(
args
.
filter
)
and
(
len
(
args
.
filter
)
>
0
):
match
=
False
match
=
False
...
...
experiments/simbricks/runtime/__init__.py
View file @
dd244343
...
@@ -23,4 +23,4 @@
...
@@ -23,4 +23,4 @@
from
simbricks.runtime.common
import
(
Run
,
Runtime
)
from
simbricks.runtime.common
import
(
Run
,
Runtime
)
from
simbricks.runtime.local
import
(
LocalSimpleRuntime
,
LocalParallelRuntime
)
from
simbricks.runtime.local
import
(
LocalSimpleRuntime
,
LocalParallelRuntime
)
from
simbricks.runtime.slurm
import
SlurmRuntime
from
simbricks.runtime.slurm
import
SlurmRuntime
from
simbricks.runtime.distributed
import
DistributedSimpleRuntime
from
simbricks.runtime.distributed
import
(
DistributedSimpleRuntime
,
auto_dist
)
experiments/simbricks/runtime/distributed.py
View file @
dd244343
...
@@ -26,6 +26,7 @@ import pathlib
...
@@ -26,6 +26,7 @@ import pathlib
from
simbricks.runtime.common
import
*
from
simbricks.runtime.common
import
*
import
simbricks.experiments
as
exp
import
simbricks.experiments
as
exp
import
simbricks.exectools
as
exectools
import
simbricks.exectools
as
exectools
import
simbricks.proxy
as
proxy
class
DistributedSimpleRuntime
(
Runtime
):
class
DistributedSimpleRuntime
(
Runtime
):
def
__init__
(
self
,
execs
,
verbose
=
False
):
def
__init__
(
self
,
execs
,
verbose
=
False
):
...
@@ -35,6 +36,9 @@ class DistributedSimpleRuntime(Runtime):
...
@@ -35,6 +36,9 @@ class DistributedSimpleRuntime(Runtime):
self
.
execs
=
execs
self
.
execs
=
execs
def
add_run
(
self
,
run
):
def
add_run
(
self
,
run
):
if
not
isinstance
(
run
.
experiment
,
exp
.
DistributedExperiment
):
raise
RuntimeError
(
'Only distributed experiments supported'
)
self
.
runnable
.
append
(
run
)
self
.
runnable
.
append
(
run
)
async
def
do_run
(
self
,
run
):
async
def
do_run
(
self
,
run
):
...
@@ -52,4 +56,56 @@ class DistributedSimpleRuntime(Runtime):
...
@@ -52,4 +56,56 @@ class DistributedSimpleRuntime(Runtime):
def
start
(
self
):
def
start
(
self
):
for
run
in
self
.
runnable
:
for
run
in
self
.
runnable
:
asyncio
.
run
(
self
.
do_run
(
run
))
asyncio
.
run
(
self
.
do_run
(
run
))
\ No newline at end of file
def
auto_dist
(
e
,
execs
):
""" Converts an Experiment into a DistributedExperiment. Assigns network to
executor zero, and then round-robin assignment of hosts to executors,
while also assigning all nics for a host to the same executor.
"""
if
len
(
execs
)
<
2
:
raise
RuntimeError
(
'auto_dist needs at least two hosts'
)
elif
len
(
execs
)
>
2
:
print
(
'Warning: currently auto_dist only uses the first two hosts'
)
# Create the distributed experiment
de
=
exp
.
DistributedExperiment
(
e
.
name
,
2
)
de
.
timeout
=
e
.
timeout
de
.
checkpoint
=
e
.
checkpoint
de
.
no_simbricks
=
e
.
no_simbricks
de
.
metadata
=
e
.
metadata
.
copy
()
# create listening proxy on host 0
lp
=
proxy
.
RDMANetProxyListener
()
lp
.
name
=
'listener'
de
.
add_proxy
(
lp
)
de
.
assign_sim_host
(
lp
,
0
)
# assign networks to first host
for
net
in
e
.
networks
:
de
.
add_network
(
net
)
de
.
assign_sim_host
(
net
,
0
)
# create connecting proxy on host 1
cp
=
proxy
.
RDMANetProxyConnecter
(
lp
)
cp
.
name
=
'connecter'
de
.
add_proxy
(
cp
)
de
.
assign_sim_host
(
cp
,
1
)
# round-robin assignment for hosts
k
=
0
for
h
in
e
.
hosts
:
de
.
add_host
(
h
)
de
.
assign_sim_host
(
h
,
k
)
for
nic
in
h
.
nics
:
de
.
assign_sim_host
(
nic
,
k
)
if
k
!=
0
:
cp
.
add_nic
(
nic
)
k
=
(
k
+
1
)
%
2
for
nic
in
e
.
nics
:
de
.
add_nic
(
nic
)
return
de
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment