Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
6b0ca1cb
Unverified
Commit
6b0ca1cb
authored
Jun 02, 2021
by
Yifan Xiong
Committed by
GitHub
Jun 02, 2021
Browse files
Runner - Support local mode in runner (#88)
* Support local mode in runner.
parent
44c5103b
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
134 additions
and
49 deletions
+134
-49
setup.py
setup.py
+1
-0
superbench/config/default.yaml
superbench/config/default.yaml
+5
-0
superbench/runner/runner.py
superbench/runner/runner.py
+74
-28
tests/runner/test_runner.py
tests/runner/test_runner.py
+54
-21
No files found.
setup.py
View file @
6b0ca1cb
...
@@ -136,6 +136,7 @@ def run(self):
...
@@ -136,6 +136,7 @@ def run(self):
'ansible_base>=2.10.9;os_name=="posix"'
,
'ansible_base>=2.10.9;os_name=="posix"'
,
'ansible_runner>=1.4.7'
,
'ansible_runner>=1.4.7'
,
'colorlog>=4.7.2'
,
'colorlog>=4.7.2'
,
'joblib>=1.0.1'
,
'knack>=0.7.2'
,
'knack>=0.7.2'
,
'omegaconf>=2.0.6'
,
'omegaconf>=2.0.6'
,
],
],
...
...
superbench/config/default.yaml
View file @
6b0ca1cb
...
@@ -4,6 +4,11 @@ superbench:
...
@@ -4,6 +4,11 @@ superbench:
benchmarks
:
benchmarks
:
matmul
:
matmul
:
enable
:
true
enable
:
true
modes
:
-
name
:
local
proc_num
:
8
prefix
:
CUDA_VISIBLE_DEVICES={proc_rank}
parallel
:
no
frameworks
:
frameworks
:
-
pytorch
-
pytorch
parameters
:
parameters
:
...
...
superbench/runner/runner.py
View file @
6b0ca1cb
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
import
random
import
random
from
pathlib
import
Path
from
pathlib
import
Path
from
joblib
import
Parallel
,
delayed
from
omegaconf
import
ListConfig
,
OmegaConf
from
omegaconf
import
ListConfig
,
OmegaConf
from
superbench.common.utils
import
SuperBenchLogger
,
logger
from
superbench.common.utils
import
SuperBenchLogger
,
logger
...
@@ -34,6 +35,7 @@ def __init__(self, sb_config, docker_config, ansible_config, output_dir):
...
@@ -34,6 +35,7 @@ def __init__(self, sb_config, docker_config, ansible_config, output_dir):
logger
.
info
(
'Runner writes to: %s.'
,
self
.
_output_dir
)
logger
.
info
(
'Runner writes to: %s.'
,
self
.
_output_dir
)
self
.
_sb_benchmarks
=
self
.
_sb_config
.
superbench
.
benchmarks
self
.
_sb_benchmarks
=
self
.
_sb_config
.
superbench
.
benchmarks
self
.
__validate_sb_config
()
self
.
_sb_enabled_benchmarks
=
self
.
__get_enabled_benchmarks
()
self
.
_sb_enabled_benchmarks
=
self
.
__get_enabled_benchmarks
()
logger
.
info
(
'Runner will run: %s'
,
self
.
_sb_enabled_benchmarks
)
logger
.
info
(
'Runner will run: %s'
,
self
.
_sb_enabled_benchmarks
)
...
@@ -45,6 +47,26 @@ def __set_logger(self, filename):
...
@@ -45,6 +47,26 @@ def __set_logger(self, filename):
"""
"""
SuperBenchLogger
.
add_handler
(
logger
.
logger
,
filename
=
str
(
Path
(
self
.
_output_dir
)
/
filename
))
SuperBenchLogger
.
add_handler
(
logger
.
logger
,
filename
=
str
(
Path
(
self
.
_output_dir
)
/
filename
))
def
__validate_sb_config
(
self
):
"""Validate SuperBench config object.
Raise:
InvalidConfigError: If input config is invalid.
"""
# TODO: add validation and defaulting
for
name
in
self
.
_sb_benchmarks
:
if
not
self
.
_sb_benchmarks
[
name
].
modes
:
self
.
_sb_benchmarks
[
name
].
modes
=
[]
for
idx
,
mode
in
enumerate
(
self
.
_sb_benchmarks
[
name
].
modes
):
if
mode
.
name
==
'local'
:
if
not
mode
.
proc_num
:
self
.
_sb_benchmarks
[
name
].
modes
[
idx
].
proc_num
=
1
if
not
mode
.
prefix
:
self
.
_sb_benchmarks
[
name
].
modes
[
idx
].
prefix
=
''
elif
mode
.
name
==
'torch.distributed'
:
if
not
mode
.
proc_num
:
self
.
_sb_benchmarks
[
name
].
modes
[
idx
].
proc_num
=
8
def
__get_enabled_benchmarks
(
self
):
def
__get_enabled_benchmarks
(
self
):
"""Get enabled benchmarks list.
"""Get enabled benchmarks list.
...
@@ -58,29 +80,42 @@ def __get_enabled_benchmarks(self):
...
@@ -58,29 +80,42 @@ def __get_enabled_benchmarks(self):
return
list
(
self
.
_sb_config
.
superbench
.
enable
)
return
list
(
self
.
_sb_config
.
superbench
.
enable
)
return
[
k
for
k
,
v
in
self
.
_sb_benchmarks
.
items
()
if
v
.
enable
]
return
[
k
for
k
,
v
in
self
.
_sb_benchmarks
.
items
()
if
v
.
enable
]
def
__get_mode_command
(
self
,
mode
,
exec_command
):
def
__get_mode_command
(
self
,
benchmark_name
,
mode
):
"""Get runner command for given mode.
"""Get runner command for given mode.
Args:
Args:
benchmark_name (str): Benchmark name.
mode (DictConfig): Runner mode.
mode (DictConfig): Runner mode.
exec_command (str): Executor command.
Return:
Return:
str: Runner command.
str: Runner command.
"""
"""
if
mode
.
name
==
'torch.distributed'
:
exec_command
=
(
'sb exec -c sb.config.yaml -C superbench.enable={name}'
).
format
(
name
=
benchmark_name
)
mode_command
=
exec_command
if
mode
.
name
==
'local'
:
mode_command
=
'{prefix} {command}'
.
format
(
prefix
=
mode
.
prefix
.
format
(
proc_rank
=
mode
.
proc_rank
,
proc_num
=
mode
.
proc_num
),
command
=
exec_command
,
)
elif
mode
.
name
==
'torch.distributed'
:
# TODO: replace with torch.distributed.run in v1.9
# TODO: replace with torch.distributed.run in v1.9
# TODO: only supports node_num=1 and node_num=all currently
# TODO: only supports node_num=1 and node_num=all currently
return
(
mode_command
=
(
'python3 -m torch.distributed.launch '
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node={proc_num} '
'--use_env --no_python --nproc_per_node={proc_num} '
'--nnodes={node_num} --node_rank=$NODE_RANK '
'--nnodes={node_num} --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'{command}'
'{command} {torch_distributed_suffix}'
).
format
(
).
format
(
proc_num
=
mode
.
proc_num
or
8
,
node_num
=
1
if
mode
.
node_num
==
1
else
'$NNODES'
,
command
=
exec_command
proc_num
=
mode
.
proc_num
,
node_num
=
1
if
mode
.
node_num
==
1
else
'$NNODES'
,
command
=
exec_command
,
torch_distributed_suffix
=
(
'superbench.benchmarks.{name}.parameters.distributed_impl=ddp '
'superbench.benchmarks.{name}.parameters.distributed_backend=nccl'
).
format
(
name
=
benchmark_name
),
)
)
return
exec_command
return
mode_command
.
strip
()
def
deploy
(
self
):
# pragma: no cover
def
deploy
(
self
):
# pragma: no cover
"""Deploy SuperBench environment."""
"""Deploy SuperBench environment."""
...
@@ -109,32 +144,43 @@ def check_env(self): # pragma: no cover
...
@@ -109,32 +144,43 @@ def check_env(self): # pragma: no cover
self
.
_ansible_client
.
get_playbook_config
(
'check_env.yaml'
,
extravars
=
{
'output_dir'
:
self
.
_output_dir
})
self
.
_ansible_client
.
get_playbook_config
(
'check_env.yaml'
,
extravars
=
{
'output_dir'
:
self
.
_output_dir
})
)
)
def
_run_proc
(
self
,
benchmark_name
,
mode
,
vars
):
"""Run the process.
Args:
benchmark_name (str): Benchmark name.
mode (DictConfig): Runner mode.
vars (dict): Process variables.
Returns:
int: Process return code.
"""
mode
.
update
(
vars
)
logger
.
info
(
'Runner is going to run %s in %s mode, proc rank %d.'
,
benchmark_name
,
mode
.
name
,
mode
.
proc_rank
)
rc
=
self
.
_ansible_client
.
run
(
self
.
_ansible_client
.
get_shell_config
(
(
'docker exec sb-workspace bash -c '
'"set -o allexport && source sb.env && set +o allexport && {command}"'
).
format
(
command
=
self
.
__get_mode_command
(
benchmark_name
,
mode
),
)
),
sudo
=
True
)
return
rc
def
run
(
self
):
def
run
(
self
):
"""Run the SuperBench benchmarks distributedly."""
"""Run the SuperBench benchmarks distributedly."""
self
.
check_env
()
self
.
check_env
()
runner_command
=
(
'docker exec sb-workspace bash -c '
'"set -o allexport && source sb.env && set +o allexport && {}"'
)
for
benchmark_name
in
self
.
_sb_benchmarks
:
for
benchmark_name
in
self
.
_sb_benchmarks
:
if
benchmark_name
not
in
self
.
_sb_enabled_benchmarks
:
if
benchmark_name
not
in
self
.
_sb_enabled_benchmarks
:
continue
continue
benchmark_config
=
self
.
_sb_benchmarks
[
benchmark_name
]
benchmark_config
=
self
.
_sb_benchmarks
[
benchmark_name
]
for
mode
in
benchmark_config
.
modes
or
[]:
for
mode
in
benchmark_config
.
modes
:
if
mode
.
name
==
'torch.distributed'
:
if
mode
.
name
==
'local'
:
logger
.
info
(
'Runner is going to run %s.'
,
benchmark_name
)
Parallel
(
n_jobs
=
mode
.
proc_num
if
mode
.
parallel
else
1
)(
self
.
_ansible_client
.
run
(
delayed
(
self
.
_run_proc
)(
benchmark_name
,
mode
,
{
self
.
_ansible_client
.
get_shell_config
(
'proc_rank'
:
proc_rank
runner_command
.
format
(
})
for
proc_rank
in
range
(
mode
.
proc_num
)
self
.
__get_mode_command
(
mode
,
(
'sb exec -c sb.config.yaml -C '
'superbench.enable={name} '
'superbench.benchmarks.{name}.parameters.distributed_impl=ddp '
'superbench.benchmarks.{name}.parameters.distributed_backend=nccl'
).
format
(
name
=
benchmark_name
)
)
),
sudo
=
True
)
)
)
elif
mode
.
name
==
'torch.distributed'
:
self
.
_run_proc
(
benchmark_name
,
mode
,
{
'proc_rank'
:
0
})
tests/runner/test_runner.py
View file @
6b0ca1cb
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
import
shutil
import
shutil
import
tempfile
import
tempfile
from
pathlib
import
Path
from
pathlib
import
Path
from
unittest
import
mock
from
omegaconf
import
OmegaConf
from
omegaconf
import
OmegaConf
...
@@ -36,56 +37,78 @@ def test_get_mode_command(self):
...
@@ -36,56 +37,78 @@ def test_get_mode_command(self):
"""Test __get_mode_command."""
"""Test __get_mode_command."""
test_cases
=
[
test_cases
=
[
{
{
'benchmark_name'
:
'foo'
,
'mode'
:
{
'mode'
:
{
'name'
:
'non_exist'
,
'name'
:
'non_exist'
,
},
},
'exec_command'
:
'sb exec'
,
'expected_command'
:
'sb exec -c sb.config.yaml -C superbench.enable=foo'
,
'expected_command'
:
'sb exec'
,
},
},
{
{
'benchmark_name'
:
'foo'
,
'mode'
:
{
'mode'
:
{
'name'
:
'torch.distributed'
,
'name'
:
'local'
,
'proc_num'
:
1
,
'prefix'
:
''
,
},
},
'exec_command'
:
'expected_command'
:
'sb exec -c sb.config.yaml -C superbench.enable=foo'
,
'sb exec'
,
},
'expected_command'
:
(
{
'python3 -m torch.distributed.launch '
'benchmark_name'
:
'--use_env --no_python --nproc_per_node=8 '
'foo'
,
'--nnodes=$NNODES --node_rank=$NODE_RANK '
'mode'
:
{
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'name'
:
'local'
,
'sb exec'
'proc_num'
:
8
,
),
'proc_rank'
:
6
,
'prefix'
:
'CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))'
},
'expected_command'
:
(
'CUDA_VISIBLE_DEVICES=6 numactl -c $((6/2)) '
'sb exec -c sb.config.yaml -C superbench.enable=foo'
),
},
{
'benchmark_name'
:
'foo'
,
'mode'
:
{
'name'
:
'local'
,
'proc_num'
:
16
,
'proc_rank'
:
1
,
'prefix'
:
'RANK={proc_rank} NUM={proc_num}'
},
'expected_command'
:
'RANK=1 NUM=16 sb exec -c sb.config.yaml -C superbench.enable=foo'
,
},
},
{
{
'benchmark_name'
:
'foo'
,
'mode'
:
{
'mode'
:
{
'name'
:
'torch.distributed'
,
'name'
:
'torch.distributed'
,
'proc_num'
:
1
,
'proc_num'
:
1
,
'node_num'
:
'all'
,
'node_num'
:
'all'
,
},
},
'exec_command'
:
'sb exec'
,
'expected_command'
:
(
'expected_command'
:
(
'python3 -m torch.distributed.launch '
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node=1 '
'--use_env --no_python --nproc_per_node=1 '
'--nnodes=$NNODES --node_rank=$NODE_RANK '
'--nnodes=$NNODES --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'sb exec'
'sb exec -c sb.config.yaml -C superbench.enable=foo '
'superbench.benchmarks.foo.parameters.distributed_impl=ddp '
'superbench.benchmarks.foo.parameters.distributed_backend=nccl'
),
),
},
},
{
{
'benchmark_name'
:
'foo'
,
'mode'
:
{
'mode'
:
{
'name'
:
'torch.distributed'
,
'name'
:
'torch.distributed'
,
'proc_num'
:
8
,
'proc_num'
:
8
,
'node_num'
:
1
,
'node_num'
:
1
,
},
},
'exec_command'
:
'sb exec'
,
'expected_command'
:
(
'expected_command'
:
(
'python3 -m torch.distributed.launch '
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node=8 '
'--use_env --no_python --nproc_per_node=8 '
'--nnodes=1 --node_rank=$NODE_RANK '
'--nnodes=1 --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'sb exec'
'sb exec -c sb.config.yaml -C superbench.enable=foo '
'superbench.benchmarks.foo.parameters.distributed_impl=ddp '
'superbench.benchmarks.foo.parameters.distributed_backend=nccl'
),
),
},
},
]
]
...
@@ -93,11 +116,21 @@ def test_get_mode_command(self):
...
@@ -93,11 +116,21 @@ def test_get_mode_command(self):
with
self
.
subTest
(
msg
=
'Testing with case'
,
test_case
=
test_case
):
with
self
.
subTest
(
msg
=
'Testing with case'
,
test_case
=
test_case
):
self
.
assertEqual
(
self
.
assertEqual
(
self
.
runner
.
_SuperBenchRunner__get_mode_command
(
self
.
runner
.
_SuperBenchRunner__get_mode_command
(
OmegaConf
.
create
(
test_case
[
'mode'
])
,
test_case
[
'exec_command'
]
test_case
[
'benchmark_name'
],
OmegaConf
.
create
(
test_case
[
'mode'
])
),
test_case
[
'expected_command'
]
),
test_case
[
'expected_command'
]
)
)
def
test_run
(
self
):
def
test_run_empty_benchmarks
(
self
):
"""Test run."""
"""Test run
empty benchmarks, nothing should happen
."""
self
.
runner
.
_sb_enabled_benchmarks
=
[]
self
.
runner
.
_sb_enabled_benchmarks
=
[]
self
.
runner
.
run
()
self
.
runner
.
run
()
@
mock
.
patch
(
'superbench.runner.ansible.AnsibleClient.run'
)
def
test_run_default_benchmarks
(
self
,
mock_ansible_client_run
):
"""Test run default benchmarks, mock AnsibleClient.run function.
Args:
mock_ansible_client_run (function): Mocked AnsibleClient.run function.
"""
mock_ansible_client_run
.
return_value
=
0
self
.
runner
.
run
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment