Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
6b0ca1cb
Unverified
Commit
6b0ca1cb
authored
Jun 02, 2021
by
Yifan Xiong
Committed by
GitHub
Jun 02, 2021
Browse files
Runner - Support local mode in runner (#88)
* Support local mode in runner.
parent
44c5103b
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
134 additions
and
49 deletions
+134
-49
setup.py
setup.py
+1
-0
superbench/config/default.yaml
superbench/config/default.yaml
+5
-0
superbench/runner/runner.py
superbench/runner/runner.py
+74
-28
tests/runner/test_runner.py
tests/runner/test_runner.py
+54
-21
No files found.
setup.py
View file @
6b0ca1cb
...
@@ -136,6 +136,7 @@ setup(
...
@@ -136,6 +136,7 @@ setup(
'ansible_base>=2.10.9;os_name=="posix"'
,
'ansible_base>=2.10.9;os_name=="posix"'
,
'ansible_runner>=1.4.7'
,
'ansible_runner>=1.4.7'
,
'colorlog>=4.7.2'
,
'colorlog>=4.7.2'
,
'joblib>=1.0.1'
,
'knack>=0.7.2'
,
'knack>=0.7.2'
,
'omegaconf>=2.0.6'
,
'omegaconf>=2.0.6'
,
],
],
...
...
superbench/config/default.yaml
View file @
6b0ca1cb
...
@@ -4,6 +4,11 @@ superbench:
...
@@ -4,6 +4,11 @@ superbench:
benchmarks
:
benchmarks
:
matmul
:
matmul
:
enable
:
true
enable
:
true
modes
:
-
name
:
local
proc_num
:
8
prefix
:
CUDA_VISIBLE_DEVICES={proc_rank}
parallel
:
no
frameworks
:
frameworks
:
-
pytorch
-
pytorch
parameters
:
parameters
:
...
...
superbench/runner/runner.py
View file @
6b0ca1cb
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
import
random
import
random
from
pathlib
import
Path
from
pathlib
import
Path
from
joblib
import
Parallel
,
delayed
from
omegaconf
import
ListConfig
,
OmegaConf
from
omegaconf
import
ListConfig
,
OmegaConf
from
superbench.common.utils
import
SuperBenchLogger
,
logger
from
superbench.common.utils
import
SuperBenchLogger
,
logger
...
@@ -34,6 +35,7 @@ class SuperBenchRunner():
...
@@ -34,6 +35,7 @@ class SuperBenchRunner():
logger
.
info
(
'Runner writes to: %s.'
,
self
.
_output_dir
)
logger
.
info
(
'Runner writes to: %s.'
,
self
.
_output_dir
)
self
.
_sb_benchmarks
=
self
.
_sb_config
.
superbench
.
benchmarks
self
.
_sb_benchmarks
=
self
.
_sb_config
.
superbench
.
benchmarks
self
.
__validate_sb_config
()
self
.
_sb_enabled_benchmarks
=
self
.
__get_enabled_benchmarks
()
self
.
_sb_enabled_benchmarks
=
self
.
__get_enabled_benchmarks
()
logger
.
info
(
'Runner will run: %s'
,
self
.
_sb_enabled_benchmarks
)
logger
.
info
(
'Runner will run: %s'
,
self
.
_sb_enabled_benchmarks
)
...
@@ -45,6 +47,26 @@ class SuperBenchRunner():
...
@@ -45,6 +47,26 @@ class SuperBenchRunner():
"""
"""
SuperBenchLogger
.
add_handler
(
logger
.
logger
,
filename
=
str
(
Path
(
self
.
_output_dir
)
/
filename
))
SuperBenchLogger
.
add_handler
(
logger
.
logger
,
filename
=
str
(
Path
(
self
.
_output_dir
)
/
filename
))
def
__validate_sb_config
(
self
):
"""Validate SuperBench config object.
Raise:
InvalidConfigError: If input config is invalid.
"""
# TODO: add validation and defaulting
for
name
in
self
.
_sb_benchmarks
:
if
not
self
.
_sb_benchmarks
[
name
].
modes
:
self
.
_sb_benchmarks
[
name
].
modes
=
[]
for
idx
,
mode
in
enumerate
(
self
.
_sb_benchmarks
[
name
].
modes
):
if
mode
.
name
==
'local'
:
if
not
mode
.
proc_num
:
self
.
_sb_benchmarks
[
name
].
modes
[
idx
].
proc_num
=
1
if
not
mode
.
prefix
:
self
.
_sb_benchmarks
[
name
].
modes
[
idx
].
prefix
=
''
elif
mode
.
name
==
'torch.distributed'
:
if
not
mode
.
proc_num
:
self
.
_sb_benchmarks
[
name
].
modes
[
idx
].
proc_num
=
8
def
__get_enabled_benchmarks
(
self
):
def
__get_enabled_benchmarks
(
self
):
"""Get enabled benchmarks list.
"""Get enabled benchmarks list.
...
@@ -58,29 +80,42 @@ class SuperBenchRunner():
...
@@ -58,29 +80,42 @@ class SuperBenchRunner():
return
list
(
self
.
_sb_config
.
superbench
.
enable
)
return
list
(
self
.
_sb_config
.
superbench
.
enable
)
return
[
k
for
k
,
v
in
self
.
_sb_benchmarks
.
items
()
if
v
.
enable
]
return
[
k
for
k
,
v
in
self
.
_sb_benchmarks
.
items
()
if
v
.
enable
]
def
__get_mode_command
(
self
,
mode
,
exec_command
):
def
__get_mode_command
(
self
,
benchmark_name
,
mode
):
"""Get runner command for given mode.
"""Get runner command for given mode.
Args:
Args:
benchmark_name (str): Benchmark name.
mode (DictConfig): Runner mode.
mode (DictConfig): Runner mode.
exec_command (str): Executor command.
Return:
Return:
str: Runner command.
str: Runner command.
"""
"""
if
mode
.
name
==
'torch.distributed'
:
exec_command
=
(
'sb exec -c sb.config.yaml -C superbench.enable={name}'
).
format
(
name
=
benchmark_name
)
mode_command
=
exec_command
if
mode
.
name
==
'local'
:
mode_command
=
'{prefix} {command}'
.
format
(
prefix
=
mode
.
prefix
.
format
(
proc_rank
=
mode
.
proc_rank
,
proc_num
=
mode
.
proc_num
),
command
=
exec_command
,
)
elif
mode
.
name
==
'torch.distributed'
:
# TODO: replace with torch.distributed.run in v1.9
# TODO: replace with torch.distributed.run in v1.9
# TODO: only supports node_num=1 and node_num=all currently
# TODO: only supports node_num=1 and node_num=all currently
return
(
mode_command
=
(
'python3 -m torch.distributed.launch '
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node={proc_num} '
'--use_env --no_python --nproc_per_node={proc_num} '
'--nnodes={node_num} --node_rank=$NODE_RANK '
'--nnodes={node_num} --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'{command}'
'{command}
{torch_distributed_suffix}
'
).
format
(
).
format
(
proc_num
=
mode
.
proc_num
or
8
,
node_num
=
1
if
mode
.
node_num
==
1
else
'$NNODES'
,
command
=
exec_command
proc_num
=
mode
.
proc_num
,
node_num
=
1
if
mode
.
node_num
==
1
else
'$NNODES'
,
command
=
exec_command
,
torch_distributed_suffix
=
(
'superbench.benchmarks.{name}.parameters.distributed_impl=ddp '
'superbench.benchmarks.{name}.parameters.distributed_backend=nccl'
).
format
(
name
=
benchmark_name
),
)
)
return
exec
_command
return
mode
_command
.
strip
()
def
deploy
(
self
):
# pragma: no cover
def
deploy
(
self
):
# pragma: no cover
"""Deploy SuperBench environment."""
"""Deploy SuperBench environment."""
...
@@ -109,32 +144,43 @@ class SuperBenchRunner():
...
@@ -109,32 +144,43 @@ class SuperBenchRunner():
self
.
_ansible_client
.
get_playbook_config
(
'check_env.yaml'
,
extravars
=
{
'output_dir'
:
self
.
_output_dir
})
self
.
_ansible_client
.
get_playbook_config
(
'check_env.yaml'
,
extravars
=
{
'output_dir'
:
self
.
_output_dir
})
)
)
def
_run_proc
(
self
,
benchmark_name
,
mode
,
vars
):
"""Run the process.
Args:
benchmark_name (str): Benchmark name.
mode (DictConfig): Runner mode.
vars (dict): Process variables.
Returns:
int: Process return code.
"""
mode
.
update
(
vars
)
logger
.
info
(
'Runner is going to run %s in %s mode, proc rank %d.'
,
benchmark_name
,
mode
.
name
,
mode
.
proc_rank
)
rc
=
self
.
_ansible_client
.
run
(
self
.
_ansible_client
.
get_shell_config
(
(
'docker exec sb-workspace bash -c '
'"set -o allexport && source sb.env && set +o allexport && {command}"'
).
format
(
command
=
self
.
__get_mode_command
(
benchmark_name
,
mode
),
)
),
sudo
=
True
)
return
rc
def
run
(
self
):
def
run
(
self
):
"""Run the SuperBench benchmarks distributedly."""
"""Run the SuperBench benchmarks distributedly."""
self
.
check_env
()
self
.
check_env
()
runner_command
=
(
'docker exec sb-workspace bash -c '
'"set -o allexport && source sb.env && set +o allexport && {}"'
)
for
benchmark_name
in
self
.
_sb_benchmarks
:
for
benchmark_name
in
self
.
_sb_benchmarks
:
if
benchmark_name
not
in
self
.
_sb_enabled_benchmarks
:
if
benchmark_name
not
in
self
.
_sb_enabled_benchmarks
:
continue
continue
benchmark_config
=
self
.
_sb_benchmarks
[
benchmark_name
]
benchmark_config
=
self
.
_sb_benchmarks
[
benchmark_name
]
for
mode
in
benchmark_config
.
modes
or
[]:
for
mode
in
benchmark_config
.
modes
:
if
mode
.
name
==
'torch.distributed'
:
if
mode
.
name
==
'local'
:
logger
.
info
(
'Runner is going to run %s.'
,
benchmark_name
)
Parallel
(
n_jobs
=
mode
.
proc_num
if
mode
.
parallel
else
1
)(
self
.
_ansible_client
.
run
(
delayed
(
self
.
_run_proc
)(
benchmark_name
,
mode
,
{
self
.
_ansible_client
.
get_shell_config
(
'proc_rank'
:
proc_rank
runner_command
.
format
(
})
for
proc_rank
in
range
(
mode
.
proc_num
)
self
.
__get_mode_command
(
mode
,
(
'sb exec -c sb.config.yaml -C '
'superbench.enable={name} '
'superbench.benchmarks.{name}.parameters.distributed_impl=ddp '
'superbench.benchmarks.{name}.parameters.distributed_backend=nccl'
).
format
(
name
=
benchmark_name
)
)
),
sudo
=
True
)
)
)
elif
mode
.
name
==
'torch.distributed'
:
self
.
_run_proc
(
benchmark_name
,
mode
,
{
'proc_rank'
:
0
})
tests/runner/test_runner.py
View file @
6b0ca1cb
...
@@ -7,6 +7,7 @@ import unittest
...
@@ -7,6 +7,7 @@ import unittest
import
shutil
import
shutil
import
tempfile
import
tempfile
from
pathlib
import
Path
from
pathlib
import
Path
from
unittest
import
mock
from
omegaconf
import
OmegaConf
from
omegaconf
import
OmegaConf
...
@@ -36,56 +37,78 @@ class RunnerTestCase(unittest.TestCase):
...
@@ -36,56 +37,78 @@ class RunnerTestCase(unittest.TestCase):
"""Test __get_mode_command."""
"""Test __get_mode_command."""
test_cases
=
[
test_cases
=
[
{
{
'benchmark_name'
:
'foo'
,
'mode'
:
{
'mode'
:
{
'name'
:
'non_exist'
,
'name'
:
'non_exist'
,
},
},
'exec_command'
:
'sb exec'
,
'expected_command'
:
'sb exec -c sb.config.yaml -C superbench.enable=foo'
,
'expected_command'
:
'sb exec'
,
},
},
{
{
'benchmark_name'
:
'foo'
,
'mode'
:
{
'mode'
:
{
'name'
:
'torch.distributed'
,
'name'
:
'local'
,
'proc_num'
:
1
,
'prefix'
:
''
,
},
},
'exec_command'
:
'expected_command'
:
'sb exec -c sb.config.yaml -C superbench.enable=foo'
,
'sb exec'
,
},
'expected_command'
:
(
{
'python3 -m torch.distributed.launch '
'benchmark_name'
:
'--use_env --no_python --nproc_per_node=8 '
'foo'
,
'--nnodes=$NNODES --node_rank=$NODE_RANK '
'mode'
:
{
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'name'
:
'local'
,
'sb exec'
'proc_num'
:
8
,
),
'proc_rank'
:
6
,
'prefix'
:
'CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))'
},
'expected_command'
:
(
'CUDA_VISIBLE_DEVICES=6 numactl -c $((6/2)) '
'sb exec -c sb.config.yaml -C superbench.enable=foo'
),
},
{
'benchmark_name'
:
'foo'
,
'mode'
:
{
'name'
:
'local'
,
'proc_num'
:
16
,
'proc_rank'
:
1
,
'prefix'
:
'RANK={proc_rank} NUM={proc_num}'
},
'expected_command'
:
'RANK=1 NUM=16 sb exec -c sb.config.yaml -C superbench.enable=foo'
,
},
},
{
{
'benchmark_name'
:
'foo'
,
'mode'
:
{
'mode'
:
{
'name'
:
'torch.distributed'
,
'name'
:
'torch.distributed'
,
'proc_num'
:
1
,
'proc_num'
:
1
,
'node_num'
:
'all'
,
'node_num'
:
'all'
,
},
},
'exec_command'
:
'sb exec'
,
'expected_command'
:
(
'expected_command'
:
(
'python3 -m torch.distributed.launch '
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node=1 '
'--use_env --no_python --nproc_per_node=1 '
'--nnodes=$NNODES --node_rank=$NODE_RANK '
'--nnodes=$NNODES --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'sb exec'
'sb exec -c sb.config.yaml -C superbench.enable=foo '
'superbench.benchmarks.foo.parameters.distributed_impl=ddp '
'superbench.benchmarks.foo.parameters.distributed_backend=nccl'
),
),
},
},
{
{
'benchmark_name'
:
'foo'
,
'mode'
:
{
'mode'
:
{
'name'
:
'torch.distributed'
,
'name'
:
'torch.distributed'
,
'proc_num'
:
8
,
'proc_num'
:
8
,
'node_num'
:
1
,
'node_num'
:
1
,
},
},
'exec_command'
:
'sb exec'
,
'expected_command'
:
(
'expected_command'
:
(
'python3 -m torch.distributed.launch '
'python3 -m torch.distributed.launch '
'--use_env --no_python --nproc_per_node=8 '
'--use_env --no_python --nproc_per_node=8 '
'--nnodes=1 --node_rank=$NODE_RANK '
'--nnodes=1 --node_rank=$NODE_RANK '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
'sb exec'
'sb exec -c sb.config.yaml -C superbench.enable=foo '
'superbench.benchmarks.foo.parameters.distributed_impl=ddp '
'superbench.benchmarks.foo.parameters.distributed_backend=nccl'
),
),
},
},
]
]
...
@@ -93,11 +116,21 @@ class RunnerTestCase(unittest.TestCase):
...
@@ -93,11 +116,21 @@ class RunnerTestCase(unittest.TestCase):
with
self
.
subTest
(
msg
=
'Testing with case'
,
test_case
=
test_case
):
with
self
.
subTest
(
msg
=
'Testing with case'
,
test_case
=
test_case
):
self
.
assertEqual
(
self
.
assertEqual
(
self
.
runner
.
_SuperBenchRunner__get_mode_command
(
self
.
runner
.
_SuperBenchRunner__get_mode_command
(
OmegaConf
.
create
(
test_case
[
'mode'
])
,
test_case
[
'exec_command'
]
test_case
[
'benchmark_name'
],
OmegaConf
.
create
(
test_case
[
'mode'
])
),
test_case
[
'expected_command'
]
),
test_case
[
'expected_command'
]
)
)
def
test_run
(
self
):
def
test_run
_empty_benchmarks
(
self
):
"""Test run."""
"""Test run
empty benchmarks, nothing should happen
."""
self
.
runner
.
_sb_enabled_benchmarks
=
[]
self
.
runner
.
_sb_enabled_benchmarks
=
[]
self
.
runner
.
run
()
self
.
runner
.
run
()
@
mock
.
patch
(
'superbench.runner.ansible.AnsibleClient.run'
)
def
test_run_default_benchmarks
(
self
,
mock_ansible_client_run
):
"""Test run default benchmarks, mock AnsibleClient.run function.
Args:
mock_ansible_client_run (function): Mocked AnsibleClient.run function.
"""
mock_ansible_client_run
.
return_value
=
0
self
.
runner
.
run
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment