Unverified Commit c0c43b8f authored by Yifan Xiong, committed by GitHub

Bug bash - Fix bugs in multi GPU benchmarks (#98)

* Add `sb deploy` command content.
* Fix inline if-expression syntax in playbook.
* Fix quote escape issue in bash command.
* Add custom env in config.
* Update default config for multi GPU benchmarks.
* Update MANIFEST.in to include jinja2 template.
* Require jinja2 minimum version.
* Fix occasional duplicate output in Ansible runner.
* Fix mixed color from Ansible and Python colorlog.
* Update according to comments.
* Change superbench.env from list to dict in config file.
parent 216c5b5c
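Two of the bullets above interact: `superbench.env` is now a mapping in the config, and the runner joins it into `KEY=value` lines for `sb.env` (see the runner diff below). A minimal sketch of the old and new shapes, with a hypothetical variable name:

    import yaml

    # Old shape: a list of 'KEY=value' strings (hypothetical variable).
    old_env = yaml.safe_load('env:\n  - NCCL_DEBUG=INFO\n')['env']
    # New shape: a mapping, which merges and overrides more cleanly.
    new_env = yaml.safe_load('env:\n  NCCL_DEBUG: INFO\n')['env']
    assert old_env == ['NCCL_DEBUG=INFO'] and new_env == {'NCCL_DEBUG': 'INFO'}

    # The runner flattens the mapping back into sb.env lines:
    print('\n'.join(f'{k}={v}' for k, v in new_env.items()))  # NCCL_DEBUG=INFO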
MANIFEST.in:
 include LICENSE README.md
-recursive-include superbench *.py
-recursive-include superbench *.yaml
-global-exclude *.pyc
-global-exclude __pycache__
+recursive-include superbench *.py *.j2 *.yaml
+global-exclude *.py[cod] __pycache__
...@@ -134,11 +134,13 @@ def run(self): ...@@ -134,11 +134,13 @@ def run(self):
python_requires='>=3.6, <4', python_requires='>=3.6, <4',
install_requires=[ install_requires=[
'ansible_base>=2.10.9;os_name=="posix"', 'ansible_base>=2.10.9;os_name=="posix"',
'ansible_runner>=1.4.7', 'ansible_runner>=2.0.0rc1',
'colorlog>=4.7.2', 'colorlog>=4.7.2',
'jinja2>=2.10.1',
'joblib>=1.0.1', 'joblib>=1.0.1',
'knack>=0.7.2', 'knack>=0.7.2',
'omegaconf==2.0.6', 'omegaconf==2.0.6',
'pyyaml>=5.3',
], ],
extras_require={ extras_require={
'dev': ['pre-commit>=2.10.0'], 'dev': ['pre-commit>=2.10.0'],
......
@@ -227,8 +227,8 @@ def deploy_command_handler(
         private_key=private_key,
     )
-    SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
-    raise NotImplementedError
+    runner = SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
+    runner.deploy()


 def run_command_handler(
...
@@ -6,6 +6,7 @@
 from pathlib import Path
 from datetime import datetime

+import yaml
 from omegaconf import OmegaConf
@@ -38,4 +39,5 @@ def get_sb_config(config_file):
     p = Path(config_file) if config_file else default_config_file
     if not p.is_file():
         return None
-    return OmegaConf.load(str(p))
+    with p.open() as fp:
+        return OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
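The loader now round-trips through PyYAML before handing the data to OmegaConf, presumably so that YAML anchors and `<<:` merge keys (which the rewritten default config below relies on) are expanded by `SafeLoader` before OmegaConf ever sees them. A minimal sketch with a made-up config snippet:

    import yaml
    from omegaconf import OmegaConf

    # Hypothetical trimmed-down config using an anchor and a merge key:
    text = """
    var:
      base: &base
        enable: true
    benchmarks:
      kernel-launch:
        <<: *base
    """
    # SafeLoader expands *base and <<: into plain mappings ...
    data = yaml.load(text, Loader=yaml.SafeLoader)
    # ... so OmegaConf.create only receives ordinary dicts.
    config = OmegaConf.create(data)
    assert config.benchmarks['kernel-launch'].enable is True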
@@ -41,6 +41,7 @@ def add_handler(logger, stream=sys.stdout, filename=None, color=False):
     )
     if color:
         formatter = colorlog.ColoredFormatter(
+            '%(reset)s'
             '[%(cyan)s%(asctime)s %(hostname)s:%(process)d%(reset)s]'
             '[%(blue)s%(filename)s:%(lineno)s%(reset)s]'
             '[%(log_color)s%(levelname)s%(reset)s] %(message)s'
...
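The added leading `%(reset)s` clears any ANSI color state left behind by other writers (here, Ansible's colored output) before each Python log record is printed. A standalone sketch of the same formatter setup, using a throwaway logger name:

    import sys
    import colorlog

    handler = colorlog.StreamHandler(sys.stdout)
    handler.setFormatter(
        colorlog.ColoredFormatter(
            '%(reset)s'  # start every record from a clean color state
            '[%(log_color)s%(levelname)s%(reset)s] %(message)s'
        )
    )
    logger = colorlog.getLogger('sb-demo')  # hypothetical logger name
    logger.addHandler(handler)
    logger.warning('not tinted by whatever was printed before')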
superbench/config/default.yaml:
 # SuperBench Config
 superbench:
   enable: null
-  benchmarks:
-    kernel-launch:
-      enable: true
-    gemm-flops:
-      enable: true
-    cudnn-function:
-      enable: true
-    cublas-function:
-      enable: true
-    matmul:
+  var:
+    default_local_mode: &default_local_mode
       enable: true
       modes:
         - name: local
           proc_num: 8
           prefix: CUDA_VISIBLE_DEVICES={proc_rank}
-          parallel: no
-      frameworks:
-        - pytorch
-    gpt_models:
+          parallel: yes
+    default_pytorch_mode: &default_pytorch_mode
       enable: true
       modes:
         - name: torch.distributed
           proc_num: 8
-          node_num: all
+          node_num: 1
       frameworks:
         - pytorch
+    common_model_config: &common_model_config
+      duration: 0
+      num_warmup: 16
+      num_steps: 128
+      precision:
+        - float32
+        - float16
+      model_action:
+        - train
+  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+    cudnn-function:
+      <<: *default_local_mode
+    cublas-function:
+      <<: *default_local_mode
+    matmul:
+      <<: *default_local_mode
+      frameworks:
+        - pytorch
+    sharding-matmul:
+      <<: *default_pytorch_mode
+    computation-communication-overlap:
+      <<: *default_pytorch_mode
+    gpt_models:
+      <<: *default_pytorch_mode
       models:
         - gpt2-small
         - gpt2-large
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 4
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
     bert_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+      <<: *default_pytorch_mode
       models:
         - bert-base
         - bert-large
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 16
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
+        <<: *common_model_config
+        batch_size: 8
     lstm_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+      <<: *default_pytorch_mode
       models:
         - lstm
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
-    cnn_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+    resnet_models:
+      <<: *default_pytorch_mode
       models:
         - resnet50
         - resnet101
         - resnet152
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    densenet_models:
+      <<: *default_pytorch_mode
+      models:
         - densenet169
         - densenet201
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    vgg_models:
+      <<: *default_pytorch_mode
+      models:
         - vgg11
         - vgg13
         - vgg16
         - vgg19
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
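The rewritten config hoists shared settings under `superbench.var` as anchors and pulls them into each benchmark with `<<:`; a key written next to the merge key overrides the anchored value, which is how each model family keeps its own `batch_size`. A quick check of that override rule with a made-up snippet:

    import yaml

    # Hypothetical reduced example of the merge-key override used above:
    doc = """
    var:
      common: &common
        num_steps: 128
        batch_size: 32
    parameters:
      <<: *common
      batch_size: 4
    """
    parameters = yaml.safe_load(doc)['parameters']
    # The local batch_size wins over the anchored default:
    assert parameters == {'num_steps': 128, 'batch_size': 4}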
@@ -22,10 +22,14 @@
     container: sb-workspace
     sb_nodes: '{{ hostvars.values() | map(attribute="ansible_hostname") | sort }}'
     sb_env: |
+      # pytorch env
       NNODES={{ sb_nodes | length }}
       NODE_RANK={{ lookup('ansible.utils.index_of', sb_nodes, 'eq', ansible_hostname) }}
       MASTER_ADDR={{ sb_nodes | first }}
       MASTER_PORT=29500
+      OMP_NUM_THREADS=1
+      # config env
+      {{ env | default('') }}
   tasks:
     - name: Updating Config
       copy:
...
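The `sb_env` block now appends the user-supplied `env` string from the config after the generated torch.distributed variables. Roughly how that template renders, using plain Jinja2 with hypothetical values (the real playbook also derives `NODE_RANK` from an Ansible lookup):

    from jinja2 import Template

    sb_nodes = ['node0', 'node1']  # hypothetical sorted hostnames
    template = Template(
        'NNODES={{ sb_nodes | length }}\n'
        'MASTER_ADDR={{ sb_nodes | first }}\n'
        'MASTER_PORT=29500\n'
        'OMP_NUM_THREADS=1\n'
        "{{ env | default('') }}"
    )
    # `env` is the joined KEY=value lines passed in from the runner:
    print(template.render(sb_nodes=sb_nodes, env='NCCL_DEBUG=INFO'))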
@@ -65,8 +65,8 @@
           docker rm --force {{ container }} ||: && \
           docker run -itd --name={{ container }} \
             --privileged --net=host --ipc=host \
-            {{ '--gpus=all' if gpu_vendor == 'nvidia' }} \
-            {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' }} \
+            {{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }} \
+            {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' else '' }} \
             -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
             {{ docker_image }} bash && \
           docker exec {{ container }} bash -c \
...
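In Jinja2, an inline `if` without an `else` evaluates to an undefined value when the condition is false; under a strict-undefined policy (as Ansible applies when templating tasks) that aborts the render instead of emitting an empty string. A plausible reproduction of the failure the fix addresses:

    from jinja2 import Environment, StrictUndefined
    from jinja2.exceptions import UndefinedError

    env = Environment(undefined=StrictUndefined)

    broken = env.from_string("docker run {{ '--gpus=all' if gpu_vendor == 'nvidia' }} ...")
    try:
        broken.render(gpu_vendor='amd')  # false branch -> undefined -> error
    except UndefinedError as e:
        print('render fails:', e)

    fixed = env.from_string("docker run {{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }} ...")
    print(fixed.render(gpu_vendor='amd'))  # renders with the option omitted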
@@ -54,6 +54,8 @@ def __validate_sb_config(self):
             InvalidConfigError: If input config is invalid.
         """
         # TODO: add validation and defaulting
+        if not self._sb_config.superbench.env:
+            self._sb_config.superbench.env = {}
         for name in self._sb_benchmarks:
             if not self._sb_benchmarks[name].modes:
                 self._sb_benchmarks[name].modes = []
@@ -141,7 +143,13 @@ def check_env(self):  # pragma: no cover
         logger.info('Checking SuperBench environment.')
         OmegaConf.save(config=self._sb_config, f=str(Path(self._output_dir) / 'sb.config.yaml'))
         self._ansible_client.run(
-            self._ansible_client.get_playbook_config('check_env.yaml', extravars={'output_dir': self._output_dir})
+            self._ansible_client.get_playbook_config(
+                'check_env.yaml',
+                extravars={
+                    'output_dir': self._output_dir,
+                    'env': '\n'.join(f'{k}={v}' for k, v in self._sb_config.superbench.env.items()),
+                }
+            )
         )

     def _run_proc(self, benchmark_name, mode, vars):
@@ -161,7 +169,7 @@ def _run_proc(self, benchmark_name, mode, vars):
             self._ansible_client.get_shell_config(
                 (
                     'docker exec sb-workspace bash -c '
-                    '"set -o allexport && source sb.env && set +o allexport && {command}"'
+                    "'set -o allexport && source sb.env && set +o allexport && {command}'"
                 ).format(command=self.__get_mode_command(benchmark_name, mode), )
             ),
             sudo=True
...
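The generated mode command is interpolated into this shell string, so wrapping it in double quotes breaks as soon as `{command}` contains double quotes of its own; switching the inner wrapper to single quotes sidesteps the escaping. A small demonstration of the difference, with a stand-in command:

    import subprocess

    inner = 'echo "hello world"'  # stand-in for the generated benchmark command

    # Double-quote wrapper: the inner quotes pair with the outer ones and
    # the shell splits the command differently than intended.
    broken = f'bash -c "set -o allexport && {inner}"'
    # Single-quote wrapper: the inner double quotes survive untouched.
    fixed = f"bash -c 'set -o allexport && {inner}'"

    for cmd in (broken, fixed):
        out = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        print(repr(out.stdout))
    # prints 'hello\n' for the broken wrapper, 'hello world\n' for the fixed one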
@@ -53,7 +53,11 @@ def test_sb_version(self):

     def test_sb_deploy(self):
         """Test sb deploy."""
-        self.cmd('sb deploy --host-list localhost', expect_failure=True)
+        self.cmd('sb deploy --host-list localhost', checks=[NoneCheck()])
+
+    def test_sb_deploy_no_host(self):
+        """Test sb deploy, no host_file or host_list provided, should fail."""
+        self.cmd('sb deploy', expect_failure=True)

     def test_sb_exec(self):
         """Test sb exec."""
...
@@ -10,6 +10,7 @@
 from pathlib import Path
 from unittest import mock

+import yaml
 from omegaconf import OmegaConf

 from superbench.executor import SuperBenchExecutor
@@ -24,7 +25,8 @@ class ExecutorTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()
         self.executor = SuperBenchExecutor(self.default_config, self.output_dir)
@@ -61,20 +63,32 @@ def test_get_platform(self):
     def test_get_arguments(self):
         """Test benchmarks arguments."""
-        expected_matmul_args = ''
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.matmul.parameters
-            ), expected_matmul_args
-        )
-        expected_bert_models_args = \
-            '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 ' \
-            '--precision float32 float16 --model_action train inference'
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.bert_models.parameters
-            ), expected_bert_models_args
-        )
+        test_cases = [
+            {
+                'parameters': None,
+                'expected_args': '',
+            },
+            {
+                'parameters': {
+                    'duration': 0,
+                    'num_warmup': 16,
+                    'num_steps': 128,
+                    'batch_size': 16,
+                    'precision': ['float32', 'float16'],
+                    'model_action': ['train', 'inference'],
+                },
+                'expected_args': (
+                    '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 '
+                    '--precision float32 float16 --model_action train inference'
+                ),
+            },
+        ]
+        for test_case in test_cases:
+            with self.subTest(msg='Testing with case', test_case=test_case):
+                self.assertEqual(
+                    self.executor._SuperBenchExecutor__get_arguments(test_case['parameters']),
+                    test_case['expected_args']
+                )

     def test_create_benchmark_dir(self):
         """Test __create_benchmark_dir."""
...
@@ -9,6 +9,7 @@
 from pathlib import Path
 from unittest import mock

+import yaml
 from omegaconf import OmegaConf

 from superbench.runner import SuperBenchRunner
@@ -19,7 +20,8 @@ class RunnerTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()
         self.runner = SuperBenchRunner(self.default_config, None, None, self.output_dir)
...