Unverified Commit a15f773b authored by Yifan Xiong's avatar Yifan Xiong Committed by GitHub
Browse files

Bug - Fix issues for Ansible and benchmarks (#267)

__Description__

Fix issues for Ansible and benchmarks:
* Clean up the Ansible runner's private data dir to avoid running out of disk space when the number of nodes is large.
* Support both absolute and relative paths when fetching results.
* Use a deterministic image in the Ansible test to avoid image updates.
* Update logging format.
* Delete torch models and inputs after export.
parent 682ed06a
...@@ -129,10 +129,11 @@ def export_torchvision_model(self, model_name, batch_size=1): ...@@ -129,10 +129,11 @@ def export_torchvision_model(self, model_name, batch_size=1):
if not self.check_torchvision_model(model_name): if not self.check_torchvision_model(model_name):
return '' return ''
file_name = str(self._onnx_model_path / (model_name + '.onnx')) file_name = str(self._onnx_model_path / (model_name + '.onnx'))
input_shape = (batch_size, 3, 224, 224) model = getattr(torchvision.models, model_name)(pretrained=False).eval().cuda()
dummy_input = torch.randn((batch_size, 3, 224, 224), device='cuda')
torch.onnx.export( torch.onnx.export(
getattr(torchvision.models, model_name)(pretrained=False).eval().cuda(), model,
torch.randn(input_shape, device='cuda'), dummy_input,
file_name, file_name,
opset_version=10, opset_version=10,
operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
...@@ -147,6 +148,10 @@ def export_torchvision_model(self, model_name, batch_size=1): ...@@ -147,6 +148,10 @@ def export_torchvision_model(self, model_name, batch_size=1):
} }
}, },
) )
del model
del dummy_input
torch.cuda.empty_cache()
return file_name return file_name
def export_benchmark_model(self, model_name, batch_size=1, seq_length=512): def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
...@@ -163,13 +168,13 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512): ...@@ -163,13 +168,13 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
if not self.check_benchmark_model(model_name): if not self.check_benchmark_model(model_name):
return return
file_name = str(self._onnx_model_path / (model_name + '.onnx')) file_name = str(self._onnx_model_path / (model_name + '.onnx'))
input_shape, dtype = (batch_size, seq_length), torch.int64 model = self.benchmark_models[model_name]().eval().cuda()
dummy_input = torch.ones((batch_size, seq_length), dtype=torch.int64, device='cuda')
if model_name == 'lstm': if model_name == 'lstm':
input_shape += (self.lstm_input_size, ) dummy_input = torch.ones((batch_size, seq_length, self.lstm_input_size), device='cuda')
dtype = None
torch.onnx.export( torch.onnx.export(
self.benchmark_models[model_name]().eval().cuda(), model,
torch.ones(input_shape, dtype=dtype, device='cuda'), dummy_input,
file_name, file_name,
opset_version=10, opset_version=10,
do_constant_folding=True, do_constant_folding=True,
...@@ -185,4 +190,8 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512): ...@@ -185,4 +190,8 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
} }
}, },
) )
del model
del dummy_input
torch.cuda.empty_cache()
return file_name return file_name
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
"""SuperBench Ansible Client.""" """SuperBench Ansible Client."""
import tempfile
from pathlib import Path from pathlib import Path
import ansible_runner import ansible_runner
...@@ -22,7 +23,6 @@ def __init__(self, config): ...@@ -22,7 +23,6 @@ def __init__(self, config):
""" """
self._playbook_path = Path(__file__).parent / 'playbooks' self._playbook_path = Path(__file__).parent / 'playbooks'
self._config = { self._config = {
'private_data_dir': None,
'host_pattern': 'localhost', 'host_pattern': 'localhost',
'cmdline': '--forks 128', 'cmdline': '--forks 128',
} }
...@@ -69,12 +69,13 @@ def run(self, ansible_config, sudo=False): # pragma: no cover ...@@ -69,12 +69,13 @@ def run(self, ansible_config, sudo=False): # pragma: no cover
if sudo: if sudo:
logger.info('Run as sudo ...') logger.info('Run as sudo ...')
ansible_config['cmdline'] += ' --become' ansible_config['cmdline'] += ' --become'
r = ansible_runner.run(**ansible_config) with tempfile.TemporaryDirectory(prefix='ansible') as tmpdir:
r = ansible_runner.run(private_data_dir=tmpdir, **ansible_config)
logger.debug(r.stats)
if r.rc == 0: if r.rc == 0:
logger.info('Run succeed, return code {}.'.format(r.rc)) logger.info('Run succeed, return code {}.'.format(r.rc))
else: else:
logger.warning('Run failed, return code {}.'.format(r.rc)) logger.warning('Run failed, return code {}.'.format(r.rc))
logger.debug(r.stats)
return r.rc return r.rc
def update_mpi_config(self, ansible_config): def update_mpi_config(self, ansible_config):
......
- name: Fetch Results - name: Fetch Results
hosts: all hosts: all
gather_facts: true gather_facts: true
vars:
workspace: '{{ ansible_user_dir }}/sb-workspace'
tasks: tasks:
- name: Synchronize Output Directory - name: Synchronize Output Directory
ansible.posix.synchronize: ansible.posix.synchronize:
mode: pull mode: pull
src: '{{ sb_output_dir }}/' src: '{{ sb_output_dir if sb_output_dir.startswith("/") else workspace + "/" + sb_output_dir }}/'
dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}' dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}'
rsync_opts: rsync_opts:
- --exclude=nodes - --exclude=nodes
...@@ -39,7 +39,7 @@ def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir): ...@@ -39,7 +39,7 @@ def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir):
self._ansible_client = AnsibleClient(ansible_config) self._ansible_client = AnsibleClient(ansible_config)
self.__set_logger('sb-run.log') self.__set_logger('sb-run.log')
logger.info('Runner uses config: %s.', pformat(self._sb_config)) logger.info('Runner uses config: %s.', pformat(OmegaConf.to_container(self._sb_config, resolve=True)))
logger.info('Runner writes to: %s.', str(self._output_path)) logger.info('Runner writes to: %s.', str(self._output_path))
self._sb_benchmarks = self._sb_config.superbench.benchmarks self._sb_benchmarks = self._sb_config.superbench.benchmarks
......
...@@ -14,4 +14,5 @@ ...@@ -14,4 +14,5 @@
vars: vars:
ssh_port: 12345 ssh_port: 12345
output_dir: /tmp/test_ansible output_dir: /tmp/test_ansible
docker_image: superbench/superbench # use a mock superbench image (requires `sb` binary inside)
docker_image: superbench/superbench:v0.3.0-cuda11.1.1
...@@ -47,7 +47,6 @@ def test_init_config(self): ...@@ -47,7 +47,6 @@ def test_init_config(self):
"""Test initial config of client.""" """Test initial config of client."""
self.assertDictEqual( self.assertDictEqual(
self.ansible_client._config, { self.ansible_client._config, {
'private_data_dir': None,
'host_pattern': 'all', 'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass', 'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': { 'passwords': {
...@@ -71,7 +70,6 @@ def test_get_shell_config(self): ...@@ -71,7 +70,6 @@ def test_get_shell_config(self):
cmd = 'ls -la' cmd = 'ls -la'
self.assertDictEqual( self.assertDictEqual(
self.ansible_client.get_shell_config(cmd), { self.ansible_client.get_shell_config(cmd), {
'private_data_dir': None,
'host_pattern': 'all', 'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass', 'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': { 'passwords': {
...@@ -87,7 +85,6 @@ def test_get_playbook_config(self): ...@@ -87,7 +85,6 @@ def test_get_playbook_config(self):
"""Test get_playbook_config of client.""" """Test get_playbook_config of client."""
self.assertDictEqual( self.assertDictEqual(
self.ansible_client.get_playbook_config('play', {'foo': 'bar'}), { self.ansible_client.get_playbook_config('play', {'foo': 'bar'}), {
'private_data_dir': None,
'host_pattern': 'all', 'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass', 'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': { 'passwords': {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment