Unverified Commit a15f773b authored by Yifan Xiong's avatar Yifan Xiong Committed by GitHub
Browse files

Bug - Fix issues for Ansible and benchmarks (#267)

__Description__

Fix issues for Ansible and benchmarks:
* Cleanup Ansible runner private data dir to avoid out of disk space issue when node number is large.
* Support both absolute and relative paths when fetching results.
* Use a deterministic image in Ansible test to avoid image update.
* Update logging format.
* Delete torch models and inputs after export.
parent 682ed06a
......@@ -129,10 +129,11 @@ def export_torchvision_model(self, model_name, batch_size=1):
if not self.check_torchvision_model(model_name):
return ''
file_name = str(self._onnx_model_path / (model_name + '.onnx'))
input_shape = (batch_size, 3, 224, 224)
model = getattr(torchvision.models, model_name)(pretrained=False).eval().cuda()
dummy_input = torch.randn((batch_size, 3, 224, 224), device='cuda')
torch.onnx.export(
getattr(torchvision.models, model_name)(pretrained=False).eval().cuda(),
torch.randn(input_shape, device='cuda'),
model,
dummy_input,
file_name,
opset_version=10,
operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
......@@ -147,6 +148,10 @@ def export_torchvision_model(self, model_name, batch_size=1):
}
},
)
del model
del dummy_input
torch.cuda.empty_cache()
return file_name
def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
......@@ -163,13 +168,13 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
if not self.check_benchmark_model(model_name):
return
file_name = str(self._onnx_model_path / (model_name + '.onnx'))
input_shape, dtype = (batch_size, seq_length), torch.int64
model = self.benchmark_models[model_name]().eval().cuda()
dummy_input = torch.ones((batch_size, seq_length), dtype=torch.int64, device='cuda')
if model_name == 'lstm':
input_shape += (self.lstm_input_size, )
dtype = None
dummy_input = torch.ones((batch_size, seq_length, self.lstm_input_size), device='cuda')
torch.onnx.export(
self.benchmark_models[model_name]().eval().cuda(),
torch.ones(input_shape, dtype=dtype, device='cuda'),
model,
dummy_input,
file_name,
opset_version=10,
do_constant_folding=True,
......@@ -185,4 +190,8 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
}
},
)
del model
del dummy_input
torch.cuda.empty_cache()
return file_name
......@@ -3,6 +3,7 @@
"""SuperBench Ansible Client."""
import tempfile
from pathlib import Path
import ansible_runner
......@@ -22,7 +23,6 @@ def __init__(self, config):
"""
self._playbook_path = Path(__file__).parent / 'playbooks'
self._config = {
'private_data_dir': None,
'host_pattern': 'localhost',
'cmdline': '--forks 128',
}
......@@ -69,12 +69,13 @@ def run(self, ansible_config, sudo=False): # pragma: no cover
if sudo:
logger.info('Run as sudo ...')
ansible_config['cmdline'] += ' --become'
r = ansible_runner.run(**ansible_config)
with tempfile.TemporaryDirectory(prefix='ansible') as tmpdir:
r = ansible_runner.run(private_data_dir=tmpdir, **ansible_config)
logger.debug(r.stats)
if r.rc == 0:
logger.info('Run succeed, return code {}.'.format(r.rc))
else:
logger.warning('Run failed, return code {}.'.format(r.rc))
logger.debug(r.stats)
return r.rc
def update_mpi_config(self, ansible_config):
......
- name: Fetch Results
hosts: all
gather_facts: true
vars:
workspace: '{{ ansible_user_dir }}/sb-workspace'
tasks:
- name: Synchronize Output Directory
ansible.posix.synchronize:
mode: pull
src: '{{ sb_output_dir }}/'
src: '{{ sb_output_dir if sb_output_dir.startswith("/") else workspace + "/" + sb_output_dir }}/'
dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}'
rsync_opts:
- --exclude=nodes
......@@ -39,7 +39,7 @@ def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir):
self._ansible_client = AnsibleClient(ansible_config)
self.__set_logger('sb-run.log')
logger.info('Runner uses config: %s.', pformat(self._sb_config))
logger.info('Runner uses config: %s.', pformat(OmegaConf.to_container(self._sb_config, resolve=True)))
logger.info('Runner writes to: %s.', str(self._output_path))
self._sb_benchmarks = self._sb_config.superbench.benchmarks
......
......@@ -14,4 +14,5 @@
vars:
ssh_port: 12345
output_dir: /tmp/test_ansible
docker_image: superbench/superbench
# use a mock superbench image (requires `sb` binary inside)
docker_image: superbench/superbench:v0.3.0-cuda11.1.1
......@@ -47,7 +47,6 @@ def test_init_config(self):
"""Test initial config of client."""
self.assertDictEqual(
self.ansible_client._config, {
'private_data_dir': None,
'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': {
......@@ -71,7 +70,6 @@ def test_get_shell_config(self):
cmd = 'ls -la'
self.assertDictEqual(
self.ansible_client.get_shell_config(cmd), {
'private_data_dir': None,
'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': {
......@@ -87,7 +85,6 @@ def test_get_playbook_config(self):
"""Test get_playbook_config of client."""
self.assertDictEqual(
self.ansible_client.get_playbook_config('play', {'foo': 'bar'}), {
'private_data_dir': None,
'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment