Unverified Commit a15f773b authored by Yifan Xiong's avatar Yifan Xiong Committed by GitHub
Browse files

Bug - Fix issues for Ansible and benchmarks (#267)

__Description__

Fix issues for Ansible and benchmarks:
* Cleanup Ansible runner private data dir to avoid out of disk space issue when node number is large.
* Support both absolute and relative paths when fetching results.
* Use a deterministic image in Ansible test to avoid image update.
* Update logging format.
* Delete torch models and inputs after export.
parent 682ed06a
......@@ -129,10 +129,11 @@ def export_torchvision_model(self, model_name, batch_size=1):
if not self.check_torchvision_model(model_name):
return ''
file_name = str(self._onnx_model_path / (model_name + '.onnx'))
input_shape = (batch_size, 3, 224, 224)
model = getattr(torchvision.models, model_name)(pretrained=False).eval().cuda()
dummy_input = torch.randn((batch_size, 3, 224, 224), device='cuda')
torch.onnx.export(
getattr(torchvision.models, model_name)(pretrained=False).eval().cuda(),
torch.randn(input_shape, device='cuda'),
model,
dummy_input,
file_name,
opset_version=10,
operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
......@@ -147,6 +148,10 @@ def export_torchvision_model(self, model_name, batch_size=1):
}
},
)
del model
del dummy_input
torch.cuda.empty_cache()
return file_name
def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
......@@ -163,13 +168,13 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
if not self.check_benchmark_model(model_name):
return
file_name = str(self._onnx_model_path / (model_name + '.onnx'))
input_shape, dtype = (batch_size, seq_length), torch.int64
model = self.benchmark_models[model_name]().eval().cuda()
dummy_input = torch.ones((batch_size, seq_length), dtype=torch.int64, device='cuda')
if model_name == 'lstm':
input_shape += (self.lstm_input_size, )
dtype = None
dummy_input = torch.ones((batch_size, seq_length, self.lstm_input_size), device='cuda')
torch.onnx.export(
self.benchmark_models[model_name]().eval().cuda(),
torch.ones(input_shape, dtype=dtype, device='cuda'),
model,
dummy_input,
file_name,
opset_version=10,
do_constant_folding=True,
......@@ -185,4 +190,8 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
}
},
)
del model
del dummy_input
torch.cuda.empty_cache()
return file_name
......@@ -3,6 +3,7 @@
"""SuperBench Ansible Client."""
import tempfile
from pathlib import Path
import ansible_runner
......@@ -22,7 +23,6 @@ def __init__(self, config):
"""
self._playbook_path = Path(__file__).parent / 'playbooks'
self._config = {
'private_data_dir': None,
'host_pattern': 'localhost',
'cmdline': '--forks 128',
}
......@@ -69,12 +69,13 @@ def run(self, ansible_config, sudo=False): # pragma: no cover
if sudo:
logger.info('Run as sudo ...')
ansible_config['cmdline'] += ' --become'
r = ansible_runner.run(**ansible_config)
with tempfile.TemporaryDirectory(prefix='ansible') as tmpdir:
r = ansible_runner.run(private_data_dir=tmpdir, **ansible_config)
logger.debug(r.stats)
if r.rc == 0:
logger.info('Run succeed, return code {}.'.format(r.rc))
else:
logger.warning('Run failed, return code {}.'.format(r.rc))
logger.debug(r.stats)
return r.rc
def update_mpi_config(self, ansible_config):
......
- name: Fetch Results
hosts: all
gather_facts: true
vars:
workspace: '{{ ansible_user_dir }}/sb-workspace'
tasks:
- name: Synchronize Output Directory
ansible.posix.synchronize:
mode: pull
src: '{{ sb_output_dir }}/'
src: '{{ sb_output_dir if sb_output_dir.startswith("/") else workspace + "/" + sb_output_dir }}/'
dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}'
rsync_opts:
- --exclude=nodes
......@@ -39,7 +39,7 @@ def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir):
self._ansible_client = AnsibleClient(ansible_config)
self.__set_logger('sb-run.log')
logger.info('Runner uses config: %s.', pformat(self._sb_config))
logger.info('Runner uses config: %s.', pformat(OmegaConf.to_container(self._sb_config, resolve=True)))
logger.info('Runner writes to: %s.', str(self._output_path))
self._sb_benchmarks = self._sb_config.superbench.benchmarks
......
......@@ -14,4 +14,5 @@
vars:
ssh_port: 12345
output_dir: /tmp/test_ansible
docker_image: superbench/superbench
# use a mock superbench image (requires `sb` binary inside)
docker_image: superbench/superbench:v0.3.0-cuda11.1.1
......@@ -47,7 +47,6 @@ def test_init_config(self):
"""Test initial config of client."""
self.assertDictEqual(
self.ansible_client._config, {
'private_data_dir': None,
'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': {
......@@ -71,7 +70,6 @@ def test_get_shell_config(self):
cmd = 'ls -la'
self.assertDictEqual(
self.ansible_client.get_shell_config(cmd), {
'private_data_dir': None,
'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': {
......@@ -87,7 +85,6 @@ def test_get_playbook_config(self):
"""Test get_playbook_config of client."""
self.assertDictEqual(
self.ansible_client.get_playbook_config('play', {'foo': 'bar'}), {
'private_data_dir': None,
'host_pattern': 'all',
'cmdline': f'--forks 5 --inventory {self.host_file} --user user --ask-pass --ask-become-pass',
'passwords': {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment