Unverified commit 213ab14b, authored by Yifan Xiong and committed by GitHub
Browse files

Bug - Fix issues for distributed runs (#258)

Fix issues for distributed runs:
* fix config for memory bandwidth benchmarks
* add throttling for high-concurrency docker pull
* update rsync path and exclude directories
* handle exceptions when creating summary
* tune logging output
parent 44f0270e
...@@ -48,7 +48,12 @@ superbench: ...@@ -48,7 +48,12 @@ superbench:
ngpus: 8 ngpus: 8
operation: allreduce operation: allreduce
mem-bw: mem-bw:
<<: *default_local_mode enable: true
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
parallel: no
gemm-flops: gemm-flops:
<<: *default_local_mode <<: *default_local_mode
parameters: parameters:
......
...@@ -49,7 +49,12 @@ superbench: ...@@ -49,7 +49,12 @@ superbench:
ngpus: 8 ngpus: 8
operation: allreduce operation: allreduce
mem-bw: mem-bw:
<<: *default_local_mode enable: true
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
parallel: no
gemm-flops: gemm-flops:
<<: *default_local_mode <<: *default_local_mode
parameters: parameters:
......
...@@ -64,7 +64,7 @@ superbench: ...@@ -64,7 +64,7 @@ superbench:
- name: local - name: local
proc_num: 8 proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
parallel: yes parallel: no
disk-benchmark: disk-benchmark:
enable: true enable: true
modes: modes:
......
...@@ -60,7 +60,7 @@ superbench: ...@@ -60,7 +60,7 @@ superbench:
- name: local - name: local
proc_num: 8 proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
parallel: yes parallel: no
disk-benchmark: disk-benchmark:
enable: false enable: false
modes: modes:
......
...@@ -62,7 +62,7 @@ superbench: ...@@ -62,7 +62,7 @@ superbench:
- name: local - name: local
proc_num: 8 proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
parallel: yes parallel: no
gpu-copy-bw: gpu-copy-bw:
enable: true enable: true
modes: modes:
......
...@@ -28,13 +28,13 @@ def __init__(self, sb_config, sb_output_dir): ...@@ -28,13 +28,13 @@ def __init__(self, sb_config, sb_output_dir):
self._output_path = Path(sb_output_dir).expanduser().resolve() self._output_path = Path(sb_output_dir).expanduser().resolve()
self.__set_logger('sb-exec.log') self.__set_logger('sb-exec.log')
logger.info('Executor uses config: %s.', self._sb_config) logger.debug('Executor uses config: %s.', self._sb_config)
logger.info('Executor writes to: %s.', str(self._output_path)) logger.debug('Executor writes to: %s.', str(self._output_path))
self.__validate_sb_config() self.__validate_sb_config()
self._sb_benchmarks = self._sb_config.superbench.benchmarks self._sb_benchmarks = self._sb_config.superbench.benchmarks
self._sb_enabled = self.__get_enabled_benchmarks() self._sb_enabled = self.__get_enabled_benchmarks()
logger.info('Executor will execute: %s', self._sb_enabled) logger.debug('Executor will execute: %s', self._sb_enabled)
def __set_logger(self, filename): def __set_logger(self, filename):
"""Set logger and add file handler. """Set logger and add file handler.
......
...@@ -74,7 +74,7 @@ def run(self, ansible_config, sudo=False): # pragma: no cover ...@@ -74,7 +74,7 @@ def run(self, ansible_config, sudo=False): # pragma: no cover
logger.info('Run succeed, return code {}.'.format(r.rc)) logger.info('Run succeed, return code {}.'.format(r.rc))
else: else:
logger.warning('Run failed, return code {}.'.format(r.rc)) logger.warning('Run failed, return code {}.'.format(r.rc))
logger.info(r.stats) logger.debug(r.stats)
return r.rc return r.rc
def update_mpi_config(self, ansible_config): def update_mpi_config(self, ansible_config):
......
...@@ -92,6 +92,7 @@ ...@@ -92,6 +92,7 @@
shell: | shell: |
docker pull {{ docker_image }} docker pull {{ docker_image }}
become: yes become: yes
throttle: 32
- name: Starting Container - name: Starting Container
shell: | shell: |
docker rm --force {{ container }} ||: && \ docker rm --force {{ container }} ||: && \
......
- name: Fetch Results - name: Fetch Results
hosts: all hosts: all
gather_facts: true gather_facts: true
vars:
workspace: '{{ ansible_user_dir }}/sb-workspace'
tasks: tasks:
- name: Synchronize Output Directory - name: Synchronize Output Directory
ansible.posix.synchronize: ansible.posix.synchronize:
mode: pull mode: pull
src: '{{ workspace }}/{{ sb_output_dir }}/' src: '{{ sb_output_dir }}/'
dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}' dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}'
rsync_opts:
- --exclude=nodes
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
import json import json
import random import random
from pathlib import Path from pathlib import Path
from pprint import pformat
from collections import defaultdict from collections import defaultdict
from natsort import natsorted from natsort import natsorted
...@@ -36,7 +37,7 @@ def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir): ...@@ -36,7 +37,7 @@ def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir):
self._ansible_client = AnsibleClient(ansible_config) self._ansible_client = AnsibleClient(ansible_config)
self.__set_logger('sb-run.log') self.__set_logger('sb-run.log')
logger.info('Runner uses config: %s.', self._sb_config) logger.info('Runner uses config: %s.', pformat(self._sb_config))
logger.info('Runner writes to: %s.', str(self._output_path)) logger.info('Runner writes to: %s.', str(self._output_path))
self._sb_benchmarks = self._sb_config.superbench.benchmarks self._sb_benchmarks = self._sb_config.superbench.benchmarks
...@@ -214,7 +215,7 @@ def __create_results_summary(self): # pragma: no cover ...@@ -214,7 +215,7 @@ def __create_results_summary(self): # pragma: no cover
json.dump(result, f) json.dump(result, f)
f.write('\n') f.write('\n')
def __create_single_node_summary(self, node_path): # pragma: no cover def __create_single_node_summary(self, node_path): # pragma: no cover # noqa: C901
"""Create the result summary file of single node. """Create the result summary file of single node.
Args: Args:
...@@ -235,7 +236,11 @@ def __create_single_node_summary(self, node_path): # pragma: no cover ...@@ -235,7 +236,11 @@ def __create_single_node_summary(self, node_path): # pragma: no cover
continue continue
for result in results: for result in results:
benchmark_name = result['name'] try:
benchmark_name = result['name']
except Exception:
logger.error('Invalid content in JSON file: {}'.format(results_file))
continue
if results_file.parts[-3].endswith('_models'): if results_file.parts[-3].endswith('_models'):
benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name']) benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name'])
if benchmark_name not in results_summary: if benchmark_name not in results_summary:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment